import argparse
import logging
import os

import torch
import torch.distributed as c10d


logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", level=logging.INFO
)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Simple script to simulate NCCL errors. The script is "
        "supposed to be run on multiple different nodes simultaneously with "
        "the appropriate rank and world_size. The script runs an allreduce() on "
        "the rank 0 node and aborts all the other nodes to simulate an error "
        "in NCCL."
    )
    parser.add_argument("addr", help="address of the master node to connect to.")
    parser.add_argument("port", help="port of the master node to connect to.")
    parser.add_argument("rank", help="rank of this node")
    parser.add_argument("world_size", help="number of nodes in the process group")
    args = parser.parse_args()
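
    # Example invocation (the script name, hostname, and port below are
    # illustrative placeholders; adjust them for your cluster):
    #   node 0:  python simulate_nccl_errors.py master-host 29500 0 2
    #   node 1:  python simulate_nccl_errors.py master-host 29500 1 2
    # Rank 0 hosts the TCPStore, so its address and port must be reachable
    # from every other node.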
    rank = int(args.rank)
    world_size = int(args.world_size)
    port = int(args.port)

    # Rank 0 acts as the TCPStore server; all other ranks connect to it as clients.
    store = c10d.TCPStore(args.addr, port, world_size, rank == 0)
    process_group = c10d.ProcessGroupNCCL(store, rank, world_size)
    logging.info("Running first allreduce")
    # Note: cuda(rank) assumes a GPU with index `rank` exists on this node.
    process_group.allreduce(torch.rand(10).cuda(rank)).wait()
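
    # After the non-zero ranks abort below, the second allreduce issued by
    # rank 0 has no peers left, so it is expected to fail (or hang, depending
    # on how NCCL error handling is configured); that is the simulated error.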
    if rank == 0:
        logging.info("Running second allreduce only on rank 0")
        work = process_group.allreduce(torch.rand(10).cuda(rank))
        logging.info("Waiting for allreduce to complete...")
        work.wait()
        logging.info("Second allreduce successful: %s", work.is_success())
    else:
        # Kill this process abruptly so rank 0 observes a failed peer.
        logging.info("Aborting rank %s to simulate a peer failure", rank)
        os.abort()