Python torch.distributed.init_process_group() Examples

The following are 30 code examples of torch.distributed.init_process_group(), drawn from open-source projects; the originating project and source file are noted above each example. You may also want to check out the other available functions and classes of the torch.distributed module.
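Before the individual examples, here is a minimal, self-contained sketch of the pattern most of them share: set MASTER_ADDR/MASTER_PORT, then call init_process_group() with an explicit rank and world_size in every process. The backend, address, and port below are illustrative assumptions, not values taken from any of the projects listed here.

import os
import torch.distributed as dist

def init_minimal(rank, world_size, backend="gloo"):
    # Rendezvous address and port for a single-machine run (assumed values).
    os.environ["MASTER_ADDR"] = "127.0.0.1"
    os.environ["MASTER_PORT"] = "29500"
    # Every participating process must call this with the same world_size
    # and a unique rank in [0, world_size).
    dist.init_process_group(backend=backend, rank=rank, world_size=world_size)

def shutdown_minimal():
    # Release the default process group once the distributed work is done.
    dist.destroy_process_group()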
Example #1
Source File: predict_gpu.py    From helen with MIT License
def setup(rank, total_callers, args, all_input_files, all_devices):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'

    # initialize the process group
    dist.init_process_group("gloo", rank=rank, world_size=total_callers)

    # expand the arguments
    output_filepath, model_path, batch_size, num_workers = args

    # call prediction function
    predict(all_input_files[rank],
            output_filepath,
            model_path,
            batch_size,
            num_workers,
            rank,
            all_devices[rank])
    cleanup() 
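The cleanup() call at the end of this example (and of Examples #2 and #6) refers to a helper that is not reproduced on this page. A minimal sketch of such a helper, assuming it only tears down the default process group:

import torch.distributed as dist

def cleanup():
    # Destroy the process group created by dist.init_process_group().
    dist.destroy_process_group()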
Example #2
Source File: predict_cpu.py    From helen with MIT License
def setup(rank, total_callers, args, all_input_files):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'

    # initialize the process group
    dist.init_process_group("gloo", rank=rank, world_size=total_callers)

    # expand the arguments
    output_filepath, model_path, batch_size, num_workers, threads = args

    # call prediction function
    predict(all_input_files[rank],
            output_filepath,
            model_path,
            batch_size,
            num_workers,
            rank,
            threads)
    cleanup() 
Example #3
Source File: ray_container.py    From adeptRL with GNU General Public License v3.0
def __init__(self, rank, learner_ranks, worker_ranks, ip, port):
        world_size = len(learner_ranks) + len(worker_ranks)
        dist.init_process_group(
            "nccl",
            init_method="tcp://{}:{}".format(ip, port),
            rank=rank,
            world_size=world_size,
        )
        groups = {}
        for learner_rank in learner_ranks:
            for worker_rank in worker_ranks:
                g = dist.new_group([learner_rank, worker_rank])
                if worker_rank == rank:
                    groups[learner_rank] = g
        dist.new_group(learner_ranks)

        self.groups = groups
        self.device = torch.device(f"cuda:{ray.get_gpu_ids()[0]}")
        self.rank = rank
        self.network = torch.zeros(3).to(self.device)
        self.exp = None
        self.network_handle = None 
Example #4
Source File: train.py    From gpt-2-output-dataset with MIT License
def setup_distributed(port=29500):
    if not dist.is_available() or not torch.cuda.is_available() or torch.cuda.device_count() <= 1:
        return 0, 1

    if 'MPIR_CVAR_CH3_INTERFACE_HOSTNAME' in os.environ:
        from mpi4py import MPI
        mpi_rank = MPI.COMM_WORLD.Get_rank()
        mpi_size = MPI.COMM_WORLD.Get_size()

        os.environ["MASTER_ADDR"] = '127.0.0.1'
        os.environ["MASTER_PORT"] = str(port)

        dist.init_process_group(backend="nccl", world_size=mpi_size, rank=mpi_rank)
        return mpi_rank, mpi_size

    dist.init_process_group(backend="nccl", init_method="env://")
    return dist.get_rank(), dist.get_world_size() 
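A hedged sketch of how the (rank, world_size) pair returned by setup_distributed() is typically consumed; the placeholder model and the device handling below are assumptions, not code from gpt-2-output-dataset.

import torch
from torch.nn.parallel import DistributedDataParallel as DDP

rank, world_size = setup_distributed()
if torch.cuda.is_available():
    torch.cuda.set_device(rank % torch.cuda.device_count())
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = torch.nn.Linear(10, 10).to(device)  # placeholder model (assumption)
if world_size > 1:
    # One process per GPU with the NCCL backend; wrap the local replica in DDP.
    model = DDP(model, device_ids=[torch.cuda.current_device()])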
Example #5
Source File: distributed.py    From Single-Path-NAS-PyTorch with Apache License 2.0
def init_process_group(backend):

    comm = MPI.COMM_WORLD
    world_size = comm.Get_size()
    rank = comm.Get_rank()

    info = dict()
    if rank == 0:
        host = socket.gethostname()
        address = socket.gethostbyname(host)
        info.update(dict(MASTER_ADDR=address, MASTER_PORT='1234'))

    info = comm.bcast(info, root=0)
    info.update(dict(WORLD_SIZE=str(world_size), RANK=str(rank)))
    os.environ.update(info)

    distributed.init_process_group(backend=backend) 
Example #6
Source File: train_distributed.py    From helen with MIT License
def setup(rank, device_ids, args):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'

    # initialize the process group
    dist.init_process_group("gloo", rank=rank, world_size=len(device_ids))

    train_file, test_file, batch_size, epochs, gpu_mode, num_workers, retrain_model, \
    retrain_model_path, gru_layers, hidden_size, learning_rate, weight_decay, model_dir, stats_dir, total_callers, \
    train_mode = args

    # issue with semaphore lock: https://github.com/pytorch/pytorch/issues/2517
    # mp.set_start_method('spawn')

    # Explicitly setting seed to make sure that models created in two processes
    # start from same random weights and biases. https://github.com/pytorch/pytorch/issues/2517
    torch.manual_seed(42)
    train(train_file, test_file, batch_size, epochs, gpu_mode, num_workers, retrain_model, retrain_model_path,
          gru_layers, hidden_size, learning_rate, weight_decay, model_dir, stats_dir, train_mode,
          total_callers, rank, device_ids[rank])
    cleanup() 
Example #7
Source File: dist_a2c_atari.py    From cherry with Apache License 2.0
def main(env='PongNoFrameskip-v4'):
    num_steps = 5000000
    seed = 42

    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--local_rank", type=int)
    args = parser.parse_args()
    dist.init_process_group('gloo',
                            init_method='file:///home/seba-1511/.dist_init_' + env,
                            rank=args.local_rank,
                            world_size=16)

    rank = dist.get_rank()
    th.set_num_threads(1)
    random.seed(seed + rank)
    th.manual_seed(seed + rank)
    np.random.seed(seed + rank)

    env = gym.make(env)
    if rank == 0:
        env = envs.Logger(env, interval=1000)
    env = envs.OpenAIAtari(env)
    env = envs.Torch(env)
    env = envs.Runner(env)
    env.seed(seed + rank)

    policy = NatureCNN(env)
    optimizer = optim.RMSprop(policy.parameters(), lr=LR, alpha=0.99, eps=1e-5)
    optimizer = Distributed(policy.parameters(), optimizer)
    get_action = lambda state: get_action_value(state, policy)

    for step in range(num_steps // A2C_STEPS + 1):
        # Sample some transitions
        replay = env.run(get_action, steps=A2C_STEPS)

        # Update policy
        update(replay, optimizer, policy, env=env) 
Example #8
Source File: train_IKC.py    From IKC with Apache License 2.0
def init_dist(backend='nccl', **kwargs):
    '''Initialization for distributed training.'''
    # if mp.get_start_method(allow_none=True) is None:
    if mp.get_start_method(allow_none=True) != 'spawn':  # name of the start method used for new processes
        mp.set_start_method('spawn', force=True)  # 'spawn' is the default on Windows
    rank = int(os.environ['RANK'])  # process rank taken from the system environment
    num_gpus = torch.cuda.device_count()  # number of GPUs available on this node
    torch.cuda.set_device(rank % num_gpus)
    dist.init_process_group(backend=backend, **kwargs)  # initialize the default distributed process group
Example #9
Source File: env.py    From RDSNet with Apache License 2.0
def _init_dist_pytorch(backend, **kwargs):
    # TODO: use local_rank instead of rank % num_gpus
    rank = int(os.environ['RANK'])
    num_gpus = torch.cuda.device_count()
    torch.cuda.set_device(rank % num_gpus)
    dist.init_process_group(backend=backend, **kwargs) 
Example #10
Source File: main.py    From examples with BSD 3-Clause "New" or "Revised" License
def setup(rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'

    # initialize the process group
    dist.init_process_group("gloo", rank=rank, world_size=world_size) 
Example #11
Source File: pytorch_two_machines.py    From ncluster with MIT License
def worker():
  """ Initialize the distributed environment. """

  import torch
  import torch.distributed as dist
  from torch.multiprocessing import Process
  import numpy as np

  print("Initializing distributed pytorch")
  os.environ['MASTER_ADDR'] = str(args.master_addr)
  os.environ['MASTER_PORT'] = str(args.master_port)
  # Use TCP backend. Gloo needs nightly, where it currently fails with
  #     dist.init_process_group('gloo', rank=args.rank,
  #   AttributeError: module 'torch.distributed' has no attribute 'init_process_group'
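  # NOTE: the legacy 'tcp' backend has since been removed from torch.distributed;
  # on current PyTorch builds 'gloo' (or 'nccl' for GPUs) is used instead.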
  dist.init_process_group('tcp', rank=args.rank,
                          world_size=args.size)

  tensor = torch.ones(args.size_mb*250*1000)*(args.rank+1)
  time_list = []
  outfile = 'out' if args.rank == 0 else '/dev/null'
  log = util.FileLogger(outfile)
  for i in range(args.iters):
    # print('before: rank ', args.rank, ' has data ', tensor[0])

    start_time = time.perf_counter()
    if args.rank == 0:
      dist.send(tensor=tensor, dst=1)
    else:
      dist.recv(tensor=tensor, src=0)
      
    elapsed_time_ms = (time.perf_counter() - start_time)*1000
    time_list.append(elapsed_time_ms)
    # print('after: rank ', args.rank, ' has data ', tensor[0])
    rate = args.size_mb/(elapsed_time_ms/1000)

    log('%03d/%d added %d MBs in %.1f ms: %.2f MB/second' % (i, args.iters, args.size_mb, elapsed_time_ms, rate))

  min = np.min(time_list)
  median = np.median(time_list)
  log(f"min: {min:8.2f}, median: {median:8.2f}, mean: {np.mean(time_list):8.2f}") 
Example #12
Source File: env.py    From RDSNet with Apache License 2.0
def _init_dist_slurm(backend, port=29500, **kwargs):
    proc_id = int(os.environ['SLURM_PROCID'])
    ntasks = int(os.environ['SLURM_NTASKS'])
    node_list = os.environ['SLURM_NODELIST']
    num_gpus = torch.cuda.device_count()
    torch.cuda.set_device(proc_id % num_gpus)
    addr = subprocess.getoutput(
        'scontrol show hostname {} | head -n1'.format(node_list))
    os.environ['MASTER_PORT'] = str(port)
    os.environ['MASTER_ADDR'] = addr
    os.environ['WORLD_SIZE'] = str(ntasks)
    os.environ['RANK'] = str(proc_id)
    dist.init_process_group(backend=backend) 
Example #13
Source File: trainer.py    From pytorch-asr with GNU General Public License v3.0
def init_distributed(use_cuda, backend="nccl", init="slurm", local_rank=-1):
    #try:
    #    mp.set_start_method('spawn')  # spawn, forkserver, and fork
    #except RuntimeError:
    #    pass

    try:
        if local_rank == -1:
            if init == "slurm":
                rank = int(os.environ['SLURM_PROCID'])
                world_size = int(os.environ['SLURM_NTASKS'])
                local_rank = int(os.environ['SLURM_LOCALID'])
                #master_node = os.environ['SLURM_TOPOLOGY_ADDR']
                #master_port = '23456'
            elif init == "ompi":
                rank = int(os.environ['OMPI_COMM_WORLD_RANK'])
                world_size = int(os.environ['OMPI_COMM_WORLD_SIZE'])
                local_rank = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK'])

            if use_cuda:
                device = local_rank % torch.cuda.device_count()
                torch.cuda.set_device(device)
                print(f"set cuda device to cuda:{device}")

            master_node = os.environ["MASTER_ADDR"]
            master_port = os.environ["MASTER_PORT"]
            init_method = f"tcp://{master_node}:{master_port}"
            #init_method = "env://"
            dist.init_process_group(backend=backend, init_method=init_method, world_size=world_size, rank=rank)
            print(f"initialized as {rank}/{world_size} via {init_method}")
        else:
            if use_cuda:
                torch.cuda.set_device(local_rank)
                print(f"set cuda device to cuda:{local_rank}")
            dist.init_process_group(backend=backend, init_method="env://")
            print(f"initialized as {dist.get_rank()}/{dist.get_world_size()} via env://")
    except Exception as e:
        print(f"initialized as single process ({e})")
Example #14
Source File: env.py    From IoU-Uniform-R-CNN with Apache License 2.0
def _init_dist_pytorch(backend, **kwargs):
    # TODO: use local_rank instead of rank % num_gpus
    rank = int(os.environ['RANK'])
    num_gpus = torch.cuda.device_count()
    torch.cuda.set_device(rank % num_gpus)
    dist.init_process_group(backend=backend, **kwargs) 
Example #15
Source File: train.py    From nonparaSeq2seqVC_code with MIT License
def init_distributed(hparams, n_gpus, rank, group_name):
    assert torch.cuda.is_available(), "Distributed mode requires CUDA."
    print("Initializing Distributed")

    # Set cuda device so everything is done on the right GPU.
    torch.cuda.set_device(rank % torch.cuda.device_count())

    # Initialize distributed communication
    dist.init_process_group(
        backend=hparams.dist_backend, init_method=hparams.dist_url,
        world_size=n_gpus, rank=rank, group_name=group_name)

    print("Done initializing distributed") 
Example #16
Source File: env.py    From IoU-Uniform-R-CNN with Apache License 2.0 5 votes vote down vote up
def _init_dist_slurm(backend, port=29500, **kwargs):
    proc_id = int(os.environ['SLURM_PROCID'])
    ntasks = int(os.environ['SLURM_NTASKS'])
    node_list = os.environ['SLURM_NODELIST']
    num_gpus = torch.cuda.device_count()
    torch.cuda.set_device(proc_id % num_gpus)
    addr = subprocess.getoutput(
        'scontrol show hostname {} | head -n1'.format(node_list))
    os.environ['MASTER_PORT'] = str(port)
    os.environ['MASTER_ADDR'] = addr
    os.environ['WORLD_SIZE'] = str(ntasks)
    os.environ['RANK'] = str(proc_id)
    dist.init_process_group(backend=backend) 
Example #17
Source File: example.py    From examples with BSD 3-Clause "New" or "Revised" License
def spmd_main(local_world_size, local_rank):
    # These are the parameters used to initialize the process group
    env_dict = {
        key: os.environ[key]
        for key in ("MASTER_ADDR", "MASTER_PORT", "RANK", "WORLD_SIZE")
    }
    print(f"[{os.getpid()}] Initializing process group with: {env_dict}")
    dist.init_process_group(backend="nccl")
    print(
        f"[{os.getpid()}]: world_size = {dist.get_world_size()}, "
        + f"rank = {dist.get_rank()}, backend={dist.get_backend()}"
    )

    demo_basic(local_world_size, local_rank)

    # Tear down the process group
    dist.destroy_process_group() 
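The demo_basic() called above is defined elsewhere in the same example file. A hedged sketch of what such a function typically does, namely wrapping a toy model in DistributedDataParallel on the GPUs assigned to this process (the model, loss, and optimizer details are assumptions):

import torch
import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel as DDP

def demo_basic(local_world_size, local_rank):
    # Split the visible GPUs evenly across the local processes; with one GPU
    # per process this reduces to device_ids == [local_rank].
    n = torch.cuda.device_count() // local_world_size
    device_ids = list(range(local_rank * n, (local_rank + 1) * n))

    model = nn.Linear(10, 10).to(device_ids[0])  # toy model (assumption)
    ddp_model = DDP(model, device_ids=device_ids)

    loss_fn = nn.MSELoss()
    optimizer = torch.optim.SGD(ddp_model.parameters(), lr=0.001)

    outputs = ddp_model(torch.randn(20, 10).to(device_ids[0]))
    labels = torch.randn(20, 10).to(device_ids[0])
    loss_fn(outputs, labels).backward()
    optimizer.step()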
Example #18
Source File: env.py    From kaggle-kuzushiji-recognition with MIT License
def _init_dist_pytorch(backend, **kwargs):
    # TODO: use local_rank instead of rank % num_gpus
    rank = int(os.environ['RANK'])
    num_gpus = torch.cuda.device_count()
    torch.cuda.set_device(rank % num_gpus)
    dist.init_process_group(backend=backend, **kwargs) 
Example #19
Source File: train_SFTMD.py    From IKC with Apache License 2.0
def init_dist(backend='nccl', **kwargs):
    '''Initialization for distributed training.'''
    # if mp.get_start_method(allow_none=True) is None:
    if mp.get_start_method(allow_none=True) != 'spawn':  # name of the start method used for new processes
        mp.set_start_method('spawn', force=True)  # 'spawn' is the default on Windows
    rank = int(os.environ['RANK'])  # process rank taken from the system environment
    num_gpus = torch.cuda.device_count()  # number of GPUs available on this node
    torch.cuda.set_device(rank % num_gpus)
    dist.init_process_group(backend=backend, **kwargs)  # initialize the default distributed process group
Example #20
Source File: distributed_communicator.py    From CrypTen with MIT License
def __init__(self, init_ttp=False):
        # no need to do anything if we already initialized the communicator:
        if not dist.is_initialized():
            # get configuration variables from environmens:
            for key in ["distributed_backend", "rendezvous", "world_size", "rank"]:
                if key.upper() not in os.environ:
                    raise ValueError("Environment variable %s must be set." % key)
                setattr(self, key.lower(), os.environ[key.upper()])

            # make sure world size and rank are integers; comms stats are reset:
            self.world_size = int(self.world_size)
            self.rank = int(self.rank)
            self.reset_communication_stats()
            self._name = f"rank{self.rank}"

            # logging:
            logging.info("==================")
            logging.info("DistributedCommunicator with rank %d" % self.rank)
            logging.info("==================")

            # initialize process group:
            total_ws = self.world_size + 1 if init_ttp else self.world_size
            dist.init_process_group(
                backend=self.distributed_backend,
                init_method=self.rendezvous,
                world_size=total_ws,
                rank=self.rank,
            )
            self.ttp_group = dist.new_group(list(range(total_ws)))
            self.main_group = dist.new_group(list(range(self.world_size)))
            self.ttp_initialized = init_ttp
            logging.info("World size = %d" % self.world_size) 
Example #21
Source File: distributed_utils.py    From Switchable-Whitening with MIT License
def dist_init(port):
    if mp.get_start_method(allow_none=True) != 'spawn':
        mp.set_start_method('spawn')
    proc_id = int(os.environ['SLURM_PROCID'])
    ntasks = int(os.environ['SLURM_NTASKS'])
    node_list = os.environ['SLURM_NODELIST']
    num_gpus = torch.cuda.device_count()
    torch.cuda.set_device(proc_id % num_gpus)

    if '[' in node_list:
        beg = node_list.find('[')
        pos1 = node_list.find('-', beg)
        if pos1 < 0:
            pos1 = 1000
        pos2 = node_list.find(',', beg)
        if pos2 < 0:
            pos2 = 1000
        node_list = node_list[:min(pos1, pos2)].replace('[', '')
    addr = node_list[8:].replace('-', '.')
    print(addr)

    os.environ['MASTER_PORT'] = port
    os.environ['MASTER_ADDR'] = addr
    os.environ['WORLD_SIZE'] = str(ntasks)
    os.environ['RANK'] = str(proc_id)
    dist.init_process_group(backend='nccl')

    rank = dist.get_rank()
    world_size = dist.get_world_size()
    return rank, world_size 
Example #22
Source File: env.py    From PolarMask with Apache License 2.0
def _init_dist_slurm(backend, port=29500, **kwargs):
    proc_id = int(os.environ['SLURM_PROCID'])
    ntasks = int(os.environ['SLURM_NTASKS'])
    node_list = os.environ['SLURM_NODELIST']
    num_gpus = torch.cuda.device_count()
    torch.cuda.set_device(proc_id % num_gpus)
    addr = subprocess.getoutput(
        'scontrol show hostname {} | head -n1'.format(node_list))
    os.environ['MASTER_PORT'] = str(port)
    os.environ['MASTER_ADDR'] = addr
    os.environ['WORLD_SIZE'] = str(ntasks)
    os.environ['RANK'] = str(proc_id)
    dist.init_process_group(backend=backend) 
Example #23
Source File: env.py    From PolarMask with Apache License 2.0
def _init_dist_pytorch(backend, **kwargs):
    # TODO: use local_rank instead of rank % num_gpus
    rank = int(os.environ['RANK'])
    num_gpus = torch.cuda.device_count()
    torch.cuda.set_device(rank % num_gpus)
    dist.init_process_group(backend=backend, **kwargs) 
Example #24
Source File: engine.py    From Centripetal-SGD with Apache License 2.0
def __init__(self):
        self.version = 0.01
        self.state = State()
        self.devices = None
        self.distributed = False
        self.logger = None

        if 'WORLD_SIZE' in os.environ:
            self.distributed = int(os.environ['WORLD_SIZE']) >= 1

        if self.distributed:
            print('Initialize Engine for distributed training.')
            self.local_rank = 0         # TODO we only use single-machine-multi-gpus
            self.world_size = int(os.environ['WORLD_SIZE'])
            self.world_rank = int(os.environ['RANK'])
            torch.cuda.set_device(self.local_rank)
            dist.init_process_group(backend="nccl", init_method='env://')
            dist.barrier()
            self.devices = [i for i in range(self.world_size)]
        else:
            # todo check non-distributed training
            print('Initialize Engine for non-distributed training.')
            self.world_size = 1
            self.world_rank = 1
            self.devices = parse_torch_devices('0')   # TODO correct?
        torch.backends.cudnn.benchmark = True 
Example #25
Source File: env.py    From mmdetection_with_SENet154 with Apache License 2.0
def _init_dist_slurm(backend, port=29500, **kwargs):
    proc_id = int(os.environ['SLURM_PROCID'])
    ntasks = int(os.environ['SLURM_NTASKS'])
    node_list = os.environ['SLURM_NODELIST']
    num_gpus = torch.cuda.device_count()
    torch.cuda.set_device(proc_id % num_gpus)
    addr = subprocess.getoutput(
        'scontrol show hostname {} | head -n1'.format(node_list))
    os.environ['MASTER_PORT'] = str(port)
    os.environ['MASTER_ADDR'] = addr
    os.environ['WORLD_SIZE'] = str(ntasks)
    os.environ['RANK'] = str(proc_id)
    dist.init_process_group(backend=backend) 
Example #26
Source File: env.py    From mmdetection_with_SENet154 with Apache License 2.0
def _init_dist_pytorch(backend, **kwargs):
    # TODO: use local_rank instead of rank % num_gpus
    rank = int(os.environ['RANK'])
    num_gpus = torch.cuda.device_count()
    torch.cuda.set_device(rank % num_gpus)
    dist.init_process_group(backend=backend, **kwargs) 
Example #27
Source File: engine.py    From ACNet with MIT License
def __init__(self):
        self.version = 0.01
        self.state = State()
        self.devices = None
        self.distributed = False
        self.logger = None


        if 'WORLD_SIZE' in os.environ:
            self.distributed = int(os.environ['WORLD_SIZE']) >= 1

        if self.distributed:
            print('Initialize Engine for distributed training.')
            self.local_rank = 0         # TODO we only use single-machine-multi-gpus
            self.world_size = int(os.environ['WORLD_SIZE'])
            self.world_rank = int(os.environ['RANK'])
            torch.cuda.set_device(self.local_rank)
            dist.init_process_group(backend="nccl", init_method='env://')
            dist.barrier()
            self.devices = [i for i in range(self.world_size)]
        else:
            # todo check non-distributed training
            print('Initialize Engine for non-distributed training.')
            self.world_size = 1
            self.world_rank = 1
            self.devices = parse_torch_devices('0')   # TODO correct?
        torch.backends.cudnn.benchmark = True 
Example #28
Source File: test_syncbn.py    From mmcv with Apache License 2.0
def dist_init(self):
        rank = int(os.environ['SLURM_PROCID'])
        world_size = int(os.environ['SLURM_NTASKS'])
        local_rank = int(os.environ['SLURM_LOCALID'])
        node_list = str(os.environ['SLURM_NODELIST'])

        node_parts = re.findall('[0-9]+', node_list)
        host_ip = '{}.{}.{}.{}'.format(node_parts[1], node_parts[2],
                                       node_parts[3], node_parts[4])
        port = '12341'
        init_method = 'tcp://{}:{}'.format(host_ip, port)

        dist.init_process_group(
            'nccl', init_method=init_method, world_size=world_size, rank=rank)
        torch.cuda.set_device(local_rank) 
Example #29
Source File: dist_utils.py    From mmcv with Apache License 2.0
def _init_dist_slurm(backend, port=None):
    """Initialize slurm distributed training environment.

    If argument ``port`` is not specified, then the master port will be system
    environment variable ``MASTER_PORT``. If ``MASTER_PORT`` is not in system
    environment variable, then a default port ``29500`` will be used.

    Args:
        backend (str): Backend of torch.distributed.
        port (int, optional): Master port. Defaults to None.
    """
    proc_id = int(os.environ['SLURM_PROCID'])
    ntasks = int(os.environ['SLURM_NTASKS'])
    node_list = os.environ['SLURM_NODELIST']
    num_gpus = torch.cuda.device_count()
    torch.cuda.set_device(proc_id % num_gpus)
    addr = subprocess.getoutput(
        f'scontrol show hostname {node_list} | head -n1')
    # specify master port
    if port is not None:
        os.environ['MASTER_PORT'] = str(port)
    elif 'MASTER_PORT' in os.environ:
        pass  # use MASTER_PORT in the environment variable
    else:
        # 29500 is torch.distributed default port
        os.environ['MASTER_PORT'] = '29500'
    os.environ['MASTER_ADDR'] = addr
    os.environ['WORLD_SIZE'] = str(ntasks)
    os.environ['RANK'] = str(proc_id)
    dist.init_process_group(backend=backend) 
Example #30
Source File: train_cifar10.py    From mmcv with Apache License 2.0
def init_dist(backend='nccl', **kwargs):
    if mp.get_start_method(allow_none=True) is None:
        mp.set_start_method('spawn')
    rank = int(os.environ['RANK'])
    num_gpus = torch.cuda.device_count()
    torch.cuda.set_device(rank % num_gpus)
    dist.init_process_group(backend=backend, **kwargs)