Python torch.distributed.new_group() Examples

The following are 15 code examples of torch.distributed.new_group(), drawn from open-source projects. The project and source file for each example are listed above its code. You may also want to check out the other available functions and classes of the torch.distributed module.
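As a quick orientation before the examples, here is a minimal sketch of the pattern most of them share (the gloo backend, loopback address, and port are assumptions, not taken from any project listed here): every process first joins the default group via init_process_group(), and then every process calls new_group() with the same ranks, including processes that will not be members of the new group.

import torch
import torch.distributed as dist

def demo(rank, world_size):
    # Join the default (world) group first.
    dist.init_process_group(
        "gloo",
        init_method="tcp://127.0.0.1:29500",
        rank=rank,
        world_size=world_size,
    )
    # Every rank must call new_group() with the same arguments,
    # even ranks that are not in [0, 1].
    subgroup = dist.new_group(ranks=[0, 1])
    tensor = torch.tensor([float(rank)])
    if rank in (0, 1):
        # Only members of the subgroup take part in this collective.
        dist.all_reduce(tensor, op=dist.ReduceOp.SUM, group=subgroup)
    dist.destroy_process_group()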
Example #1
Source File: main.py    From pytorch-distributed-example with MIT License
def run(world_size, rank, steps):
    for step in range(1, steps + 1):
        # get random int
        value = randint(0, 10)

        # group all ranks
        ranks = list(range(world_size))
        group = dist.new_group(ranks=ranks)

        # compute reduced sum
        tensor = torch.tensor(value, dtype=torch.int)
        dist.all_reduce(tensor, op=dist.ReduceOp.SUM, group=group)

        print('rank: {}, step: {}, value: {}, reduced sum: {}.'.format(rank, step, value, tensor.item()))

        sleep(1) 
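A hedged sketch of how a function like run() above might be driven locally (the gloo backend, loopback address, and port are assumptions; the original file also needs randint, sleep, torch, and torch.distributed imported, as used inside run()):

import torch.distributed as dist
import torch.multiprocessing as mp

def init_and_run(rank, world_size, steps):
    # Each spawned process joins the default group before entering run().
    dist.init_process_group(
        "gloo",
        init_method="tcp://127.0.0.1:23456",
        rank=rank,
        world_size=world_size,
    )
    # run() is the function defined in the example above.
    run(world_size, rank, steps)

if __name__ == "__main__":
    world_size = 2
    mp.spawn(init_and_run, args=(world_size, 5), nprocs=world_size)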
Example #2
Source File: ray_container.py    From adeptRL with GNU General Public License v3.0
def __init__(self, rank, learner_ranks, worker_ranks, ip, port):
        world_size = len(learner_ranks) + len(worker_ranks)
        dist.init_process_group(
            "nccl",
            init_method="tcp://{}:{}".format(ip, port),
            rank=rank,
            world_size=world_size,
        )
        groups = {}
        for learner_rank in learner_ranks:
            for worker_rank in worker_ranks:
                g = dist.new_group([learner_rank, worker_rank])
                if worker_rank == rank:
                    groups[learner_rank] = g
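        # new_group() must be entered by every process in the job, so workers
        # join this learner-only group call even though they discard the result.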
        dist.new_group(learner_ranks)

        self.groups = groups
        self.device = torch.device(f"cuda:{ray.get_gpu_ids()[0]}")
        self.rank = rank
        self.network = torch.zeros(3).to(self.device)
        self.exp = None
        self.network_handle = None 
Example #3
Source File: data_parallel_dist.py    From ps_pytorch with MIT License
def _start_reduction_threads(self):
        num_buckets = len(self.bucket_sizes)
        self._reduction_queues = [queue.Queue() for _ in range(num_buckets)]
        self._reduction_threads = []
        self._reduction_streams = [[] for _ in range(num_buckets)]
        self._nccl_streams = []
        self._default_streams = []
        for dev_id in self.device_ids:
            with torch.cuda.device(dev_id):
                # TODO: don't assume we're on a default stream
                self._default_streams.append(torch.cuda.current_stream())
                self._nccl_streams.append(torch.cuda.Stream())
        for reduction_queue, reduction_streams in zip(self._reduction_queues, self._reduction_streams):
            for dev_id in self.device_ids:
                with torch.cuda.device(dev_id):
                    reduction_streams.append(torch.cuda.Stream())
            # We only use the first device for distributed reductions
            dist._register_stream(reduction_streams[0])
            group_id = dist.new_group()

            self._reduction_threads.append(threading.Thread(
                target=self._reduction_thread_fn,
                args=(reduction_queue, group_id, self.device_ids, reduction_streams, self._nccl_streams)))
            self._reduction_threads[-1].daemon = True
            self._reduction_threads[-1].start() 
Example #4
Source File: distributed.py    From SlowFast with Apache License 2.0
def init_distributed_training(cfg):
    """
    Initialize variables needed for distributed training.
    """
    if cfg.NUM_GPUS == 1:
        return
    num_gpus_per_machine = cfg.NUM_GPUS
    num_machines = dist.get_world_size() // num_gpus_per_machine
    for i in range(num_machines):
        ranks_on_i = list(
            range(i * num_gpus_per_machine, (i + 1) * num_gpus_per_machine)
        )
        pg = dist.new_group(ranks_on_i)
        if i == cfg.SHARD_ID:
            global _LOCAL_PROCESS_GROUP
            _LOCAL_PROCESS_GROUP = pg 
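Once _LOCAL_PROCESS_GROUP is populated this way, the per-machine group can be queried like any other process group. A hedged sketch (the helper name is illustrative, not taken from the project):

import torch.distributed as dist

def get_local_rank():
    # Rank of the current process within its own machine's group.
    if _LOCAL_PROCESS_GROUP is None:
        return 0
    return dist.get_rank(group=_LOCAL_PROCESS_GROUP)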
Example #5
Source File: comm.py    From detectron2 with Apache License 2.0
def _get_global_gloo_group():
    """
    Return a process group based on gloo backend, containing all the ranks.
    The result is cached.
    """
    if dist.get_backend() == "nccl":
        return dist.new_group(backend="gloo")
    else:
        return dist.group.WORLD 
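The snippet above falls back to a gloo group when the default backend is nccl because gloo collectives run on CPU tensors, which makes them suitable for moving serialized Python objects between processes. A hedged sketch of that use (the helper name and padding scheme are illustrative, not this project's API):

import pickle

import torch
import torch.distributed as dist

def gather_objects(obj, group):
    # Serialize to a CPU uint8 tensor so the gloo backend can transport it.
    payload = torch.tensor(list(pickle.dumps(obj)), dtype=torch.uint8)
    local_size = torch.tensor([payload.numel()], dtype=torch.long)
    world_size = dist.get_world_size(group=group)
    # Exchange sizes first, because all_gather requires equal-sized tensors.
    sizes = [torch.zeros(1, dtype=torch.long) for _ in range(world_size)]
    dist.all_gather(sizes, local_size, group=group)
    max_size = int(max(s.item() for s in sizes))
    padded = torch.zeros(max_size, dtype=torch.uint8)
    padded[: payload.numel()] = payload
    buffers = [torch.zeros(max_size, dtype=torch.uint8) for _ in range(world_size)]
    dist.all_gather(buffers, padded, group=group)
    # Trim each buffer back to its true length and deserialize.
    return [
        pickle.loads(bytes(buf[: int(size.item())].tolist()))
        for buf, size in zip(buffers, sizes)
    ]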
Example #6
Source File: dist_utils.py    From video_analyst with MIT License
def _get_global_gloo_group():
    """
    Return a process group based on gloo backend, containing all the ranks.
    The result is cached.
    """
    if dist.get_backend() == "nccl":
        return dist.new_group(backend="gloo")
    else:
        return dist.group.WORLD 
Example #7
Source File: distributed.py    From SlowFast with Apache License 2.0
def _get_global_gloo_group():
    """
    Return a process group based on gloo backend, containing all the ranks.
    The result is cached.
    Returns:
        (group): pytorch dist group.
    """
    if dist.get_backend() == "nccl":
        return dist.new_group(backend="gloo")
    else:
        return dist.group.WORLD 
Example #8
Source File: launch.py    From detectron2 with Apache License 2.0
def _distributed_worker(
    local_rank, main_func, world_size, num_gpus_per_machine, machine_rank, dist_url, args
):
    assert torch.cuda.is_available(), "cuda is not available. Please check your installation."
    global_rank = machine_rank * num_gpus_per_machine + local_rank
    try:
        dist.init_process_group(
            backend="NCCL", init_method=dist_url, world_size=world_size, rank=global_rank
        )
    except Exception as e:
        logger = logging.getLogger(__name__)
        logger.error("Process group URL: {}".format(dist_url))
        raise e
    # synchronize is needed here to prevent a possible timeout after calling init_process_group
    # See: https://github.com/facebookresearch/maskrcnn-benchmark/issues/172
    comm.synchronize()

    assert num_gpus_per_machine <= torch.cuda.device_count()
    torch.cuda.set_device(local_rank)

    # Setup the local process group (which contains ranks within the same machine)
    assert comm._LOCAL_PROCESS_GROUP is None
    num_machines = world_size // num_gpus_per_machine
    for i in range(num_machines):
        ranks_on_i = list(range(i * num_gpus_per_machine, (i + 1) * num_gpus_per_machine))
        pg = dist.new_group(ranks_on_i)
        if i == machine_rank:
            comm._LOCAL_PROCESS_GROUP = pg

    main_func(*args) 
Example #9
Source File: main.py    From elastic with BSD 3-Clause "New" or "Revised" License
def tmp_process_group(backend):
    cpu_pg = dist.new_group(backend=backend)
    try:
        yield cpu_pg
    finally:
        dist.destroy_process_group(cpu_pg) 
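The yield/finally structure above suggests the original function is wrapped with contextlib.contextmanager (an assumption; the decorator is not visible in this snippet). A hedged usage sketch:

import contextlib

import torch
import torch.distributed as dist

@contextlib.contextmanager
def tmp_process_group(backend):
    pg = dist.new_group(backend=backend)
    try:
        yield pg
    finally:
        dist.destroy_process_group(pg)

# Assumes init_process_group() has already been called: run a CPU-side
# collective on a throwaway gloo group, then tear it down.
with tmp_process_group("gloo") as pg:
    t = torch.zeros(1)
    dist.all_reduce(t, group=pg)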
Example #10
Source File: distributed.py    From torchsupport with MIT License
def __init__(self, *args, **kwargs):
    super(SynchronousDistributedTraining, self).__init__(*args, **kwargs)
    self.world_size = distributed.get_world_size()
    self.rank = distributed.get_rank()
    self.group = distributed.new_group(ranks=list(range(self.world_size))) 
Example #11
Source File: comm.py    From fast-reid with Apache License 2.0
def _get_global_gloo_group():
    """
    Return a process group based on gloo backend, containing all the ranks.
    The result is cached.
    """
    if dist.get_backend() == "nccl":
        return dist.new_group(backend="gloo")
    else:
        return dist.group.WORLD 
Example #12
Source File: group.py    From inplace_abn with BSD 3-Clause "New" or "Revised" License
def active_group(active):
    """Initialize a distributed group where each process can independently decide whether to participate or not

    Parameters
    ----------
    active : bool
        Whether this process will be active in the group or not

    Returns
    -------
        A distributed group containing all processes that passed `active=True`, or `None` if all passed `False`
    """
    world_size = distributed.get_world_size()
    rank = distributed.get_rank()

    # Check if cache is initialized, add WORLD and None to it
    if not hasattr(active_group, "__cache__"):
        active_group.__cache__ = {
            frozenset(range(world_size)): distributed.group.WORLD,
            frozenset(): None
        }

    # Gather active status from all workers
    active = torch.tensor(rank if active else -1, dtype=torch.long, device=torch.cuda.current_device())
    active_workers = torch.empty(world_size, dtype=torch.long, device=torch.cuda.current_device())
    distributed.all_gather(list(active_workers.unbind(0)), active)

    # Create and cache group if it doesn't exist yet
    active_workers = frozenset(int(i) for i in active_workers.tolist() if i != -1)
    if active_workers not in active_group.__cache__:
        group = distributed.new_group(list(active_workers))
        active_group.__cache__[active_workers] = group

    return active_group.__cache__[active_workers] 
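A hedged sketch of how active_group() might be called (the even-rank participation rule is purely illustrative). Every process has to invoke it, because it performs an all_gather over the whole world, but only processes that passed active=True should issue collectives on the returned group:

# Assumes torch.distributed is already initialized with a CUDA-aware backend
# and that active_group() above is importable as shown.
import torch
import torch.distributed as distributed

rank = distributed.get_rank()
is_active = rank % 2 == 0            # arbitrary rule, for illustration only
group = active_group(is_active)      # every rank calls this, active or not
if is_active and group is not None:
    t = torch.ones(1, device=torch.cuda.current_device())
    distributed.all_reduce(t, group=group)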
Example #13
Source File: distributed_communicator.py    From CrypTen with MIT License
def __init__(self, init_ttp=False):
        # no need to do anything if we already initialized the communicator:
        if not dist.is_initialized():
            # get configuration variables from environment:
            for key in ["distributed_backend", "rendezvous", "world_size", "rank"]:
                if key.upper() not in os.environ:
                    raise ValueError("Environment variable %s must be set." % key)
                setattr(self, key.lower(), os.environ[key.upper()])

            # make sure world size and rank are integers; comms stats are reset:
            self.world_size = int(self.world_size)
            self.rank = int(self.rank)
            self.reset_communication_stats()
            self._name = f"rank{self.rank}"

            # logging:
            logging.info("==================")
            logging.info("DistributedCommunicator with rank %d" % self.rank)
            logging.info("==================")

            # initialize process group:
            total_ws = self.world_size + 1 if init_ttp else self.world_size
            dist.init_process_group(
                backend=self.distributed_backend,
                init_method=self.rendezvous,
                world_size=total_ws,
                rank=self.rank,
            )
            self.ttp_group = dist.new_group(list(range(total_ws)))
            self.main_group = dist.new_group(list(range(self.world_size)))
            self.ttp_initialized = init_ttp
            logging.info("World size = %d" % self.world_size) 
Example #14
Source File: ray_container.py    From adeptRL with GNU General Public License v3.0
def __init__(self, rank, learner_ranks, worker_ranks, ip, port):
        world_size = len(learner_ranks) + len(worker_ranks)

        dist.init_process_group(
            "nccl",
            init_method="tcp://{}:{}".format(ip, port),
            rank=rank,
            world_size=world_size,
        )
        groups = {}
        for learner_rank in learner_ranks:
            for worker_rank in worker_ranks:
                g = dist.new_group([learner_rank, worker_rank])
                if learner_rank == rank:
                    groups[worker_rank] = g
        learner_group = dist.new_group(learner_ranks)

        self.groups = groups
        self.learner_group = learner_group
        self.device = torch.device(f"cuda:{ray.get_gpu_ids()[0]}")
        self.rank = rank

        self.exps = {
            w_rank: torch.zeros(2).to(self.device) for w_rank in worker_ranks
        }
        self.network = torch.ones(3).to(self.device)
        self.network_grads = [torch.ones(3).to(self.device)]
        self.exp_handles = None 
Example #15
Source File: distributed.py    From torchsupport with MIT License
def __init__(self, *args, **kwargs):
    super(AsynchronousDistributedTraining, self).__init__(*args, **kwargs)
    self.gossip_step = 0
    self.world_size = distributed.get_world_size()
    self.rank = distributed.get_rank()
    self.groups = []
    for idx in range(self.world_size - 1):
      partner = (self.rank + idx + 1) % self.world_size
      group = distributed.new_group(ranks=[self.rank, partner])
      self.groups.append(group)