| # Owner(s): ["oncall: distributed"] |
| |
| import contextlib |
| import os |
| import sys |
| from typing import Any, Optional |
| |
| import torch |
| import torch.distributed as dist |
| |
| if not dist.is_available(): |
| print("Distributed not available, skipping tests", file=sys.stderr) |
| sys.exit(0) |
| |
| from torch.distributed.algorithms.join import Join, Joinable, JoinHook |
| from torch.testing._internal.common_distributed import ( |
| MultiProcessTestCase, |
| require_n_gpus_for_nccl_backend, |
| ) |
| from torch.testing._internal.common_utils import run_tests, TEST_WITH_DEV_DBG_ASAN |
| |
| if TEST_WITH_DEV_DBG_ASAN: |
| print("Skip dev-asan as torch + multiprocessing spawn have known issues", file=sys.stderr) |
| sys.exit(0) |
| |
| BACKEND = dist.Backend.NCCL if torch.cuda.is_available() else dist.Backend.GLOO |
| WORLD_SIZE = min(4, max(2, torch.cuda.device_count())) |
| |
| # Constants used for testing post-hooks |
| BEFORE_CONSTANT = 41 |
| AFTER_CONSTANT = 42 |
| |
| |
| class AllReducerJoinHook(JoinHook): |
| r""" |
| Join hook for :class:`AllReducer`. |
| |
| Arguments: |
| allreducer (AllReducer): the :class:`AllReducer` object using this |
| hook. |
| num_allreduces (int): the number of all-reduces to shadow per |
| iteration. |
| run_post_hook (bool): a flag enabling the post-hook logic. |
| """ |
| def __init__( |
| self, |
| allreducer, |
| num_allreduces, |
| run_post_hook |
| ): |
| self.allreducer = allreducer |
| self.num_allreduces = num_allreduces |
| self.run_post_hook = run_post_hook |
| |
| def main_hook(self): |
| r""" |
| Shadows each all-reduce; the number of all-reduces is passed into the |
| constructor as ``num_allreduces``. |
| """ |
| device = self.allreducer.device |
| for _ in range(self.num_allreduces): |
| t = torch.zeros(1, device=device) |
| dist.all_reduce(t) |
| |
| def post_hook(self, is_last_joiner: bool): |
| r""" |
| Broadcasts a tensor containing a magic constant ``AFTER_CONSTANT`` from |
| the last joiner to all other processes. |
| """ |
| if not self.run_post_hook: |
| return |
| rank = dist.get_rank(self.allreducer.process_group) |
| common_rank = self.allreducer.find_common_rank(rank, is_last_joiner) |
| device = self.allreducer.device |
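        # The agreed-upon rank (the highest rank among the last joiners)
        # rewrites its tensor to ``AFTER_CONSTANT`` and broadcasts it so that
        # every process observes the update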
| if rank == common_rank: |
| self.allreducer.post_hook_tensor = torch.tensor([AFTER_CONSTANT], device=device) |
| dist.broadcast(self.allreducer.post_hook_tensor, src=common_rank) |
| |
| |
| class AllReducer(Joinable): |
| r""" |
| Example :class:`Joinable` that performs some number of all-reduces as its |
| per-iteration collective communication. |
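
    A minimal usage sketch (assumes the default process group is already
    initialized; ``device`` and ``num_local_inputs`` are placeholders)::

        allreducer = AllReducer(device, dist.group.WORLD)
        with Join([allreducer]):
            for _ in range(num_local_inputs):
                total = allreducer()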
| """ |
| def __init__(self, device, process_group): |
| super(AllReducer, self).__init__() |
| self.device = device |
| self.process_group = process_group |
| self.post_hook_tensor = torch.tensor([BEFORE_CONSTANT], device=self.device) |
| |
| def __call__(self, num_allreduces=1): |
| r""" |
        All-reduces a dim-1 tensor of ones ``num_allreduces``-many times and
        returns the accumulated sum.
| """ |
| Join.notify_join_context(self) |
| device = self.device |
| total = 0 |
| for _ in range(num_allreduces): |
| t = torch.ones(1, device=device) |
| dist.all_reduce(t) |
| total += t.item() |
| return total |
| |
| def join_hook(self, **kwargs) -> JoinHook: |
| r""" |
| Returns a join hook that shadows some number of all-reduces; by default, |
| this number is 1. |
| """ |
| num_allreduces = kwargs.get("num_allreduces", 1) |
| run_post_hook = kwargs.get("run_post_hooks", False) |
| return AllReducerJoinHook( |
| self, |
| num_allreduces, |
| run_post_hook |
| ) |
| |
| @property |
| def join_device(self) -> torch.device: |
| return self.device |
| |
| @property |
| def join_process_group(self) -> Any: |
| return self.process_group |
| |
| def find_common_rank(self, rank, to_consider): |
| r""" |
        Returns the maximum rank, across the process group, among the ranks
        whose ``to_consider`` flag is set.
| """ |
| common_rank = torch.tensor( |
| [rank if to_consider else -1], |
| device=self.device |
| ) |
| dist.all_reduce(common_rank, op=dist.ReduceOp.MAX, group=self.process_group) |
| common_rank = common_rank.item() |
| assert common_rank >= 0 |
| return common_rank |
| |

class TestJoin(MultiProcessTestCase):
| r"""Test cases for the generic join context.""" |
| def setUp(self): |
| super(TestJoin, self).setUp() |
| os.environ["WORLD_SIZE"] = str(self.world_size) |
| os.environ["BACKEND"] = BACKEND |
| self._spawn_processes() |
| |
| @property |
| def device(self): |
| return torch.device(self.rank) if BACKEND == dist.Backend.NCCL \ |
| else torch.device("cpu") |
| |
| @property |
| def world_size(self): |
| return WORLD_SIZE |
| |
| @property |
| def process_group(self): |
| return dist.group.WORLD |
| |
| def tearDown(self): |
| try: |
| dist.destroy_process_group() |
| except AssertionError: |
| pass |
| try: |
| os.remove(self.file_name) |
| except OSError: |
| pass |
| |
| def dist_init(self, rank, world_size, backend=BACKEND): |
| store = dist.FileStore(self.file_name, world_size) |
| return dist.init_process_group( |
| backend=backend, |
| store=store, |
| rank=rank, |
| world_size=world_size |
| ) |
| |
| def construct_uneven_inputs(self, base, offset, device=None): |
| r""" |
| Returns uneven inputs: rank i gets ``base`` + i * ``offset`` inputs. |
| """ |
| if device is None: |
| device = self.device |
| return [torch.zeros(1, device=device) for _ in range(base + self.rank * offset)] |
| |
| def construct_even_inputs(self, base, device=None): |
| r"""Returns even inputs: each rank gets ``base`` inputs.""" |
| if device is None: |
| device = self.device |
| return [torch.zeros(1, device=device) for _ in range(base)] |
| |
| @property |
| def base_num_inputs(self): |
| r"""Base number of inputs to be used by all ranks.""" |
| return 3 |
| |
| @property |
| def offset(self): |
| r"""Rank i gets i * ``offset`` additional inputs.""" |
| return 1 |
| |
| def _test_join_base( |
| self, |
| uneven_inputs: bool, |
| num_joinables: int, |
| enable: bool, |
| throw_on_early_termination: bool, |
| num_allreduces: int, |
| run_post_hooks: bool, |
| expected_total: Optional[int] = None, |
| ): |
| r""" |
| Skeleton for all :class:`Join` tests. |
| |
| Arguments: |
| uneven_inputs (bool): ``True`` to use uneven inputs; ``False`` |
| otherwise. |
| num_joinables (int): number of :class:`AllReducer` s to construct. |
| enable (bool): ``True`` to enable the join context manager; |
| ``False`` otherwise. |
| throw_on_early_termination (bool): ``True`` to raise an exception |
| upon detecting uneven inputs; ``False`` otherwise. |
| num_allreduces (int): number of all-reduces to perform per input. |
| run_post_hooks (bool): ``True`` to run post-hooks; ``False`` |
| otherwise. |
| expected_total (Optional[int]): ``None`` to not check the expected |
| all-reduce total; otherwise, the expected total; default is |
| ``None``. |
| """ |
| self.dist_init(self.rank, self.world_size) |
| |
| allreducers = [ |
| AllReducer(self.device, self.process_group) |
| for _ in range(num_joinables) |
| ] |
| for allreducer in allreducers: |
| self.assertEqual(allreducer.post_hook_tensor.item(), BEFORE_CONSTANT) |
| |
| inputs = self.construct_uneven_inputs(self.base_num_inputs, self.offset) \ |
| if uneven_inputs \ |
| else self.construct_even_inputs(self.base_num_inputs) |
| allreduce_total = 0 |
| |
| # Expect a `RuntimeError` if `throw_on_early_termination=True` |
| # Rank 0 exhausts its inputs first |
| expected_msg = "Rank 0 exhausted all inputs." if self.rank == 0 \ |
| else "Detected at least one rank that exhausted inputs. " \ |
| "Throwing across all ranks." |
| with self.assertRaisesRegex( |
| RuntimeError, |
| expected_msg |
| ) if throw_on_early_termination else contextlib.suppress(): |
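            # Extra keyword arguments to ``Join`` are forwarded to each
            # ``Joinable.join_hook()`` (here, ``num_allreduces`` and
            # ``run_post_hooks``)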
| with Join( |
| allreducers, |
| enable=enable, |
| throw_on_early_termination=throw_on_early_termination, |
| num_allreduces=num_allreduces, |
| run_post_hooks=run_post_hooks |
| ): |
| for _ in inputs: |
| for allreducer in allreducers: |
| allreduce_total += allreducer(num_allreduces) |
| |
| if throw_on_early_termination: |
| return |
| |
| # Check `expected_total` if not `None` |
        if expected_total is not None:
| self.assertEqual(allreduce_total, expected_total) |
| |
        # All `AllReducer` instances should have received the updated
        # `post_hook_tensor` from the last-joined process
| if run_post_hooks: |
| for allreducer in allreducers: |
| self.assertEqual(allreducer.post_hook_tensor.item(), AFTER_CONSTANT) |
| |
| @require_n_gpus_for_nccl_backend( |
| WORLD_SIZE, BACKEND |
| ) |
| def test_single_joinable_main_hooks(self): |
| r"""Tests the main hooks of a single :class:`Joinable`.""" |
| num_joinables = 1 |
| num_allreduces = 1 |
| run_post_hooks = False |
| # Non-joined processes all-reduce a 1, so this rank's all-reduce total |
| # should be precisely equal to the total number of inputs processed |
| # before it joined |
| expected_total = self.world_size * self.base_num_inputs |
| # Rank i runs for i additional iterations |
| for num_joined in range(1, self.rank + 1): |
| expected_total += (self.world_size - num_joined) * self.offset |
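        # e.g. with world_size=2, base_num_inputs=3, offset=1: rank 0 expects
        # 3 * 2 = 6, and rank 1 runs one extra iteration with only itself
        # unjoined, so it expects 6 + 1 = 7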
| |
| self._test_join_base( |
| uneven_inputs=True, |
| num_joinables=num_joinables, |
| enable=True, |
| throw_on_early_termination=False, |
| num_allreduces=num_allreduces, |
| run_post_hooks=run_post_hooks, |
| expected_total=expected_total |
| ) |
| |
| @require_n_gpus_for_nccl_backend( |
| WORLD_SIZE, BACKEND |
| ) |
| def test_single_joinable_post_hooks(self): |
| r"""Tests the post-hooks of a single :class:`Joinable`.""" |
| num_joinables = 1 |
        num_allreduces = 0  # set to 0 to skip the main hooks
        run_post_hooks = True
| |
| self._test_join_base( |
| uneven_inputs=True, |
| num_joinables=num_joinables, |
| enable=True, |
| throw_on_early_termination=False, |
| num_allreduces=num_allreduces, |
| run_post_hooks=run_post_hooks, |
| expected_total=None |
| ) |
| |
| @require_n_gpus_for_nccl_backend( |
| WORLD_SIZE, BACKEND |
| ) |
| def test_single_joinable(self): |
| r""" |
| Tests the main hooks and post-hooks of a single :class:`Joinable` |
| together. |
| |
| This combines ``test_single_joinable_main_hooks()`` and |
| ``test_single_joinable_post_hooks()`` into a single test to ensure that |
| main hooks and post-hooks operate correctly together. |
| """ |
| num_joinables = 1 |
| num_allreduces = 1 |
| run_post_hooks = True |
| |
| expected_total = self.world_size * self.base_num_inputs |
| for num_joined in range(1, self.rank + 1): |
| expected_total += (self.world_size - num_joined) * self.offset |
| |
| self._test_join_base( |
| uneven_inputs=True, |
| num_joinables=num_joinables, |
| enable=True, |
| throw_on_early_termination=False, |
| num_allreduces=num_allreduces, |
| run_post_hooks=run_post_hooks, |
| expected_total=expected_total |
| ) |
| |
| @require_n_gpus_for_nccl_backend( |
| WORLD_SIZE, BACKEND |
| ) |
| def test_multiple_joinables(self): |
| r""" |
| Tests the main hooks and post-hooks of multiple :class:`Joinable` s |
| together. |
| |
| This generalizes ``test_single_joinable()`` to multiple |
| :class:`Joinable` s. |
| """ |
| num_joinables = 3 |
| num_allreduces = 1 |
| run_post_hooks = True |
| |
| expected_total = self.world_size * self.base_num_inputs |
| for num_joined in range(1, self.rank + 1): |
| expected_total += (self.world_size - num_joined) * self.offset |
        # The expected total is now multiplied by a factor of `num_joinables`
| expected_total *= num_joinables |
| |
| self._test_join_base( |
| uneven_inputs=True, |
| num_joinables=num_joinables, |
| enable=True, |
| throw_on_early_termination=False, |
| num_allreduces=num_allreduces, |
| run_post_hooks=run_post_hooks, |
| expected_total=expected_total |
| ) |
| |
| @require_n_gpus_for_nccl_backend( |
| WORLD_SIZE, BACKEND |
| ) |
| def test_single_joinable_disable(self): |
| r"""Tests ``enable=False`` for a single :class:`Joinable`.""" |
| num_joinables = 1 |
| num_allreduces = 1 |
| uneven_inputs = False |
| enable = False |
| run_post_hooks = False |
| |
| expected_total = self.world_size * self.base_num_inputs |
| |
| self._test_join_base( |
| uneven_inputs=uneven_inputs, |
| num_joinables=num_joinables, |
| enable=enable, |
| throw_on_early_termination=False, |
| num_allreduces=num_allreduces, |
| run_post_hooks=run_post_hooks, |
| expected_total=expected_total |
| ) |
| |
| @require_n_gpus_for_nccl_backend( |
| WORLD_SIZE, BACKEND |
| ) |
| def test_multiple_joinable_disable(self): |
| r""" |
| Tests ``enable=False`` for multiple :class:`Joinable` s. |
| |
| This generalizes ``test_single_joinable_disable`` to multiple |
| :class:`Joinable` s. |
| """ |
| num_joinables = 3 |
| num_allreduces = 1 |
| uneven_inputs = False |
| enable = False |
| run_post_hooks = False |
| |
| expected_total = self.world_size * self.base_num_inputs * num_joinables |
| |
| self._test_join_base( |
| uneven_inputs=uneven_inputs, |
| num_joinables=num_joinables, |
| enable=enable, |
| throw_on_early_termination=False, |
| num_allreduces=num_allreduces, |
| run_post_hooks=run_post_hooks, |
| expected_total=expected_total |
| ) |
| |
| @require_n_gpus_for_nccl_backend( |
| WORLD_SIZE, BACKEND |
| ) |
| def test_single_joinable_throw(self): |
| r""" |
| Tests ``throw_on_early_termination=True`` for a single |
| :class:`Joinable`. |
| """ |
| num_joinables = 1 |
| num_allreduces = 1 |
| throw_on_early_termination = True |
| run_post_hooks = False |
| |
| self._test_join_base( |
| uneven_inputs=True, |
| num_joinables=num_joinables, |
| enable=True, |
| throw_on_early_termination=throw_on_early_termination, |
| num_allreduces=num_allreduces, |
| run_post_hooks=run_post_hooks, |
| expected_total=None |
| ) |
| |
| @require_n_gpus_for_nccl_backend( |
| WORLD_SIZE, BACKEND |
| ) |
| def test_multiple_joinables_throw(self): |
| r""" |
| Tests ``throw_on_early_termination=True`` for multiple |
| :class:`Joinable` s together. |
| |
| This generalizes ``test_single_joinable_throw`` to multiple |
| :class:`Joinable` s. |
| """ |
| num_joinables = 3 |
| num_allreduces = 1 |
| throw_on_early_termination = True |
| run_post_hooks = False |
| |
| self._test_join_base( |
| uneven_inputs=True, |
| num_joinables=num_joinables, |
| enable=True, |
| throw_on_early_termination=throw_on_early_termination, |
| num_allreduces=num_allreduces, |
| run_post_hooks=run_post_hooks, |
| expected_total=None |
| ) |
| |
| @require_n_gpus_for_nccl_backend( |
| WORLD_SIZE, BACKEND |
| ) |
| def test_join_kwargs(self): |
| r""" |
| Tests passing keyword arguments to the context manager. |
| """ |
| num_joinables = 1 |
| num_allreduces = 2 |
| run_post_hooks = False |
| |
| expected_total = self.world_size * self.base_num_inputs |
| for num_joined in range(1, self.rank + 1): |
| expected_total += (self.world_size - num_joined) * self.offset |
        # The expected total is now multiplied by a factor of `num_allreduces`
| expected_total *= num_allreduces |
| |
| self._test_join_base( |
| uneven_inputs=True, |
| num_joinables=num_joinables, |
| enable=True, |
| throw_on_early_termination=False, |
| num_allreduces=num_allreduces, |
| run_post_hooks=run_post_hooks, |
| expected_total=expected_total |
| ) |
| |

if __name__ == "__main__":
| run_tests() |