From 5cc75e46fa85cbb69d8220c46aba1a704fedd4e7 Mon Sep 17 00:00:00 2001 From: Pavel Belevich Date: Wed, 21 Apr 2021 22:09:13 -0700 Subject: [PATCH] Split test_c10d.py to test_c10d_common.py, test_c10d_gloo.py, test_c10d_nccl.py (#56598) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/56598 Test Plan: NA Reviewed By: SciPioneer Differential Revision: D27913170 fbshipit-source-id: 3439d18141131b02d55f2ca399a4c795cba2b04b --- .jenkins/pytorch/multigpu-test.sh | 4 +- .../win-test-helpers/test_distributed.bat | 8 +- test/distributed/test_c10d.py | 5125 ----------------- test/distributed/test_c10d_common.py | 970 ++++ test/distributed/test_c10d_gloo.py | 2072 +++++++ test/distributed/test_c10d_nccl.py | 2318 ++++++++ test/distributed/test_c10d_spawn.py | 416 +- test/run_test.py | 8 +- torch/distributed/CONTRIBUTING.md | 4 +- torch/lib/c10d/ProcessGroupGloo.cpp | 2 +- 10 files changed, 5383 insertions(+), 5544 deletions(-) delete mode 100644 test/distributed/test_c10d.py create mode 100644 test/distributed/test_c10d_common.py create mode 100644 test/distributed/test_c10d_gloo.py create mode 100644 test/distributed/test_c10d_nccl.py diff --git a/.jenkins/pytorch/multigpu-test.sh b/.jenkins/pytorch/multigpu-test.sh index 61c25a3412d..e7a56d26439 100755 --- a/.jenkins/pytorch/multigpu-test.sh +++ b/.jenkins/pytorch/multigpu-test.sh @@ -20,7 +20,9 @@ python tools/download_mnist.py --quiet -d test/cpp/api/mnist OMP_NUM_THREADS=2 TORCH_CPP_TEST_MNIST_PATH="test/cpp/api/mnist" build/bin/test_api time python test/run_test.py --verbose -i distributed/test_jit_c10d time python test/run_test.py --verbose -i distributed/test_distributed_fork -time python test/run_test.py --verbose -i distributed/test_c10d +time python test/run_test.py --verbose -i distributed/test_c10d_common +time python test/run_test.py --verbose -i distributed/test_c10d_gloo +time python test/run_test.py --verbose -i distributed/test_c10d_nccl time python test/run_test.py --verbose -i distributed/test_c10d_spawn time python test/run_test.py --verbose -i distributed/rpc/cuda/test_process_group_agent time python test/run_test.py --verbose -i distributed/rpc/cuda/test_tensorpipe_agent diff --git a/.jenkins/pytorch/win-test-helpers/test_distributed.bat b/.jenkins/pytorch/win-test-helpers/test_distributed.bat index 726c9adaa99..782d598d553 100644 --- a/.jenkins/pytorch/win-test-helpers/test_distributed.bat +++ b/.jenkins/pytorch/win-test-helpers/test_distributed.bat @@ -2,7 +2,13 @@ REM The first argument should lead to the python interpreter %1\python.exe test/run_test.py --verbose -i distributed/test_jit_c10d if %errorlevel% neq 0 ( exit /b %errorlevel% ) -%1\python.exe test/run_test.py --verbose -i distributed/test_c10d +%1\python.exe test/run_test.py --verbose -i distributed/test_c10d_common +if %errorlevel% neq 0 ( exit /b %errorlevel% ) + +%1\python.exe test/run_test.py --verbose -i distributed/test_c10d_gloo +if %errorlevel% neq 0 ( exit /b %errorlevel% ) + +%1\python.exe test/run_test.py --verbose -i distributed/test_c10d_nccl if %errorlevel% neq 0 ( exit /b %errorlevel% ) %1\python test/run_test.py --verbose -i distributed/test_c10d_spawn diff --git a/test/distributed/test_c10d.py b/test/distributed/test_c10d.py deleted file mode 100644 index 1b69a600b6c..00000000000 --- a/test/distributed/test_c10d.py +++ /dev/null @@ -1,5125 +0,0 @@ -import copy -import logging -import math -import operator -import os -import random -import signal -import sys -import tempfile -import threading -import time -import traceback 
-import unittest -from unittest import mock -from contextlib import contextmanager -from datetime import timedelta -from functools import reduce -from itertools import groupby, product -from sys import platform -import numpy - -import torch -import torch.distributed as c10d - -if not c10d.is_available(): - print("c10d not available, skipping tests", file=sys.stderr) - sys.exit(0) - -import torch.distributed as dist -import torch.distributed.algorithms.ddp_comm_hooks.default_hooks as default -import torch.distributed.algorithms.ddp_comm_hooks.powerSGD_hook as powerSGD -import torch.multiprocessing as mp -import torch.nn.functional as F -import torch.testing._internal.common_utils as common -from torch import nn -from torch._six import string_classes -from torch.nn.parallel import DistributedDataParallel -from torch.utils.checkpoint import checkpoint -from torch.testing._internal.common_distributed import ( - MultiProcessTestCase, - requires_gloo, - requires_nccl, - requires_nccl_version, - skip_if_lt_x_gpu, - get_timeout, - skip_if_rocm, - simple_sparse_reduce_tests, - skip_if_win32, - create_device, - with_dist_debug_levels, - with_nccl_blocking_wait, - create_tcp_store, -) -from torch.testing._internal.common_utils import ( - TestCase, - load_tests, - run_tests, - retry_on_connect_failures, - ADDRESS_IN_USE, - CONNECT_TIMEOUT, - TEST_WITH_TSAN, - IS_WINDOWS, -) - - -# load_tests from common_utils is used to automatically filter tests for -# sharding on sandcastle. This line silences flake warnings -load_tests = load_tests - - -if platform == "darwin": - LOOPBACK = "lo0" -else: - LOOPBACK = "lo" - -DEFAULT_HOSTNAME = "localhost" - - -def gpus_for_rank(world_size): - """Multigpu tests are designed to simulate the multi nodes with multi - GPUs on each node. Nccl backend requires equal #GPUs in each process. - On a single node, all visible GPUs are evenly - divided to subsets, each process only uses a subset. - """ - visible_devices = list(range(torch.cuda.device_count())) - gpus_per_process = torch.cuda.device_count() // world_size - gpus_for_rank = [] - for rank in range(world_size): - gpus_for_rank.append( - visible_devices[rank * gpus_per_process : (rank + 1) * gpus_per_process] - ) - return gpus_for_rank - - -def simple_reduce_tests(rank, world_size): - tests = [ - ( - c10d.ReduceOp.SUM, - torch.tensor([rank + 1.0]), - torch.tensor([float(world_size * (world_size + 1) / 2)]), - ), - ( - c10d.ReduceOp.PRODUCT, - torch.tensor([rank + 1.0]), - torch.tensor([float(math.factorial(world_size))]), - ), - ( - c10d.ReduceOp.MIN, - torch.tensor([rank + 1.0]), - torch.tensor([1.0]), - ), - ( - c10d.ReduceOp.MAX, - torch.tensor([rank + 1.0]), - torch.tensor([world_size]), - ), - ] - - # Generate tests for BAND. - # The bit that is set changes in every iteration to check - # that the output changes accordingly. - for i in range(4): - vin = rank | (1 << i) - vout = 1 << i - tests.append( - ( - c10d.ReduceOp.BAND, - torch.tensor([vin], dtype=torch.int32), - torch.tensor([vout], dtype=torch.int32), - ), - ) - - # Generate tests for BOR. - # These emulate a larger world size per iteration by having every - # rank contribute multiple values that are pre-OR'ed. - for i in range(1, 5): - vin = reduce(operator.or_, [rank * i + j for j in range(i)]) - vout = reduce(operator.or_, range(world_size * i)) - tests.append( - ( - c10d.ReduceOp.BOR, - torch.tensor([vin], dtype=torch.int32), - torch.tensor([vout], dtype=torch.int32), - ), - ) - - # Generate tests for XOR. 
- # These emulate a larger world size per iteration by having every - # rank contribute multiple values that are pre-XOR'ed. - for i in range(1, 5): - vin = reduce(operator.xor, [rank * i + j for j in range(i)]) - vout = reduce(operator.xor, range(world_size * i)) - tests.append( - ( - c10d.ReduceOp.BXOR, - torch.tensor([vin], dtype=torch.int32), - torch.tensor([vout], dtype=torch.int32), - ), - ) - - return tests - - -def simple_coalesced_reduce_tests(rank, world_size): - return [ - ( - c10d.ReduceOp.SUM, - [torch.tensor([rank + 1]), torch.tensor([(rank + 1) ** 2])], - [ - torch.tensor([float(world_size * (world_size + 1) / 2)]), - torch.tensor( - [float(world_size * (world_size + 1) * (2 * world_size + 1) / 6)] - ), - ], - ), - ( - c10d.ReduceOp.PRODUCT, - [torch.tensor([rank + 1.0]), torch.tensor([rank + 2.0])], - [ - torch.tensor([float(math.factorial(world_size))]), - torch.tensor([float(math.factorial(world_size + 1))]), - ], - ), - ( - c10d.ReduceOp.MIN, - [torch.tensor([rank + x]) for x in [0.0, 1.0]], - [torch.tensor([0.0]), torch.tensor([1.0])], - ), - ( - c10d.ReduceOp.MAX, - [torch.tensor([rank + x]) for x in [1.0, 2.0]], - [torch.tensor([world_size]), torch.tensor([world_size + 1.0])], - ), - ] - - -def simple_multi_input_reduce_tests(rank, world_size): - return [ - ( - c10d.ReduceOp.SUM, - [torch.tensor([2 * rank + 0.0]), torch.tensor([2 * rank + 1.0])], - torch.tensor([float(world_size * (2 * world_size - 1))]), - ), - ( - c10d.ReduceOp.PRODUCT, - [torch.tensor([2 * rank + 1.0]), torch.tensor([2 * rank + 2.0])], - torch.tensor([float(math.factorial(2 * world_size))]), - ), - ( - c10d.ReduceOp.MIN, - [torch.tensor([2 * rank + 1.0]), torch.tensor([2 * rank + 2.0])], - torch.tensor([1.0]), - ), - ( - c10d.ReduceOp.MAX, - [torch.tensor([2 * rank + 1.0]), torch.tensor([2 * rank + 2.0])], - torch.tensor([2 * world_size]), - ), - ] - - -class StoreTestBase(object): - def _create_store(self, i): - raise RuntimeError("not implemented") - - def _test_set_get(self, fs): - fs.add("key", 1) - fs.add("key", 2) - fs.add("key", 3) - fs.set("key0", "value0") - fs.add("key3", 1) - fs.set("key1", "value1") - fs.add("key3", 2) - fs.set("key2", "value2") - fs.add("key3", 3) - fs.add("key3", 4) - fs.add("key3", 5) - fs.add("key3", 6) - self.assertEqual(fs.num_keys(), self.num_keys_total) - self.assertEqual(b"6", fs.get("key")) - self.assertEqual(b"value0", fs.get("key0")) - self.assertEqual(b"value1", fs.get("key1")) - self.assertEqual(b"value2", fs.get("key2")) - self.assertEqual(b"21", fs.get("key3")) - - def test_set_get(self): - self._test_set_get(self._create_store()) - - def test_compare_set(self): - store = self._create_store() - missing_key_result = store.compare_set("key0", "wrong_old_value", "new_value0") - self.assertEqual(b"wrong_old_value", missing_key_result) - - store.set("key0", "value0") - self.assertEqual(b"value0", store.get("key0")) - old_value_result = store.compare_set("key0", "wrong_old_value", "new_value0") - self.assertEqual(b"value0", old_value_result) - self.assertEqual(b"value0", store.get("key0")) - new_value_result = store.compare_set("key0", "value0", "new_value0") - self.assertEqual(b"new_value0", new_value_result) - self.assertEqual(b"new_value0", store.get("key0")) - - # This is the number of keys used in test_set_get. Adding this as a class - # property instead of hardcoding in the test since some Store - # implementations will have differing number of keys. In the base case, - # there will be 5 keys: key, key0, key1, key2, key3. 
- @property - def num_keys_total(self): - return 5 - - -class FileStoreTest(TestCase, StoreTestBase): - def setUp(self): - super(FileStoreTest, self).setUp() - self.file = tempfile.NamedTemporaryFile(delete=False) - - def _create_store(self): - store = c10d.FileStore(self.file.name, 1) - store.set_timeout(timedelta(seconds=300)) - return store - -@skip_if_win32() -class HashStoreTest(TestCase, StoreTestBase): - def setUp(self): - super(HashStoreTest, self).setUp() - - def _create_store(self): - store = c10d.HashStore() - store.set_timeout(timedelta(seconds=300)) - return store - -class PrefixFileStoreTest(TestCase, StoreTestBase): - def setUp(self): - super(PrefixFileStoreTest, self).setUp() - self.file = tempfile.NamedTemporaryFile(delete=False) - self.filestore = c10d.FileStore(self.file.name, 1) - self.prefix = "test_prefix" - self.filestore.set_timeout(timedelta(seconds=300)) - - def _create_store(self): - return c10d.PrefixStore(self.prefix, self.filestore) - -class TCPStoreTest(TestCase, StoreTestBase): - def _create_store(self): - store = create_tcp_store() - store.set_timeout(timedelta(seconds=300)) - return store - - def test_address_already_in_use(self): - if sys.platform == "win32": - err_msg_reg = "Only one usage of each socket address*" - else: - err_msg_reg = "^Address already in use$" - with self.assertRaisesRegex(RuntimeError, err_msg_reg): - addr = DEFAULT_HOSTNAME - port = common.find_free_port() - - # Use noqa to silence flake8. - # Need to store in an unused variable here to ensure the first - # object is not destroyed before the second object is created. - store1 = c10d.TCPStore(addr, port, 1, True) # noqa: F841 - store2 = c10d.TCPStore(addr, port, 1, True) # noqa: F841 - - # The TCPStore has 6 keys in test_set_get. It contains the 5 keys added by - # the user and one additional key used for coordinate all the workers. 
- @property - def num_keys_total(self): - return 6 - - def _test_numkeys_delkeys(self, fs): - # We start off with one init key in the store to coordinate workers - self.assertEqual(fs.num_keys(), 1) - fs.add("key", 1) - fs.add("key", 2) - fs.add("key", 3) - fs.set("key0", "value0") - fs.add("key3", 1) - fs.set("key1", "value1") - self.assertEqual(fs.num_keys(), 5) - fs.delete_key("key") - self.assertEqual(fs.num_keys(), 4) - fs.set_timeout(timedelta(seconds=2)) - with self.assertRaises(RuntimeError): - fs.get("key") - fs.delete_key("key0") - fs.delete_key("key3") - self.assertEqual(fs.num_keys(), 2) - fs.set("key4", "value2") - self.assertEqual(fs.num_keys(), 3) - self.assertEqual(b"value1", fs.get("key1")) - self.assertEqual(b"value2", fs.get("key4")) - - def test_numkeys_delkeys(self): - self._test_numkeys_delkeys(self._create_store()) - - def _create_client(self, index, addr, port, world_size, messages): - try: - client_store = dist.TCPStore(addr, port, world_size, timeout=timedelta(seconds=10)) - self.assertEqual("value".encode(), client_store.get("key")) - client_store.set(f"new_key{index}", f"new_value{index}") - self.assertEqual(f"next_value{index}".encode(), - client_store.compare_set(f"new_key{index}", f"new_value{index}", f"next_value{index}")) - except Exception: - messages.put('Caught exception: \n{}exiting process with exit code: {}' - .format(traceback.format_exc(), MultiProcessTestCase.TEST_ERROR_EXIT_CODE)) - sys.exit(MultiProcessTestCase.TEST_ERROR_EXIT_CODE) - - def _multi_worker_helper(self, world_size): - addr = DEFAULT_HOSTNAME - server_store = create_tcp_store(addr, world_size, wait_for_workers=False) - server_store.set("key", "value") - port = server_store.port - messages = mp.Queue() - processes = [] - num_proccesses = random.randint(3, 5) if world_size == -1 else world_size - for i in range(num_proccesses): - p = mp.Process(target=self._create_client, args=(i, addr, port, world_size, messages)) - processes.append(p) - p.start() - for p in processes: - p.join() - error_message = "" - while not messages.empty(): - error_message += messages.get() + "\n" - if any([p.exitcode != 0 for p in processes]): - raise RuntimeError(error_message) - - @unittest.skipIf( - IS_WINDOWS, "Skip test for windows due to multiprocessing library error when using windows spawn" - ) - def test_multi_worker_with_fixed_world_size(self): - self._multi_worker_helper(5) - - @unittest.skipIf( - IS_WINDOWS, "Skip test for windows due to multiprocessing library error when using windows spawn" - ) - def test_multi_worker_with_nonfixed_world_size(self): - self._multi_worker_helper(-1) - - -class PrefixTCPStoreTest(TestCase, StoreTestBase): - def setUp(self): - super(PrefixTCPStoreTest, self).setUp() - self.tcpstore = create_tcp_store() - self.prefix = "test_prefix" - self.tcpstore.set_timeout(timedelta(seconds=300)) - - def _create_store(self): - return c10d.PrefixStore(self.prefix, self.tcpstore) - - # The PrefixTCPStore has 6 keys in test_set_get. It contains the 5 keys - # added by the user and one additional key used for coordinate all the - # workers. 
- @property - def num_keys_total(self): - return 6 - - -class MyPythonStore(c10d.Store): - def __init__(self): - super(MyPythonStore, self).__init__() - self.store = dict() - - def set(self, key, value): - if not isinstance(key, string_classes): - raise AssertionError("Expected set to be called with string key") - if type(value) is not bytes: - raise AssertionError("Expected set to be called with bytes value") - self.store[key] = value - - def get(self, key): - value = self.store.get(key, b"") - if type(value) is not bytes: - raise AssertionError("Expected get to return bytes value") - return value - - def add(self, key, value): - new = int(self.store.get(key, 0)) + value - self.set(key, bytes(str(new).encode("utf-8"))) - return new - - -class PythonStoreTest(TestCase): - def setUp(self): - super(PythonStoreTest, self).setUp() - - def test_set_get(self): - # If we were to inherit from StoreTestBase and try to use - # its test_set_get function, we would exercise the Python - # API directly, instead of going through the C++ trampoline. - # We care about testing the C++ trampoline, so run the - # equivalent of StoreTestBase.test_set_get from C++. - # See `torch/csrc/distributed/c10d/init.cpp` for the definition - # of this test function. - c10d._test_python_store(MyPythonStore()) - - -class RendezvousTest(TestCase): - def test_unknown_handler(self): - with self.assertRaisesRegex(RuntimeError, "^No rendezvous handler"): - c10d.rendezvous("invalid://") - - -class RendezvousEnvTest(TestCase): - @retry_on_connect_failures - @requires_nccl() - def test_common_errors(self): - if torch.cuda.device_count() == 0: - raise unittest.SkipTest("No GPUs available, skipping test") - - vars = { - "WORLD_SIZE": "1", - "RANK": "0", - "MASTER_ADDR": "127.0.0.1", - "MASTER_PORT": str(common.find_free_port()), - } - - class Env(object): - def __init__(self, vars): - self.env_patcher = mock.patch.dict(os.environ, vars, clear=True) - - def __enter__(self): - self.env_patcher.start() - - def __exit__(self, type, value, traceback): - self.env_patcher.stop() - - def without(d, key): - d = d.copy() - d.pop(key) - return d - - def withouts(d, keys): - d = d.copy() - for key in keys: - d.pop(key) - return d - - with Env(without(vars, "WORLD_SIZE")): - self.assertEqual(None, os.environ.get("WORLD_SIZE")) - with self.assertRaisesRegex(ValueError, "WORLD_SIZE expected"): - gen = c10d.rendezvous("env://") - next(gen) - c10d.init_process_group(backend="nccl", world_size=1) - self.assertEqual(c10d.get_rank(), 0) - self.assertEqual(c10d.get_world_size(), 1) - c10d.destroy_process_group() - - with Env(without(vars, "RANK")): - self.assertEqual(None, os.environ.get("RANK")) - with self.assertRaisesRegex(ValueError, "RANK expected"): - gen = c10d.rendezvous("env://") - next(gen) - c10d.init_process_group(backend="nccl", rank=0) - self.assertEqual(c10d.get_rank(), 0) - self.assertEqual(c10d.get_world_size(), 1) - c10d.destroy_process_group() - - with Env(withouts(vars, ["RANK", "WORLD_SIZE"])): - self.assertEqual(None, os.environ.get("RANK")) - self.assertEqual(None, os.environ.get("WORLD_SIZE")) - c10d.init_process_group(backend="nccl", rank=0, world_size=1) - self.assertEqual(c10d.get_rank(), 0) - self.assertEqual(c10d.get_world_size(), 1) - c10d.destroy_process_group() - - with Env(vars): - c10d.init_process_group(backend="nccl") - self.assertEqual(c10d.get_rank(), 0) - self.assertEqual(c10d.get_world_size(), 1) - c10d.destroy_process_group() - - with Env(without(vars, "MASTER_ADDR")): - self.assertEqual(None, 
os.environ.get("MASTER_ADDR")) - with self.assertRaisesRegex(ValueError, "MASTER_ADDR expected"): - gen = c10d.rendezvous("env://") - next(gen) - - with Env(without(vars, "MASTER_PORT")): - self.assertEqual(None, os.environ.get("MASTER_PORT")) - with self.assertRaisesRegex(ValueError, "MASTER_PORT expected"): - gen = c10d.rendezvous("env://") - next(gen) - - with Env(without(vars, "WORLD_SIZE")): - self.assertEqual(None, os.environ.get("WORLD_SIZE")) - gen = c10d.rendezvous("env://?world_size={}".format(1)) - _, _, size = next(gen) - self.assertEqual(size, 1) - - with Env(without(vars, "RANK")): - self.assertEqual(None, os.environ.get("RANK")) - gen = c10d.rendezvous("env://?rank={}".format(0)) - _, rank, _ = next(gen) - self.assertEqual(rank, 0) - - with Env(withouts(vars, ["RANK", "WORLD_SIZE"])): - self.assertEqual(None, os.environ.get("RANK")) - self.assertEqual(None, os.environ.get("WORLD_SIZE")) - gen = c10d.rendezvous("env://?rank={}&world_size={}".format(0, 1)) - _, rank, size = next(gen) - self.assertEqual(rank, 0) - self.assertEqual(size, 1) - - @retry_on_connect_failures - def test_nominal(self): - os.environ["WORLD_SIZE"] = "1" - os.environ["MASTER_ADDR"] = "127.0.0.1" - os.environ["MASTER_PORT"] = str(common.find_free_port()) - - # Single rank - os.environ["RANK"] = "0" - gen0 = c10d.rendezvous("env://") - store0, rank0, size0 = next(gen0) - self.assertEqual(0, rank0) - self.assertEqual(1, size0) - - store0.set("key0", "value0") - - # check with get - self.assertEqual(b"value0", store0.get("key0")) - - @retry_on_connect_failures - def test_logging_init(self): - os.environ["WORLD_SIZE"] = "1" - os.environ["MASTER_ADDR"] = "127.0.0.1" - os.environ["MASTER_PORT"] = str(common.find_free_port()) - os.environ["RANK"] = "0" - - previous_handlers = logging.root.handlers - - c10d.init_process_group(backend="gloo", init_method="env://") - - current_handlers = logging.root.handlers - self.assertEqual(len(previous_handlers), len(current_handlers)) - for current, previous in zip(current_handlers, previous_handlers): - self.assertEqual(current, previous) - - c10d.destroy_process_group() - - -class RendezvousFileTest(TestCase): - def test_common_errors(self): - with self.assertRaisesRegex(ValueError, "path missing"): - gen = c10d.rendezvous("file://?rank=0&world_size=1") - next(gen) - with self.assertRaisesRegex(ValueError, "rank parameter missing"): - gen = c10d.rendezvous("file:///tmp/foo?world_size=1") - next(gen) - with self.assertRaisesRegex(ValueError, "size parameter missing"): - gen = c10d.rendezvous("file:///tmp/foo?rank=0") - next(gen) - - def test_nominal(self): - with tempfile.NamedTemporaryFile(delete=False) as file: - url = f'file:///{file.name.replace(os.path.sep, "/")}?world_size=2' - gen0 = c10d.rendezvous(url + "&rank=0") - store0, rank0, size0 = next(gen0) - self.assertEqual(0, rank0) - self.assertEqual(2, size0) - gen1 = c10d.rendezvous(url + "&rank=1") - store1, rank1, size1 = next(gen1) - self.assertEqual(1, rank1) - self.assertEqual(2, size1) - - # Set value on both stores - store0.set("key0", "value0") - store1.set("key1", "value1") - - # Cross check with get - self.assertEqual(b"value0", store1.get("key0")) - self.assertEqual(b"value1", store0.get("key1")) - - -@skip_if_win32() -class RendezvousTCPTest(TestCase): - def create_tcp_url(self): - addr = DEFAULT_HOSTNAME - port = common.find_free_port() - url = "tcp://%s:%d?world_size=%d" % (addr, port, 1) - return url - - def test_common_errors(self): - with self.assertRaisesRegex(ValueError, "port number missing"): - 
gen = c10d.rendezvous("tcp://127.0.0.1?rank=0&world_size=1") - next(gen) - with self.assertRaisesRegex(ValueError, "rank parameter missing"): - gen = c10d.rendezvous("tcp://127.0.0.1:23456?world_size=1") - next(gen) - with self.assertRaisesRegex(ValueError, "size parameter missing"): - gen = c10d.rendezvous("tcp://127.0.0.1:23456?rank=0") - next(gen) - - @retry_on_connect_failures - def test_nominal(self): - url = self.create_tcp_url() - gen0 = c10d.rendezvous(url + "&rank=0") - store0, rank0, size0 = next(gen0) - self.assertEqual(0, rank0) - self.assertEqual(1, size0) - - # Set value on the single store - store0.set("key0", "value0") - - # check with get - self.assertEqual(b"value0", store0.get("key0")) - - @retry_on_connect_failures(connect_errors=(CONNECT_TIMEOUT, ADDRESS_IN_USE)) - def test_tcp_store_timeout_set(self): - url = self.create_tcp_url() - test_store_timeout = timedelta(seconds=10) - gen0 = c10d.rendezvous(url + "&rank=0", timeout=test_store_timeout) - store0, rank0, size0 = next(gen0) - # this should time out in 10s. If the timeout passed into rendezvous was - # not respected, it will take much longer to timeout. - start = time.time() - with self.assertRaisesRegex(RuntimeError, "Timeout"): - store0.get("nonexistant key") - - end = time.time() - time_diff = end - start - self.assertGreater(test_store_timeout.seconds * 10, time_diff) - - -class TimeoutTest(TestCase): - def _test_store_timeout(self, backend, init_method, c2p): - try: - c10d.distributed_c10d.init_process_group( - backend=backend, - init_method=init_method, - world_size=1, - rank=0, - timeout=timedelta(seconds=1), - ) - default_store = c10d.distributed_c10d._get_default_store() - tik = time.time() - with self.assertRaisesRegex(RuntimeError, "Timeout"): - default_store.get("nonexistent key") - tok = time.time() - c10d.destroy_process_group() - c2p.append(float(tok - tik)) - except RuntimeError as e: - # catch "Address already in use" error and report it to the main - # thread - c2p.append(e) - - def _init_methods(self): - f = tempfile.NamedTemporaryFile(delete=False) - if sys.platform == "win32": - yield "file:///%s" % f.name.replace("\\", "/") - f.close() - else: - yield "file://%s" % f.name - f.close() - yield "tcp://127.0.0.1:%d" % common.find_free_port() - - def _test_default_store_timeout(self, backend): - for init_method in self._init_methods(): - c2p = [] - t = threading.Thread( - target=self._test_store_timeout, args=(backend, init_method, c2p) - ) - t.daemon = True - t.start() - t.join(5) - - self.assertEqual(1, len(c2p)) - if isinstance(c2p[0], float): - # waiting time should be 1s, use 3s to rule out false alarm - self.assertGreater(3, c2p[0]) - elif isinstance(c2p[0], RuntimeError): - # let @retry_on_connect_failures handle the error - raise c2p[0] - else: - raise RuntimeError("Unexpected type {}".format(type(c2p[0]))) - - @requires_nccl() - @retry_on_connect_failures - def test_default_store_timeout_nccl(self): - if torch.cuda.device_count() == 0: - raise unittest.SkipTest("No GPUs available, skipping test") - self._test_default_store_timeout("nccl") - - @requires_gloo() - @retry_on_connect_failures - def test_default_store_timeout_gloo(self): - self._test_default_store_timeout("gloo") - - -@requires_gloo() -@unittest.skipIf( - TEST_WITH_TSAN, - "TSAN is not fork-safe since we're forking in a multi-threaded environment", -) -class ProcessGroupGlooTest(MultiProcessTestCase): - def setUp(self): - super(ProcessGroupGlooTest, self).setUp() - - # For Windows platform, Python does not support fork, change 
it to spawn here. - if sys.platform == "win32": - self._spawn_processes() - else: - self._fork_processes() - - def opts(self, threads=2): - opts = c10d.ProcessGroupGloo._Options() - opts._timeout = 5.0 - opts._devices = [create_device(interface=LOOPBACK)] - opts._threads = threads - return opts - - def test_multi_device_constructor(self): - store = c10d.FileStore(self.file_name, self.world_size) - opts = c10d.ProcessGroupGloo._Options() - opts._timeout = 5.0 - opts._devices = [ - create_device(interface=LOOPBACK), - create_device(interface=LOOPBACK), - ] - pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, opts) - - # Execute 2x the number of operations to ensure we use every device. - for work in [pg.allreduce(torch.ones(i + 1)) for i in range(4)]: - work.wait() - - def test_empty_tensors(self): - store = c10d.FileStore(self.file_name, self.world_size) - pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts()) - - xs = [torch.FloatTensor([])] - pg.broadcast(xs).wait() - self.assertEqual(0, xs[0].numel()) - - def test_broadcast_checks(self): - store = c10d.FileStore(self.file_name, self.world_size) - pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts()) - - t1 = torch.zeros([1], dtype=torch.float32) - t2 = torch.zeros([1], dtype=torch.float64) - t3 = torch.zeros([2], dtype=torch.float32) - - with self.assertRaisesRegex(ValueError, "invalid root rank"): - opts = c10d.BroadcastOptions() - opts.rootRank = -1 - opts.rootTensor = 0 - pg.broadcast([t1], opts) - - with self.assertRaisesRegex(ValueError, "invalid root rank"): - opts = c10d.BroadcastOptions() - opts.rootRank = self.world_size - opts.rootTensor = 0 - pg.broadcast([t1], opts) - - with self.assertRaisesRegex(ValueError, "invalid root tensor"): - opts = c10d.BroadcastOptions() - opts.rootRank = self.rank - opts.rootTensor = -1 - pg.broadcast([t1], opts) - - with self.assertRaisesRegex(ValueError, "invalid root tensor"): - opts = c10d.BroadcastOptions() - opts.rootRank = self.rank - opts.rootTensor = 1 - pg.broadcast([t1], opts) - - with self.assertRaisesRegex(ValueError, "invalid root tensor"): - opts = c10d.BroadcastOptions() - opts.rootRank = self.rank - opts.rootTensor = 0 - pg.broadcast([], opts) - - with self.assertRaisesRegex(ValueError, "invalid tensor type"): - opts = c10d.BroadcastOptions() - opts.rootRank = self.rank - opts.rootTensor = 0 - pg.broadcast([t1, t2], opts) - - with self.assertRaisesRegex(ValueError, "invalid tensor size"): - opts = c10d.BroadcastOptions() - opts.rootRank = self.rank - opts.rootTensor = 0 - pg.broadcast([t1, t3], opts) - - def _test_broadcast_basics(self, fn): - store = c10d.FileStore(self.file_name, self.world_size) - pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts()) - - def broadcast(xs, rootRank, rootTensor): - opts = c10d.BroadcastOptions() - opts.rootRank = rootRank - opts.rootTensor = rootTensor - work = pg.broadcast(xs, opts) - work.wait() - - # Every rank is root once - for i in range(self.world_size): - # Run with 1 input tensor - x = fn(torch.tensor([self.rank])) - broadcast([x], i, 0) - # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095 - self.assertEqualIgnoreType(torch.tensor([i]), x) - - # Run with 2 input tensors - num = 2 - for j in range(num): - xs = [ - fn(torch.tensor([self.rank * num + 0.0])), - fn(torch.tensor([self.rank * num + 1.0])), - ] - - broadcast(xs, i, j) - # TODO(#38095): Replace assertEqualIgnoreType. 
See issue #38095 - self.assertEqualIgnoreType(torch.tensor([i * num + j]), xs[0]) - # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095 - self.assertEqualIgnoreType(torch.tensor([i * num + j]), xs[1]) - - # Test overloaded convenience function - x = torch.tensor([self.rank + 1.0]) - work = pg.broadcast(x, root=0) - work.wait() - self.assertEqual(torch.tensor([1.0]), x) - - def test_broadcast_basics(self): - self._test_broadcast_basics(lambda t: t.clone()) - - @skip_if_lt_x_gpu(2) - def test_broadcast_basics_cuda(self): - self._test_broadcast_basics(lambda t: t.clone().cuda()) - - def _test_broadcast_stress(self, inputs): - store = c10d.FileStore(self.file_name, self.world_size) - pg = c10d.ProcessGroupGloo( - store, self.rank, self.world_size, self.opts(threads=8) - ) - work_handles = [ - pg.broadcast(inputs[i], root=(i % self.world_size)) - for i in range(len(inputs)) - ] - for i, work_handle in enumerate(work_handles): - work_handle.wait() - self.assertEqual( - torch.tensor([(i * self.world_size) + (i % self.world_size)]), - inputs[i], - msg=("Mismatch in iteration %d" % i), - ) - - def test_broadcast_stress(self): - inputs = [torch.tensor([i * self.world_size + self.rank]) for i in range(1000)] - self._test_broadcast_stress(inputs) - - @skip_if_lt_x_gpu(2) - def test_broadcast_stress_cuda(self): - inputs = [ - torch.tensor([i * self.world_size + self.rank]).cuda() for i in range(1000) - ] - self._test_broadcast_stress(inputs) - - def test_allreduce_checks(self): - store = c10d.FileStore(self.file_name, self.world_size) - pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts()) - - t1 = torch.zeros([1], dtype=torch.float32) - t2 = torch.zeros([1], dtype=torch.float64) - t3 = torch.zeros([2], dtype=torch.float32) - - with self.assertRaisesRegex(ValueError, "requires non-empty tensor list"): - opts = c10d.AllreduceOptions() - pg.allreduce([], opts) - - with self.assertRaisesRegex(ValueError, "invalid tensor type"): - opts = c10d.AllreduceOptions() - pg.allreduce([t1, t2], opts) - - with self.assertRaisesRegex(ValueError, "invalid tensor size"): - opts = c10d.AllreduceOptions() - pg.allreduce([t1, t3], opts) - - def _test_allreduce_basics(self, fn): - store = c10d.FileStore(self.file_name, self.world_size) - pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts()) - - # Single input tests - tests = simple_reduce_tests(self.rank, self.world_size) - for (op, input, output) in tests: - opts = c10d.AllreduceOptions() - opts.reduceOp = op - tensor = fn(input) - work = pg.allreduce([tensor], opts) - work.wait() - # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095 - self.assertEqualIgnoreType(output, tensor) - - # Multi input tests - tests = simple_multi_input_reduce_tests(self.rank, self.world_size) - for (op, inputs, output) in tests: - opts = c10d.AllreduceOptions() - opts.reduceOp = op - tensors = [fn(input) for input in inputs] - work = pg.allreduce(tensors, opts) - work.wait() - for tensor in tensors: - # TODO(#38095): Replace assertEqualIgnoreType. 
See issue #38095 - self.assertEqualIgnoreType(output, tensor) - - # Test overloaded convenience function (defaults to using sum) - x = fn(torch.tensor([self.rank + 1.0])) - work = pg.allreduce(x) - work.wait() - self.assertEqual( - torch.tensor([float(self.world_size * (self.world_size + 1) / 2)]), x - ) - - def test_allreduce_basics(self): - self._test_allreduce_basics(lambda t: t.clone()) - - @skip_if_lt_x_gpu(2) - def test_allreduce_basics_cuda(self): - self._test_allreduce_basics(lambda t: t.clone().cuda()) - - def _test_allreduce_stress(self, inputs): - store = c10d.FileStore(self.file_name, self.world_size) - pg = c10d.ProcessGroupGloo( - store, self.rank, self.world_size, self.opts(threads=8) - ) - work_handles = [pg.allreduce(inputs[i]) for i in range(len(inputs))] - for i, work_handle in enumerate(work_handles): - work_handle.wait() - # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095 - self.assertEqualIgnoreType( - torch.tensor( - [ - (i * self.world_size) - + (self.world_size * (self.world_size - 1) / 2) - ] - ), - inputs[i], - msg=("Mismatch in iteration %d" % i), - ) - - def test_allreduce_stress(self): - inputs = [torch.tensor([i + self.rank]) for i in range(1000)] - self._test_allreduce_stress(inputs) - - @skip_if_lt_x_gpu(2) - def test_allreduce_stress_cuda(self): - inputs = [torch.tensor([i + self.rank]).cuda() for i in range(1000)] - self._test_allreduce_stress(inputs) - - def test_allreduce_coalesced_checks(self): - store = c10d.FileStore(self.file_name, self.world_size) - pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts()) - - t1 = torch.zeros(1, dtype=torch.float32) - t2 = torch.zeros(1, dtype=torch.float64) - t3 = torch.sparse_coo_tensor([[0]], [1], size=(1,)) - - with self.assertRaisesRegex(ValueError, "requires non-empty tensor list"): - opts = c10d.AllreduceCoalescedOptions() - pg.allreduce_coalesced([], opts) - - with self.assertRaisesRegex(ValueError, "tensors must all have the same type"): - opts = c10d.AllreduceCoalescedOptions() - pg.allreduce_coalesced([t1, t2], opts) - - with self.assertRaisesRegex(ValueError, "invalid tensor layout at index"): - opts = c10d.AllreduceCoalescedOptions() - pg.allreduce_coalesced([t1, t3], opts) - - with self.assertRaisesRegex(ValueError, "unsupported layout"): - opts = c10d.AllreduceCoalescedOptions() - pg.allreduce_coalesced([t3, t3.clone()], opts) - - @skip_if_lt_x_gpu(1) - def test_allreduce_coalesced_checks_cuda(self): - store = c10d.FileStore(self.file_name, self.world_size) - pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts()) - - t1 = torch.zeros(1, dtype=torch.float32) - - with self.assertRaisesRegex(ValueError, "unsupported device type"): - opts = c10d.AllreduceCoalescedOptions() - pg.allreduce_coalesced([t1.cuda(), t1.cuda()], opts) - - def _test_allreduce_coalesced_basics(self, fn): - store = c10d.FileStore(self.file_name, self.world_size) - pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts()) - - test_cases = simple_coalesced_reduce_tests(self.rank, self.world_size) - for op, inputs, outputs in test_cases: - opts = c10d.AllreduceCoalescedOptions() - opts.reduceOp = op - tensors = [fn(x) for x in inputs] - work = pg.allreduce_coalesced(tensors, opts) - work.wait() - for result_tensor, expected in zip(tensors, outputs): - # TODO(#38095): Replace assertEqualIgnoreType. 
See issue #38095 - self.assertEqualIgnoreType(result_tensor, expected) - - def test_allreduce_coalesced_basics(self): - self._test_allreduce_coalesced_basics(lambda t: t.clone()) - - def _test_allreduce_coalesced_stress(self, inputs): - store = c10d.FileStore(self.file_name, self.world_size) - pg = c10d.ProcessGroupGloo( - store, self.rank, self.world_size, self.opts(threads=8) - ) - work_handles = [pg.allreduce_coalesced(input) for input in inputs] - for i, work_handle in enumerate(work_handles): - work_handle.wait() - # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095 - self.assertEqualIgnoreType( - 2 - * [ - torch.tensor( - [ - (i * self.world_size) - + (self.world_size * (self.world_size - 1) / 2) - ] - ) - ], - inputs[i], - msg="Mismatch in interation {}".format(i), - ) - - def test_allreduce_coalesced_stress(self): - inputs = [2 * [torch.tensor([i + self.rank])] for i in range(1000)] - self._test_allreduce_coalesced_stress(inputs) - - def test_sparse_allreduce_checks(self): - store = c10d.FileStore(self.file_name, self.world_size) - pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts()) - - t1 = torch.zeros([1]) - t2 = torch.sparse_coo_tensor([[0]], [1], size=(2,)) - t3 = torch.sparse_coo_tensor([[0]], [1], size=(4,)) - - with self.assertRaisesRegex(ValueError, "requires non-empty tensor list"): - opts = c10d.AllreduceOptions() - pg.allreduce([], opts) - - with self.assertRaisesRegex(ValueError, "invalid tensor layout"): - opts = c10d.AllreduceOptions() - pg.allreduce([t1, t2], opts) - - with self.assertRaisesRegex(ValueError, "invalid tensor size"): - opts = c10d.AllreduceOptions() - pg.allreduce([t2, t3], opts) - - # Sparse allreduce only works with c10d.ReduceOp.SUM. - for op in [c10d.ReduceOp.PRODUCT, c10d.ReduceOp.MIN, c10d.ReduceOp.MAX]: - with self.assertRaisesRegex(ValueError, "unsupported reduction operation"): - opts = c10d.AllreduceOptions() - opts.reduceOp = op - pg.allreduce([t3], opts) - - def _test_sparse_allreduce_basics(self, fn): - store = c10d.FileStore(self.file_name, self.world_size) - pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts()) - - for num_inputs_per_rank in [1, 2]: - tests = simple_sparse_reduce_tests( - self.rank, self.world_size, num_inputs=num_inputs_per_rank - ) - for (inputs, outputs) in tests: - tensors = [fn(input) for input in inputs] - work = pg.allreduce(tensors) - work.wait() - self.assertEqual(tensors, outputs) - self.assertEqual(work.result(), outputs) - - def test_sparse_allreduce_basics(self): - self._test_sparse_allreduce_basics(lambda t: t) - - @skip_if_lt_x_gpu(2) - def test_sparse_allreduce_basics_cuda(self): - self._test_sparse_allreduce_basics(lambda t: t.clone().cuda()) - - def test_scatter_checks(self): - store = c10d.FileStore(self.file_name, self.world_size) - pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts()) - - t1 = torch.zeros([1], dtype=torch.float32) - t2 = torch.zeros([1], dtype=torch.float64) - t3 = torch.zeros([2], dtype=torch.float32) - - with self.assertRaisesRegex(ValueError, "invalid root rank"): - opts = c10d.ScatterOptions() - opts.rootRank = -1 - pg.scatter([t1], [], opts) - - with self.assertRaisesRegex(ValueError, "invalid root rank"): - opts = c10d.ScatterOptions() - opts.rootRank = self.world_size - pg.scatter([t1], [], opts) - - with self.assertRaisesRegex( - ValueError, "requires a single-element output tensor list" - ): - opts = c10d.ScatterOptions() - opts.rootRank = 0 - pg.scatter([], [], opts) - - with 
self.assertRaisesRegex( - ValueError, "requires a single-element output tensor list" - ): - opts = c10d.ScatterOptions() - opts.rootRank = 0 - pg.scatter([t1, t1], [], opts) - - with self.assertRaisesRegex(ValueError, "requires a single-element input list"): - opts = c10d.ScatterOptions() - opts.rootRank = self.rank - pg.scatter([t1], [], opts) - - with self.assertRaisesRegex(ValueError, "requires a single-element input list"): - opts = c10d.ScatterOptions() - opts.rootRank = self.rank - pg.scatter([t1], [[t1] * self.world_size, [t1] * self.world_size], opts) - - desired_list_size = self.world_size - incorrect_list_size = self.world_size - 1 - err_str = "Incorrect input list size {}. Input list size should be {}" - with self.assertRaisesRegex( - ValueError, err_str.format(incorrect_list_size, desired_list_size) - ): - opts = c10d.ScatterOptions() - opts.rootRank = self.rank - pg.scatter([t1], [[t1] * incorrect_list_size], opts) - - incorrect_list_size = self.world_size + 1 - with self.assertRaisesRegex( - ValueError, err_str.format(incorrect_list_size, desired_list_size) - ): - opts = c10d.ScatterOptions() - opts.rootRank = self.rank - pg.scatter([t1], [[t1] * incorrect_list_size], opts) - - with self.assertRaisesRegex(ValueError, "invalid tensor type"): - opts = c10d.ScatterOptions() - opts.rootRank = self.rank - pg.scatter([t1], [[t2] * self.world_size], opts) - - with self.assertRaisesRegex(ValueError, "invalid tensor size"): - opts = c10d.ScatterOptions() - opts.rootRank = self.rank - pg.scatter([t1], [[t3] * self.world_size], opts) - - with self.assertRaisesRegex(ValueError, "requires empty input on non-root"): - opts = c10d.ScatterOptions() - opts.rootRank = (self.rank + 1) % self.world_size - pg.scatter([t1], [[t1] * self.world_size], opts) - - def _test_scatter_basics(self, fn): - store = c10d.FileStore(self.file_name, self.world_size) - pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts()) - - # Preallocate tensors for input/output - input = [fn(torch.tensor([self.rank])) for _ in range(self.world_size)] - outputs = [fn(torch.tensor([-1])) for _ in range(self.world_size)] - - # Take turns being the scatter root and accumulate work items - work = [] - for i in range(self.world_size): - opts = c10d.ScatterOptions() - opts.rootRank = i - if i == self.rank: - work.append(pg.scatter([outputs[i]], [input], opts)) - else: - work.append(pg.scatter([outputs[i]], [], opts)) - - # Wait for work to complete - for i in range(self.world_size): - work[i].wait() - self.assertEqual(torch.tensor([i]), outputs[i]) - - def test_scatter_basics(self): - self._test_scatter_basics(lambda t: t.clone()) - - @skip_if_lt_x_gpu(2) - def test_scatter_basics_cuda(self): - self._test_scatter_basics(lambda t: t.clone().cuda()) - - def _test_scatter_stress(self, inputs, fn): - store = c10d.FileStore(self.file_name, self.world_size) - pg = c10d.ProcessGroupGloo( - store, self.rank, self.world_size, self.opts(threads=8) - ) - outputs = [ - [fn(torch.tensor([-1])) for _ in range(self.world_size)] - for _ in range(len(inputs)) - ] - work_handles = [] - for i in range(len(inputs)): - for root in range(self.world_size): - opts = c10d.ScatterOptions() - opts.rootRank = root - if root == self.rank: - work = pg.scatter( - [outputs[i][root]], [[fn(e) for e in inputs[i]]], opts - ) - else: - work = pg.scatter([outputs[i][root]], [], opts) - work_handles.append(work) - - for i, work_handle in enumerate(work_handles): - work_handle.wait() - iter = i // self.world_size - root = i % self.world_size - - 
self.assertEqual( - torch.tensor([iter + root]), - outputs[iter][root], - msg=("Mismatch in iteration %d for rank %d" % (iter, root)), - ) - - def test_scatter_stress(self): - inputs = [ - [torch.tensor([i + self.rank]) for _ in range(self.world_size)] - for i in range(1000) - ] - self._test_scatter_stress(inputs, lambda t: t.clone()) - - @unittest.skip("Test is flaky, see https://github.com/pytorch/pytorch/issues/15963") - @skip_if_lt_x_gpu(2) - def test_scatter_stress_cuda(self): - inputs = [ - [torch.tensor([i + self.rank]) for _ in range(self.world_size)] - for i in range(1000) - ] - self._test_scatter_stress(inputs, lambda t: t.clone().cuda()) - - def test_gather_checks(self): - store = c10d.FileStore(self.file_name, self.world_size) - pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts()) - - t1 = torch.zeros([1], dtype=torch.float32) - t2 = torch.zeros([1], dtype=torch.float64) - t3 = torch.zeros([2], dtype=torch.float32) - - with self.assertRaisesRegex(ValueError, "invalid root rank"): - opts = c10d.GatherOptions() - opts.rootRank = -1 - pg.gather([], [t1], opts) - - with self.assertRaisesRegex(ValueError, "invalid root rank"): - opts = c10d.GatherOptions() - opts.rootRank = self.world_size - pg.gather([], [t1], opts) - - with self.assertRaisesRegex( - ValueError, "requires a single-element input tensor list" - ): - opts = c10d.GatherOptions() - opts.rootRank = 0 - pg.gather([], [], opts) - - with self.assertRaisesRegex( - ValueError, "requires a single-element input tensor list" - ): - opts = c10d.GatherOptions() - opts.rootRank = 0 - pg.gather([], [t1, t1], opts) - - with self.assertRaisesRegex( - ValueError, "requires a single-element output list" - ): - opts = c10d.GatherOptions() - opts.rootRank = self.rank - pg.gather([], [t1], opts) - - with self.assertRaisesRegex( - ValueError, "requires a single-element output list" - ): - opts = c10d.GatherOptions() - opts.rootRank = self.rank - pg.gather([[t1] * self.world_size, [t1] * self.world_size], [t1], opts) - - desired_list_size = self.world_size - incorrect_list_size = self.world_size - 1 - err_str = "Incorrect output list size {}. 
Output list size should be {}" - with self.assertRaisesRegex( - ValueError, err_str.format(incorrect_list_size, desired_list_size) - ): - opts = c10d.GatherOptions() - opts.rootRank = self.rank - pg.gather([[t1] * incorrect_list_size], [t1], opts) - - incorrect_list_size = self.world_size + 1 - with self.assertRaisesRegex( - ValueError, err_str.format(incorrect_list_size, desired_list_size) - ): - opts = c10d.GatherOptions() - opts.rootRank = self.rank - pg.gather([[t1] * incorrect_list_size], [t1], opts) - - with self.assertRaisesRegex(ValueError, "invalid tensor type"): - opts = c10d.GatherOptions() - opts.rootRank = self.rank - pg.gather([[t2] * self.world_size], [t1], opts) - - with self.assertRaisesRegex(ValueError, "invalid tensor size"): - opts = c10d.GatherOptions() - opts.rootRank = self.rank - pg.gather([[t3] * self.world_size], [t1], opts) - - with self.assertRaisesRegex(ValueError, "requires empty output on non-root"): - opts = c10d.GatherOptions() - opts.rootRank = (self.rank + 1) % self.world_size - pg.gather([[t1] * self.world_size], [t1], opts) - - def _test_gather_basics(self, fn): - store = c10d.FileStore(self.file_name, self.world_size) - pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts()) - - # Preallocate tensors for input/output - input = [fn(torch.tensor([self.rank]))] - outputs = [fn(torch.tensor([-1])) for _ in range(self.world_size)] - - # Take turns being the gather root and accumulate work items - work = [] - for i in range(self.world_size): - opts = c10d.GatherOptions() - opts.rootRank = i - if i == self.rank: - work.append(pg.gather([outputs], input, opts)) - else: - work.append(pg.gather([], input, opts)) - - # Wait for work to complete - expected = [torch.tensor([rank]) for rank in range(self.world_size)] - for i in range(self.world_size): - work[i].wait() - if i == self.rank: - self.assertEqual(expected, outputs) - - def test_gather_basics(self): - self._test_gather_basics(lambda t: t.clone()) - - @skip_if_lt_x_gpu(2) - def test_gather_basics_cuda(self): - self._test_gather_basics(lambda t: t.clone().cuda()) - - def _test_gather_stress(self, inputs, fn): - store = c10d.FileStore(self.file_name, self.world_size) - pg = c10d.ProcessGroupGloo( - store, self.rank, self.world_size, self.opts(threads=8) - ) - work_handles = [] - outputs = [ - [[fn(torch.tensor([-1])) for _ in range(self.world_size)]] - for _ in range(len(inputs)) - ] - expected_outputs = [ - [[torch.tensor([i + j]) for j in range(self.world_size)]] - for i in range(len(inputs)) - ] - for i in range(len(inputs)): - for root in range(self.world_size): - opts = c10d.GatherOptions() - opts.rootRank = root - if root == self.rank: - work = pg.gather(outputs[i], [fn(inputs[i])], opts) - else: - work = pg.gather([], [fn(inputs[i])], opts) - work_handles.append(work) - - for i, work_handle in enumerate(work_handles): - work_handle.wait() - iter = i // self.world_size - root = i % self.world_size - if root == self.rank: - self.assertEqual( - expected_outputs[iter], - outputs[iter], - msg=("Mismatch in iteration %d for root %d" % (iter, root)), - ) - - def test_gather_stress(self): - inputs = [torch.tensor([i + self.rank]) for i in range(1000)] - self._test_gather_stress(inputs, lambda t: t.clone()) - - @skip_if_lt_x_gpu(2) - def test_gather_stress_cuda(self): - inputs = [torch.tensor([i + self.rank]).cuda() for i in range(1000)] - self._test_gather_stress(inputs, lambda t: t.clone().cuda()) - - def test_allgather_checks(self): - store = c10d.FileStore(self.file_name, self.world_size) 
- pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts()) - - t1 = torch.zeros([1], dtype=torch.float32) - t2 = torch.zeros([1], dtype=torch.float64) - t3 = torch.zeros([2], dtype=torch.float32) - - with self.assertRaisesRegex(ValueError, "requires non-empty input tensor list"): - pg.allgather([], []) - - with self.assertRaisesRegex( - ValueError, "requires input/output tensor lists to have the same length" - ): - pg.allgather([], [t1]) - - with self.assertRaisesRegex( - ValueError, "requires input/output tensor lists to have the same length" - ): - pg.allgather([[t1] * self.world_size, [t1] * self.world_size], [t1]) - - with self.assertRaisesRegex(ValueError, "invalid output tensor list"): - pg.allgather([[t1] * (self.world_size - 1)], [t1]) - - with self.assertRaisesRegex(ValueError, "invalid output tensor list"): - pg.allgather([[t1] * (self.world_size + 1)], [t1]) - - with self.assertRaisesRegex(ValueError, "invalid tensor type"): - pg.allgather( - [[t1, t1] * (self.world_size), [t1, t1] * (self.world_size)], [t1, t2] - ) - - with self.assertRaisesRegex(ValueError, "invalid tensor size"): - pg.allgather( - [[t1, t1] * (self.world_size), [t1, t1] * (self.world_size)], [t1, t3] - ) - - with self.assertRaisesRegex(ValueError, "invalid tensor type"): - pg.allgather([([t1, t2] * (self.world_size))[: self.world_size]], [t1]) - - with self.assertRaisesRegex(ValueError, "invalid tensor size"): - pg.allgather([([t1, t3] * (self.world_size))[: self.world_size]], [t1]) - - def _test_allgather_basics(self, fn): - store = c10d.FileStore(self.file_name, self.world_size) - pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts()) - - # Run with N input tensor per rank - for n in [1, 2, 3]: - input = [fn(torch.tensor([n * self.rank + i])) for i in range(n)] - output = [ - [fn(torch.tensor([-1])) for _ in range(n * self.world_size)] - for _ in range(n) - ] - expected_output = [ - [torch.tensor([i]) for i in range(n * self.world_size)] - for _ in range(n) - ] - work = pg.allgather(output, input) - work.wait() - self.assertEqual(expected_output, output) - - def test_allgather_basics(self): - self._test_allgather_basics(lambda t: t.clone()) - - @skip_if_lt_x_gpu(2) - def test_allgather_basics_cuda(self): - self._test_allgather_basics(lambda t: t.clone().cuda()) - - def _test_allgather_stress(self, inputs, fn): - store = c10d.FileStore(self.file_name, self.world_size) - pg = c10d.ProcessGroupGloo( - store, self.rank, self.world_size, self.opts(threads=8) - ) - work_handles = [] - outputs = [ - [[fn(torch.tensor([-1])) for _ in range(self.world_size)]] - for _ in range(len(inputs)) - ] - expected_outputs = [ - [[torch.tensor([i + j]) for j in range(self.world_size)]] - for i in range(len(inputs)) - ] - for i in range(len(inputs)): - work = pg.allgather(outputs[i], [fn(inputs[i])]) - work_handles.append(work) - - for i, work_handle in enumerate(work_handles): - work_handle.wait() - self.assertEqual( - expected_outputs[i], - outputs[i], - msg=("Mismatch in iteration %d" % i), - ) - - def test_allgather_stress(self): - inputs = [torch.tensor([i + self.rank]) for i in range(1000)] - self._test_allgather_stress(inputs, lambda t: t.clone()) - - @skip_if_lt_x_gpu(2) - def test_allgather_stress_cuda(self): - inputs = [torch.tensor([i + self.rank]).cuda() for i in range(1000)] - self._test_allgather_stress(inputs, lambda t: t.clone().cuda()) - - def test_allgather_coalesced_checks(self): - store = c10d.FileStore(self.file_name, self.world_size) - pg = c10d.ProcessGroupGloo(store, 
self.rank, self.world_size, self.opts()) - dummy_input = [torch.zeros([1], dtype=torch.float32)] - dummy_output_lists = [ - [torch.zeros([1], dtype=torch.float32)] for _ in range(self.world_size) - ] - - # One of output tensors does not match input list. - dummy_output_lists[0] = [torch.zeros([0], dtype=torch.float32)] - with self.assertRaisesRegex( - ValueError, "invalid size of output tensor at index 0" - ): - c10d.all_gather_coalesced(dummy_output_lists, dummy_input, pg) - - # One of output tensors does not match input list. - dummy_output_lists[0] = [torch.zeros([1], dtype=torch.float64)] - with self.assertRaisesRegex(ValueError, "invalid tensor type at index 0"): - c10d.all_gather_coalesced(dummy_output_lists, dummy_input, pg) - - # Output lists have too many elements - dummy_output_lists = [ - [torch.zeros([1], dtype=torch.float32)] for _ in range(self.world_size + 1) - ] - with self.assertRaisesRegex( - ValueError, "output lists should be equal to world size" - ): - c10d.all_gather_coalesced(dummy_output_lists, dummy_input, pg) - - # Output is not a list of lists. - dummy_output_lists = [torch.zeros([0], dtype=torch.float32)] - with self.assertRaisesRegex( - RuntimeError, "Invalid function argument.*output_tensor_lists" - ): - c10d.all_gather_coalesced(dummy_output_lists, dummy_input, pg) - - def test_reduce_checks(self): - store = c10d.FileStore(self.file_name, self.world_size) - pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts()) - - t1 = torch.zeros([1], dtype=torch.float32) - - with self.assertRaisesRegex(ValueError, "invalid root rank"): - opts = c10d.ReduceOptions() - opts.rootRank = -1 - opts.rootTensor = 0 - pg.reduce([t1], opts) - - with self.assertRaisesRegex(ValueError, "invalid root rank"): - opts = c10d.ReduceOptions() - opts.rootRank = self.world_size - opts.rootTensor = 0 - pg.reduce([t1], opts) - - with self.assertRaisesRegex(ValueError, "invalid root tensor"): - opts = c10d.ReduceOptions() - opts.rootRank = self.rank - opts.rootTensor = 1 - pg.reduce([t1], opts) - - with self.assertRaisesRegex( - ValueError, "requires a single-element tensor list" - ): - opts = c10d.ReduceOptions() - opts.rootRank = self.rank - opts.rootTensor = 0 - pg.reduce([t1, t1], opts) - - def _test_reduce_basics(self, fn): - store = c10d.FileStore(self.file_name, self.world_size) - pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts()) - for (op, input, output) in simple_reduce_tests(self.rank, self.world_size): - for root in range(self.world_size): - opts = c10d.ReduceOptions() - opts.reduceOp = op - opts.rootRank = root - tmp = fn(input) - work = pg.reduce([tmp], opts) - work.wait() - if root == self.rank: - # TODO(#38095): Replace assertEqualIgnoreType. 
See issue #38095 - self.assertEqualIgnoreType(output, tmp) - - def test_reduce_basics(self): - self._test_reduce_basics(lambda t: t.clone()) - - @skip_if_lt_x_gpu(2) - def test_reduce_basics_cuda(self): - self._test_reduce_basics(lambda t: t.clone().cuda()) - - def _test_reduce_stress(self, inputs): - store = c10d.FileStore(self.file_name, self.world_size) - pg = c10d.ProcessGroupGloo( - store, self.rank, self.world_size, self.opts(threads=8) - ) - work_handles = [] - outputs = [] - for i in range(len(inputs)): - for root in range(self.world_size): - opts = c10d.ReduceOptions() - opts.rootRank = root - tmp = inputs[i].clone() - outputs.append(tmp) - work = pg.reduce([tmp], opts) - work_handles.append(work) - - for i, work_handle in enumerate(work_handles): - work_handle.wait() - iter = i // self.world_size - root = i % self.world_size - if root == self.rank: - # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095 - self.assertEqualIgnoreType( - torch.tensor( - [ - (iter * self.world_size) - + (self.world_size * (self.world_size - 1) / 2) - ] - ), - outputs[i], - msg=("Mismatch in iteration %d with root rank %d" % (iter, root)), - ) - - def test_reduce_stress(self): - inputs = [torch.tensor([i + self.rank]) for i in range(1000)] - self._test_reduce_stress(inputs) - - @skip_if_lt_x_gpu(2) - def test_reduce_stress_cuda(self): - inputs = [torch.tensor([i + self.rank]).cuda() for i in range(1000)] - self._test_reduce_stress(inputs) - - def test_send_recv_all_to_all(self): - store = c10d.FileStore(self.file_name, self.world_size) - pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts()) - - # Preallocate tensors for input/output - inputs = [torch.tensor([self.rank]) for _ in range(self.world_size)] - outputs = [torch.tensor([-1]) for _ in range(self.world_size)] - - # Issue sends - send_work = [] - for i in range(self.world_size): - if i == self.rank: - continue - send_work.append(pg.send([inputs[i]], i, 0)) - - # Issue recvs - recv_work = [] - for i in range(self.world_size): - if i == self.rank: - continue - recv_work.append(pg.recv([outputs[i]], i, 0)) - - # Wait for sends to complete - for work in send_work: - work.wait() - self.assertTrue(work.is_completed()) - - # Wait for recvs to complete - for work in recv_work: - work.wait() - self.assertTrue(work.is_completed()) - - # Test that every output other than our own contains the respective rank - for i in range(self.world_size): - if i == self.rank: - continue - self.assertEqual(torch.tensor([i]), outputs[i]) - - def test_barrier_implies_wait(self): - store = c10d.FileStore(self.file_name, self.world_size) - pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts()) - - # Kick off allreduce operations - size = (100, 100) - num = 16 - tensors = [torch.full(size, float(i)) for i in range(num)] - for tensor in tensors: - # Note: leak the returned work handle - pg.allreduce(tensor) - - # Barrier should ensure all previous work has completed - pg.barrier().wait() - - for i, tensor in enumerate(tensors): - self.assertEqual(torch.full(size, float(i * self.world_size)), tensor) - - @skip_if_win32() - def test_round_robin(self): - num_process_groups = 2 - store = c10d.FileStore(self.file_name, self.world_size) - pg = c10d._round_robin_process_groups( - [ - c10d.ProcessGroupGloo( - c10d.PrefixStore(str(i), store), self.rank, self.world_size, self.opts() - ) - for i in range(num_process_groups) - ] - ) - - # Run a few collectives so that we have called each process group - for _ in range(num_process_groups + 
1): - tensor = torch.full([100, 100], float(self.rank)) - pg.broadcast(tensor, root=0).wait() - self.assertEqual(torch.full([100, 100], 0.0), tensor) - - @skip_if_win32() - def test_round_robin_create_destroy(self): - store = c10d.FileStore(self.file_name, self.world_size) - - def create(num, prefix): - return c10d._round_robin_process_groups( - [ - c10d.ProcessGroupGloo( - c10d.PrefixStore("%s/%d" % (prefix, i), store), - self.rank, - self.world_size, - self.opts() - ) - for i in range(num) - ] - ) - - # Run create/use/destroy twice - for i in range(2): - num_process_groups = 2 - pg = create(num=num_process_groups, prefix=i) - for _ in range(3): - tensor = torch.ones([10, 10]) - pg.allreduce(tensor).wait() - self.assertEqual(torch.full([10, 10], float(self.world_size)), tensor) - del pg - - -class ProcessGroupNCCLNoGPUTest(TestCase): - MAIN_PROCESS_RANK = 0 - - def setUp(self): - self.rank = self.MAIN_PROCESS_RANK - self.world_size = 1 - self.file = tempfile.NamedTemporaryFile(delete=False) - self.num_gpus = torch.cuda.device_count() - if self.num_gpus > 0: - raise unittest.SkipTest("GPUs are available, skipping test") - - def tearDown(self): - pass - - @requires_nccl() - def test_init_no_gpus(self): - store = c10d.FileStore(self.file.name, self.world_size) - with self.assertRaisesRegex( - RuntimeError, "ProcessGroupNCCL is only supported with GPUs, no GPUs found!" - ): - c10d.ProcessGroupNCCL(store, self.rank, self.world_size) - - -@unittest.skipIf( - TEST_WITH_TSAN, - "TSAN is not fork-safe since we're forking in a multi-threaded environment", -) -class ProcessGroupNCCLTest(TestCase): - MAIN_PROCESS_RANK = 0 - - def setUp(self): - self.rank = self.MAIN_PROCESS_RANK - self.world_size = 1 - self.file = tempfile.NamedTemporaryFile(delete=False) - self.num_gpus = torch.cuda.device_count() - if self.num_gpus < 2: - raise unittest.SkipTest("NCCL test requires 2+ GPUs") - - # NCCL_BLOCKING_WAIT overrides NCCL_ASYNC_ERROR_HANDLING hence tests - # that use NCCL_BLOCKING_WAIT will test it as expected. 
- os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "1" - - def tearDown(self): - pass - - @requires_nccl() - def test_empty_tensors(self): - store = c10d.FileStore(self.file.name, self.world_size) - pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) - - xs = [torch.cuda.FloatTensor([])] - pg.broadcast(xs).wait() - self.assertEqual(0, xs[0].numel()) - - pg.allreduce(xs).wait() - self.assertEqual(0, xs[0].numel()) - - pg.reduce(xs).wait() - self.assertEqual(0, xs[0].numel()) - - ys = [[torch.cuda.FloatTensor([]) for _ in range(self.world_size)]] - pg.allgather(ys, xs).wait() - for y in ys[0]: - self.assertEqual(0, y.numel()) - - ys = [torch.cuda.FloatTensor([])] - xs = [[torch.cuda.FloatTensor([]) for _ in range(self.world_size)]] - pg.reduce_scatter(ys, xs).wait() - self.assertEqual(0, ys[0].numel()) - - @requires_nccl() - def test_broadcast_ops(self): - store = c10d.FileStore(self.file.name, self.world_size) - pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) - - def broadcast(xs, rootRank, rootTensor): - opts = c10d.BroadcastOptions() - opts.rootRank = rootRank - opts.rootTensor = rootTensor - work = pg.broadcast(xs, opts) - work.wait() - - # for every root tensor - for rt in range(self.num_gpus): - tensors = [] - for i in range(self.num_gpus): - tensors.append(torch.tensor([i]).cuda(i)) - - broadcast(tensors, self.rank, rt) - - for i in range(self.num_gpus): - self.assertEqual(tensors[i], tensors[rt]) - - @requires_nccl() - def test_allreduce_ops(self): - store = c10d.FileStore(self.file.name, self.world_size) - pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) - - def allreduce(tensors, op): - opts = c10d.AllreduceOptions() - opts.reduceOp = op - work = pg.allreduce(tensors, opts) - work.wait() - - # Sum - tensors = [] - for i in range(self.num_gpus): - tensors.append(torch.tensor([i + 1]).cuda(i)) - - allreduce(tensors, c10d.ReduceOp.SUM) - - for i in range(self.num_gpus): - # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095 - self.assertEqualIgnoreType( - torch.tensor([float(self.num_gpus * (self.num_gpus + 1) / 2)]), - tensors[i], - ) - - # Product - tensors = [] - for i in range(self.num_gpus): - tensors.append(torch.tensor([i + 1]).cuda(i)) - - allreduce(tensors, c10d.ReduceOp.PRODUCT) - - for i in range(self.num_gpus): - # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095 - self.assertEqualIgnoreType( - torch.tensor([float(math.factorial(self.num_gpus))]), tensors[i] - ) - - # Min - tensors = [] - for i in range(self.num_gpus): - tensors.append(torch.tensor([i + 1]).cuda(i)) - - allreduce(tensors, c10d.ReduceOp.MIN) - - for i in range(self.num_gpus): - # TODO(#38095): Replace assertEqualIgnoreType. 
See issue #38095 - self.assertEqualIgnoreType(torch.tensor([1.0]), tensors[i]) - - # Max - tensors = [] - for i in range(self.num_gpus): - tensors.append(torch.tensor([i + 1]).cuda(i)) - - allreduce(tensors, c10d.ReduceOp.MAX) - - for i in range(self.num_gpus): - self.assertEqual(torch.tensor([self.num_gpus]), tensors[i]) - - for op in (c10d.ReduceOp.BAND, c10d.ReduceOp.BOR, c10d.ReduceOp.BXOR): - with self.assertRaisesRegex( - RuntimeError, "Cannot use " + str(op) + " with NCCL" - ): - allreduce(tensors, op) - - @requires_nccl() - def test_reduce_ops(self): - store = c10d.FileStore(self.file.name, self.world_size) - pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) - - def reduce(xs, rootRank, rootTensor, op=None): - opts = c10d.ReduceOptions() - opts.rootRank = rootRank - opts.rootTensor = rootTensor - if op: - opts.reduceOp = op - work = pg.reduce(xs, opts) - work.wait() - - # for every root tensor - for rt in range(self.num_gpus): - tensors = [] - for i in range(self.num_gpus): - tensors.append(torch.tensor([i + 1]).cuda(i)) - - reduce(tensors, self.rank, rt) - - # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095 - self.assertEqualIgnoreType( - torch.tensor([float(self.num_gpus * (self.num_gpus + 1) / 2)]), - tensors[rt], - ) - - for op in (c10d.ReduceOp.BAND, c10d.ReduceOp.BOR, c10d.ReduceOp.BXOR): - with self.assertRaisesRegex( - RuntimeError, "Cannot use " + str(op) + " with NCCL" - ): - reduce(tensors, self.rank, rt, op) - - @requires_nccl() - def test_allgather_ops(self): - store = c10d.FileStore(self.file.name, self.world_size) - pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) - - def allgather(output_ts, input_ts): - work = pg.allgather(output_ts, input_ts) - work.wait() - - tensors = [] - output_ts = [[] for _ in range(self.num_gpus)] - - for idx, ls in enumerate(output_ts): - for _ in range(self.world_size * self.num_gpus): - ls.append(torch.tensor([0]).cuda(idx)) - - for i in range(self.num_gpus): - tensors.append(torch.tensor([i]).cuda(i)) - - allgather(output_ts, tensors) - - # Verification - for device_ts in output_ts: - for s_idx, t in enumerate(device_ts): - self.assertEqual(torch.tensor([s_idx]), t) - - @requires_nccl() - def test_reduce_scatter_ops(self): - store = c10d.FileStore(self.file.name, self.world_size) - pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) - - def reduce_scatter(outputs, input_lists, op): - opts = c10d.ReduceScatterOptions() - opts.reduceOp = op - work = pg.reduce_scatter(outputs, input_lists, opts) - work.wait() - - virtual_rank = self.rank * self.world_size - virtual_world_size = self.num_gpus * self.world_size - - output = [torch.tensor([0]).cuda(i) for i in range(self.num_gpus)] - - # 0 1 2 - # 0 [0..11] [1..12] - # 1 [3..14] - # 2 - # 3 - - # Sum - tensor_lists = [ - [ - torch.tensor([self.rank * self.num_gpus + i + j]).cuda(i) - for j in range(virtual_world_size) - ] - for i in range(self.num_gpus) - ] - - reduce_scatter(output, tensor_lists, c10d.ReduceOp.SUM) - - for i in range(self.num_gpus): - expected = torch.tensor( - [ - float(self.num_gpus * (self.num_gpus - 1) / 2) - + (virtual_rank + i) * virtual_world_size - ] - ) - # TODO(#38095): Replace assertEqualIgnoreType. 
See issue #38095 - self.assertEqualIgnoreType(expected, output[i]) - - # Min - reduce_scatter(output, tensor_lists, c10d.ReduceOp.MIN) - - for i in range(self.num_gpus): - expected = torch.tensor([self.rank * self.world_size + i]) - self.assertEqual(expected, output[i]) - - # Max - reduce_scatter(output, tensor_lists, c10d.ReduceOp.MAX) - - for i in range(self.num_gpus): - expected = torch.tensor( - [self.rank * self.world_size + i + virtual_world_size - 1] - ) - self.assertEqual(expected, output[i]) - - # Product - tensor_lists = [ - [ - torch.tensor( - [(self.rank * self.num_gpus + i + j) % virtual_world_size + 1] - ).cuda(i) - for j in range(virtual_world_size) - ] - for i in range(self.num_gpus) - ] - - reduce_scatter(output, tensor_lists, c10d.ReduceOp.PRODUCT) - - for i in range(self.num_gpus): - expected = torch.tensor([float(math.factorial(virtual_world_size))]) - # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095 - self.assertEqualIgnoreType(expected, output[i]) - - @requires_nccl() - def test_barrier(self): - store = c10d.FileStore(self.file.name, self.world_size) - pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) - - def allreduce(tensors): - opts = c10d.AllreduceOptions() - work = pg.allreduce(tensors, opts) - return work - - # Making the collective to operate on - # 1, 2, 3, 4, .... self.num_gpus GPUs - tensors_list = [[] for _ in range(2, self.num_gpus + 1)] - for i in range(2, self.num_gpus + 1): - for j in range(i): - tensors_list[i - 2].append(torch.tensor([j + 1]).cuda(j)) - - works = [] - for tensors in tensors_list: - work = allreduce(tensors) - works.append(work) - - # Barrier will ensure that all previous work is completed - pg.barrier().wait() - - for i in range(2, self.num_gpus + 1): - for j in range(i): - # TODO(#38095): Replace assertEqualIgnoreType. 
See issue #38095 - self.assertEqualIgnoreType( - torch.tensor([float(i * (i + 1) / 2)]), tensors_list[i - 2][j] - ) - - -class Net(nn.Module): - def __init__(self): - super(Net, self).__init__() - self.fc1 = nn.Linear(2, 10, bias=False) - self.fc2 = nn.Linear(10, 50, bias=False) - self.fc3 = nn.Linear(50, 4, bias=False) - self.relu = nn.ReLU() - - def forward(self, x): - x = self.relu(self.fc1(x)) - x = self.relu(self.fc2(x)) - x = self.fc3(x) - return F.softmax(x, dim=1) - - -class DoubleGpuNet(nn.Module): - def __init__(self, gpus): - super(DoubleGpuNet, self).__init__() - self.fc1 = nn.Linear(2, 10, bias=False).to(gpus[0]) - self.fc2 = nn.Linear(10, 50, bias=False).to(gpus[1]) - self.fc3 = nn.Linear(50, 4, bias=False).to(gpus[1]) - self.relu = nn.ReLU() - self.no_grad_param = nn.Parameter( - torch.tensor([2, 2]).long(), requires_grad=False - ).to(gpus[0]) - - def forward(self, x): - dev0 = self.fc1.weight.device - dev1 = self.fc2.weight.device - x = self.relu(self.fc1(x.to(dev0))) - x = self.relu(self.fc2(x.to(dev1))) - x = self.fc3(x) - return F.softmax(x, dim=1).to(dev0) - - -class QuadraGpuNet(nn.Module): - def __init__(self, gpus): - super(QuadraGpuNet, self).__init__() - self.fc1 = nn.Linear(2, 10, bias=False).to(gpus[0]) - self.fc2 = nn.Linear(10, 50, bias=False).to(gpus[1]) - self.fc3 = nn.Linear(50, 4, bias=False).to(gpus[2]) - self.fc4 = nn.Linear(4, 4, bias=False).to(gpus[3]) - self.relu = nn.ReLU() - self.no_grad_param = nn.Parameter( - torch.tensor([2, 2]).long(), requires_grad=False - ).to(gpus[0]) - - def forward(self, x): - dev0 = self.fc1.weight.device - dev1 = self.fc2.weight.device - dev2 = self.fc3.weight.device - dev3 = self.fc4.weight.device - x = self.relu(self.fc1(x.to(dev0))) - x = self.relu(self.fc2(x.to(dev1))) - x = self.relu(self.fc3(x.to(dev2))) - x = self.fc4(x.to(dev3)) - return F.softmax(x, dim=1).to(dev0) - - -class ConvNet(nn.Module): - def __init__(self, gpus, layouts, dtypes): - super(ConvNet, self).__init__() - self.dtypes = dtypes - if isinstance(gpus, list): - self.layer_gpus = gpus - else: - gpus = [gpus] * 4 - self.conv0 = torch.nn.Conv2d(8, 16, (2, 2)).to( - device=gpus[0], memory_format=layouts[0], dtype=dtypes[0] - ) - self.conv1 = torch.nn.Conv2d(16, 32, (2, 2)).to( - device=gpus[1], memory_format=layouts[1], dtype=dtypes[1] - ) - self.conv2 = torch.nn.Conv2d(32, 16, (2, 2)).to( - device=gpus[2], memory_format=layouts[2], dtype=dtypes[2] - ) - self.conv3 = torch.nn.Conv2d(16, 8, (2, 2)).to( - device=gpus[3], memory_format=layouts[3], dtype=dtypes[3] - ) - - def forward(self, x): - x = x.to(self.dtypes[0]) - # Could say - # x = self.conv0(x).to(device=self.conv1.weight.device, dtype=self.dtypes[1]) - # etc. But I don't want to appeal to the weights' devices directly, because part of this test's purpose - # is to verify weights are where expected if the model gets replicated. 
- gpus = self.layer_gpus if hasattr(self, "layer_gpus") else [x.device] * 4 - x = self.conv0(x).to(device=gpus[1], dtype=self.dtypes[1]) - x = self.conv1(x).to(device=gpus[2], dtype=self.dtypes[2]) - x = self.conv2(x).to(device=gpus[3], dtype=self.dtypes[3]) - return self.conv3(x) - - -class Task(nn.Module): - def __init__(self): - super().__init__() - self.p = nn.Parameter(torch.ones(2, 2)) - - def forward(self, x): - return self.p + x - - -class ModuleForDdpCommHook(nn.Module): - def __init__(self): - super().__init__() - self.t0 = Task() - - def forward(self, x, rank): - return self.t0(x + rank) - - -class SparseGradientModule(nn.Module): - def __init__(self): - super(SparseGradientModule, self).__init__() - self.embedding = nn.EmbeddingBag(10, 10, sparse=True) - - def forward(self, x): - return F.softmax(self.embedding(x), dim=1) - - -@unittest.skipIf( - TEST_WITH_TSAN, - "TSAN is not fork-safe since we're forking in a multi-threaded environment", -) -class DistributedDataParallelTest(MultiProcessTestCase): - def setUp(self): - super(DistributedDataParallelTest, self).setUp() - # NCCL_BLOCKING_WAIT overrides NCCL_ASYNC_ERROR_HANDLING hence tests - # that use NCCL_BLOCKING_WAIT will test it as expected. - os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "1" - if sys.platform == "win32": - self._spawn_processes() - else: - self._fork_processes() - - def tearDown(self): - # DistributedDataParallel test doesn't seem to call FileStore destructor - # TODO: investigate this test and the test is known to have issues - # Use this hack to remove files for that test - try: - os.remove(self.file_name) - except OSError: - pass - - @property - def world_size(self): - return 2 - - def _prepare_single_device_module( - self, - process_group, - devices, - device_ids, - global_batch_size, - gradient_as_bucket_view=False, - ): - model = Net() - device = devices[0] if devices else torch.device("cuda:%d" % self.rank) - ddp_model = DistributedDataParallel( - copy.deepcopy(model).to(device), - device_ids=device_ids, - process_group=process_group, - bucket_cap_mb=0.001, - gradient_as_bucket_view=gradient_as_bucket_view, - ) - - model.to(device) - - input = torch.randn(global_batch_size, 2).to(device) - target = torch.randn(global_batch_size, 4).to(device) - - return model, ddp_model, input, target - - def _prepare_multi_device_module( - self, - process_group, - devices, - device_ids, - global_batch_size, - gradient_as_bucket_view=False, - ): - self.assertTrue( - len(devices) == 2 or len(devices) == 4, - "unexpected devices for ddp tests {}".format(devices), - ) - if len(devices) == 2: - model = DoubleGpuNet(devices) - elif len(devices) == 4: - model = QuadraGpuNet(devices) - - ddp_model = DistributedDataParallel( - copy.deepcopy(model), - device_ids=device_ids, - process_group=process_group, - bucket_cap_mb=0.001, - gradient_as_bucket_view=gradient_as_bucket_view, - ) - - input = torch.randn(global_batch_size, 2).cuda(devices[0]) - target = torch.randn(global_batch_size, 4) - - return model, ddp_model, input, target - - def _test_ddp_with_process_group( - self, - process_group, - devices, - device_ids, - multi_device=False, - gradient_as_bucket_view=False, - ): - """ - Note: we pass down `device_ids` all the way to DistributedDataParallel - as part of the test. Below you find tests that either use a list of - integers, a list of `torch.Device` instances, or an empty list. - The `devices` argument is used to control placement of the model and - must always be specified as list of `torch.Device` instances. 
- """ - local_batch_size = 1 if devices is None else len(devices) - global_batch_size = self.world_size * local_batch_size - - if multi_device: - model, ddp_model, input, target = self._prepare_multi_device_module( - process_group, - devices, - device_ids, - global_batch_size, - gradient_as_bucket_view, - ) - ddp_logging_data = ddp_model.get_ddp_logging_data() - self.assertTrue(ddp_logging_data.is_multi_device_module) - else: - model, ddp_model, input, target = self._prepare_single_device_module( - process_group, - devices, - device_ids, - global_batch_size, - gradient_as_bucket_view, - ) - ddp_logging_data = ddp_model.get_ddp_logging_data() - self.assertFalse(ddp_logging_data.is_multi_device_module) - - def step_model(model, input, target): - model.train() - output = model(input) - loss = F.mse_loss(output, target.to(output.device)) - loss.backward() - - def update_parameters(model): - for param in model.parameters(): - with torch.no_grad(): - param -= param.grad - param.grad = None - - # check two model parameters over 2 iterations - for iteration in range(2): - # single cpu/gpu training - step_model(model, input, target) - - # DDP training, DDP scatters subsets of input_cpu to nodes/GPUs - step_model( - ddp_model, - input[ - self.rank * local_batch_size : (self.rank + 1) * local_batch_size - ], - target[ - self.rank * local_batch_size : (self.rank + 1) * local_batch_size - ], - ) - - # Update weights and run a second iteration to shake out errors - update_parameters(model) - update_parameters(ddp_model) - self.assertEqual( - len(list(model.parameters())), len(list(ddp_model.parameters())) - ) - for i, j in zip(model.parameters(), ddp_model.parameters()): - self.assertEqual(i, j) - - # Shuffle the input so that DDP input is different - torch.manual_seed(1337 + iteration) - input = input[torch.randperm(global_batch_size)] - - def _test_gloo_backend( - self, devices, device_ids, multi_device=False, gradient_as_bucket_view=False - ): - store = c10d.FileStore(self.file_name, self.world_size) - options = c10d.ProcessGroupGloo._Options() - options._devices = [create_device(interface=LOOPBACK)] - process_group = c10d.ProcessGroupGloo( - store, self.rank, self.world_size, options - ) - self._test_ddp_with_process_group( - process_group, devices, device_ids, multi_device, gradient_as_bucket_view - ) - - @requires_gloo() - def test_gloo_backend_cpu_module(self): - self._test_gloo_backend([torch.device("cpu")], None) - - @requires_gloo() - def test_gloo_backend_cpu_module_grad_is_view(self): - self._test_gloo_backend([torch.device("cpu")], None, gradient_as_bucket_view=True) - - @requires_gloo() - @skip_if_lt_x_gpu(2) - def test_gloo_backend_1gpu_module_device_ids_integer_list(self): - int_devices = gpus_for_rank(self.world_size)[self.rank][:1] - devices = [torch.device("cuda:" + str(i)) for i in int_devices] - self._test_gloo_backend(devices, int_devices) - - @requires_gloo() - @skip_if_lt_x_gpu(2) - def test_gloo_backend_1gpu_module_device_ids_torch_device_list(self): - int_devices = gpus_for_rank(self.world_size)[self.rank][:1] - devices = [torch.device("cuda:" + str(i)) for i in int_devices] - self._test_gloo_backend(devices, devices) - - @requires_gloo() - @skip_if_lt_x_gpu(4) - def test_gloo_backend_2gpu_module(self): - int_devices = gpus_for_rank(self.world_size)[self.rank][:2] - devices = [torch.device("cuda:" + str(i)) for i in int_devices] - self._test_gloo_backend(devices, None, multi_device=True) - - @requires_gloo() - @skip_if_lt_x_gpu(8) - def test_gloo_backend_4gpu_module(self): - 
int_devices = gpus_for_rank(self.world_size)[self.rank][:4] - devices = [torch.device("cuda:" + str(i)) for i in int_devices] - self._test_gloo_backend(devices, None, multi_device=True) - - def _test_nccl_backend( - self, devices, device_ids, multi_device=False, gradient_as_bucket_view=False - ): - store = c10d.FileStore(self.file_name, self.world_size) - process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) - self._test_ddp_with_process_group( - process_group, devices, device_ids, multi_device, gradient_as_bucket_view - ) - - @requires_nccl() - @skip_if_lt_x_gpu(2) - def test_nccl_backend_multi_device_ids_not_allowed(self): - int_devices = list(range(torch.cuda.device_count())) - devices = [torch.device("cuda:" + str(i)) for i in int_devices] - with self.assertRaisesRegex(ValueError, "device_ids can only be None or contain a single element."): - self._test_nccl_backend(devices, int_devices) - - @requires_nccl() - @skip_if_lt_x_gpu(2) - def test_nccl_backend_single_device_module_device_ids_None(self): - self._test_nccl_backend(None, None) - - @requires_nccl() - @skip_if_lt_x_gpu(2) - def test_nccl_backend_single_device_module_empty_device_ids(self): - # This tests the backward compatibility of accepting an empty list as `device_ids`, - # although we no longer document this in favor of the default value of `None`, - # which is consistent with multi-device modules and CPU modules. - self._test_nccl_backend(None, []) - - @requires_nccl() - @skip_if_lt_x_gpu(4) - def test_nccl_backend_multi_device_module_device_ids_None(self): - int_devices = gpus_for_rank(self.world_size)[self.rank][:2] - devices = [torch.device("cuda:" + str(i)) for i in int_devices] - self._test_nccl_backend(devices, None, multi_device=True) - - @requires_nccl() - @skip_if_lt_x_gpu(2) - def test_nccl_backend_1gpu_module_device_ids_integer_list(self): - int_devices = gpus_for_rank(self.world_size)[self.rank][:1] - devices = [torch.device("cuda:" + str(i)) for i in int_devices] - self._test_nccl_backend(devices, int_devices) - - @requires_nccl() - @skip_if_lt_x_gpu(2) - def test_nccl_backend_1gpu_module_device_ids_torch_device_list(self): - int_devices = gpus_for_rank(self.world_size)[self.rank][:1] - devices = [torch.device("cuda:" + str(i)) for i in int_devices] - self._test_nccl_backend(devices, devices) - - @requires_nccl() - @skip_if_lt_x_gpu(4) - def test_nccl_backend_2gpu_module(self): - int_devices = gpus_for_rank(self.world_size)[self.rank][:2] - devices = [torch.device("cuda:" + str(i)) for i in int_devices] - self._test_nccl_backend(devices, None, multi_device=True) - - @requires_nccl() - @skip_if_lt_x_gpu(8) - def test_nccl_backend_4gpu_module(self): - int_devices = gpus_for_rank(self.world_size)[self.rank][:4] - devices = [torch.device("cuda:" + str(i)) for i in int_devices] - self._test_nccl_backend(devices, None, multi_device=True) - - @requires_nccl() - @skip_if_lt_x_gpu(4) - def test_ddp_multi_device_module_config(self): - gpus = gpus_for_rank(self.world_size)[self.rank] - - self.assertTrue(len(gpus) >= 2, "expecting at least 2 gpus per process") - - store = c10d.FileStore(self.file_name, self.world_size) - process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) - - gpus = gpus[:2] - model = DoubleGpuNet(gpus) - - with self.assertRaisesRegex( - ValueError, - "DistributedDataParallel device_ids and output_device arguments only work with " - "single-device/multiple-device GPU modules or CPU modules", - ): - ddp_model = DistributedDataParallel( - model, output_device=gpus[1], 
process_group=process_group - ) - - with self.assertRaisesRegex(ValueError, "device_ids can only be None or contain a single element."): - ddp_model = DistributedDataParallel( - model, device_ids=gpus, process_group=process_group - ) - - with self.assertRaisesRegex( - ValueError, "input module must be on the same type of devices" - ): - model.fc1 = model.fc1.cpu() - ddp_model = DistributedDataParallel(model, process_group=process_group) - - model = model.cpu() - with self.assertRaisesRegex(ValueError, "device_ids can only be None or contain a single element."): - ddp_model = DistributedDataParallel( - model, device_ids=gpus, process_group=process_group - ) - - def _test_fp16(self, gradient_as_bucket_view=False): - store = c10d.FileStore(self.file_name, self.world_size) - process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) - - gpus = gpus_for_rank(self.world_size)[self.rank] - model = nn.Linear(1, 1, bias=False).cuda(gpus[0]).half() - nn.init.constant_(model.weight, 1) - ddp_model = DistributedDataParallel( - model, - device_ids=[gpus[0]], - process_group=process_group, - bucket_cap_mb=0.001, - gradient_as_bucket_view=gradient_as_bucket_view, - ) - - # Input 2**15, so that the gradients will overflow with a - # world_size of 2, unless we normalize the gradient by the - # world_size before the reduction - input = torch.tensor([[2 ** 15]]).cuda(gpus[0]).half() - - # Step model - ddp_model.train() - output = ddp_model(input) - loss = output.sum() - loss.backward() - - self.assertFalse(any(torch.isinf(p.grad).any() for p in ddp_model.parameters())) - - @requires_nccl() - @skip_if_lt_x_gpu(2) - def test_fp16(self): - self._test_fp16() - - @requires_nccl() - @skip_if_lt_x_gpu(2) - def test_fp16_grad_is_view(self): - self._test_fp16(gradient_as_bucket_view=True) - - def _test_arbitrary_forward_return_value(self, gradient_as_bucket_view=False): - """ - Note: this test can be sped up by only running it on a CPU module - once DistributedDataParallel supports them. - """ - store = c10d.FileStore(self.file_name, self.world_size) - process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) - - class ForwardReturnValueModule(nn.Module): - def __init__(self): - super(ForwardReturnValueModule, self).__init__() - self.fc1 = nn.Linear(2, 10, bias=False) - self.fc2 = nn.Linear(10, 4, bias=False) - self.fc3 = nn.Linear(4, 4, bias=False) - self.relu = nn.ReLU() - - def forward(self, x, fn): - x = self.relu(self.fc1(x)) - x = self.relu(self.fc2(x)) - # The first softmax does NOT include fc3 in its autograd graph - # whereas the second softmax DOES. If we pass only the first - # tensor we see in the output to the reducer, it marks the - # gradient for fc3 as ready (because it doesn't show up). If - # downstream uses of this return value choose to differentiate - # against the second output tensor, it would still receive a - # gradient and a callback for this tensor, resulting in a crash. 
- return fn( - F.softmax(x, dim=1), - F.softmax(self.fc3(x), dim=1), - ) - - device_id = gpus_for_rank(self.world_size)[self.rank][0] - model = DistributedDataParallel( - ForwardReturnValueModule().float().to(device_id), - device_ids=[device_id], - process_group=process_group, - gradient_as_bucket_view=gradient_as_bucket_view, - ) - - batch_size = 4 - criterion = nn.CrossEntropyLoss() - input = torch.rand([batch_size, 2], dtype=torch.float) - target = torch.LongTensor([random.randrange(4) for _ in range(batch_size)]).to( - device_id - ) - - # Always run "backward" to ensure the reducer is called by autograd. - # If we don't correctly capture the output tensors from the return value, - # the reducer won't see a hook for the unused parameter, and throw an error. - # The correct capture is what we're testing in this function. - def test(box, unbox): - output = model(input, fn=box) - loss = criterion(unbox(output), target) - loss.backward() - - # Test with identity return value - test( - box=lambda x, y: (x, y), - unbox=lambda obj: obj[1], - ) - - # Test with list return value - test( - box=lambda x, y: ["foo", x, "bar", y], - unbox=lambda obj: obj[3], - ) - - # Test with tuple return value - test( - box=lambda x, y: ("foo", x, "bar", y), - unbox=lambda obj: obj[3], - ) - - # Test with dict return value - test( - box=lambda x, y: {"foo": "bar", "a": x, "b": y}, - unbox=lambda obj: obj["b"], - ) - - # Test with list with dict return value - test( - box=lambda x, y: ["foo", "bar", {"a": x, "b": y}], - unbox=lambda obj: obj[2]["b"], - ) - - # Test with dict with list return value - test( - box=lambda x, y: {"foo": "bar", "list": [0, x, 1, y]}, - unbox=lambda obj: obj["list"][3], - ) - - @requires_nccl() - @skip_if_lt_x_gpu(2) - def test_arbitrary_forward_return_value(self): - self._test_arbitrary_forward_return_value() - - @requires_nccl() - @skip_if_lt_x_gpu(2) - def test_arbitrary_forward_return_value_grad_is_view(self): - self._test_arbitrary_forward_return_value(gradient_as_bucket_view=True) - - @requires_nccl() - @skip_if_lt_x_gpu(2) - def test_ddp_with_lazy_parameters(self): - store = c10d.FileStore(self.file_name, self.world_size) - process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) - with self.assertRaisesRegex( - RuntimeError, "Modules with uninitialized parameters" - ): - DistributedDataParallel( - torch.nn.LazyLinear(10), process_group=process_group - ) - - def _test_find_unused_parameters_kwarg(self, gradient_as_bucket_view=False): - """ - Note: this test can be sped up by only running it on a CPU module - once DistributedDataParallel supports them. - """ - torch.cuda.set_device(self.rank) - dist.init_process_group( - backend="nccl", - world_size=self.world_size, - rank=self.rank, - init_method=f"file://{self.file_name}" - ) - process_group = c10d.distributed_c10d._get_default_group() - - class FindUnusedParametersModule(nn.Module): - def __init__(self): - super(FindUnusedParametersModule, self).__init__() - self.fc1 = nn.Linear(2, 10, bias=False) - self.fc2 = nn.Linear(10, 4, bias=False) - self.fc3 = nn.Linear(4, 4, bias=False) - self.relu = nn.ReLU() - - def forward(self, x): - x = self.relu(self.fc1(x)) - x = self.relu(self.fc2(x)) - # Return the fc3 module so that the caller can invoke it - # outside of the forward function. While this is bad practice, - # we can use it to trigger a reducer error. 
- return (F.softmax(x, dim=1), self.fc3) - - device_id = gpus_for_rank(self.world_size)[self.rank][0] - batch_size = 4 - criterion = nn.CrossEntropyLoss() - input = torch.rand([batch_size, 2], dtype=torch.float) - target = torch.LongTensor([random.randrange(4) for _ in range(batch_size)]).to( - device_id - ) - - ddp_model = None - - def test_find_unused_parameters( - find_unused_parameters, test_default=False, gradient_as_bucket_view=False - ): - if test_default: - model = DistributedDataParallel( - FindUnusedParametersModule().float().to(device_id), - device_ids=[device_id], - process_group=process_group, - gradient_as_bucket_view=gradient_as_bucket_view, - ) - else: - model = DistributedDataParallel( - FindUnusedParametersModule().float().to(device_id), - device_ids=[device_id], - process_group=process_group, - find_unused_parameters=find_unused_parameters, - gradient_as_bucket_view=gradient_as_bucket_view, - ) - nonlocal ddp_model - ddp_model = model - - output, fc3 = model(input) - output = fc3(output) - loss = criterion(output, target) - loss.backward() - - # First test that finding unused params under these conditions is to - # trigger an error when `backward` is called (because fc3 is an unused - # parameter and will therefore be marked ready twice). - try: - test_find_unused_parameters( - True, gradient_as_bucket_view=gradient_as_bucket_view - ) - except Exception as ex: - self.assertTrue( - str(ex).startswith( - "Expected to mark a variable ready only once.", - ) - ) - unused_index = 2 - unused_index_str = f"Parameter at index {unused_index}" - model = ddp_model.module - for module_name, module in model.named_modules(): - if module == model.fc3: - for parameter_name, _ in module.named_parameters( - recurse=False - ): - unused_fqn = f"{module_name}.{parameter_name}" - # Only one such parameter in model.fc3, since bias=False - break - - if dist._get_debug_mode() != dist._DistributedDebugLevel.OFF: - unused_index_str += f" with name {unused_fqn}" - - self.assertTrue(unused_index_str in str(ex)) - else: - self.fail("Expected exception") - - dist.barrier(process_group) - - # Then test that the default behavior can be overridden by setting - # `find_unused_parameters=False`. - try: - test_find_unused_parameters( - False, gradient_as_bucket_view=gradient_as_bucket_view - ) - except Exception as ex: - self.fail("Unexpected exception: %s" % ex) - - # Test find_unused_parameters defaults to False - try: - test_find_unused_parameters( - True, test_default=True, gradient_as_bucket_view=gradient_as_bucket_view - ) - except Exception as ex: - self.fail("Unexpected exception: %s" % ex) - - # TODO: Combine the following tests once https://github.com/pytorch/pytorch/issues/55967 - # is resolved. 
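# A minimal, self-contained sketch (not taken from this patch) of the intended
# find_unused_parameters=True behavior that the helper above stresses: a module
# with a branch whose parameters never join the autograd graph. It assumes a
# single-process gloo group bootstrapped from a temporary file store; module
# names and shapes are illustrative only.
import os
import tempfile

import torch
import torch.distributed as dist
import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel


def main():
    init_file = os.path.join(tempfile.mkdtemp(), "pg_init")
    dist.init_process_group(
        backend="gloo", init_method=f"file://{init_file}", rank=0, world_size=1
    )

    class TwoBranch(nn.Module):
        def __init__(self):
            super().__init__()
            self.used = nn.Linear(4, 4, bias=False)
            self.unused = nn.Linear(4, 4, bias=False)

        def forward(self, x):
            # Only `self.used` contributes to the output, so `self.unused`
            # never produces a gradient in this iteration.
            return self.used(x)

    model = DistributedDataParallel(TwoBranch(), find_unused_parameters=True)
    model(torch.randn(2, 4)).sum().backward()

    # The reducer still completes: the unused parameter is marked ready without
    # its .grad being touched, so it remains None here.
    assert model.module.unused.weight.grad is None

    dist.destroy_process_group()


if __name__ == "__main__":
    main()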
- @requires_nccl() - @skip_if_lt_x_gpu(2) - @with_dist_debug_levels(levels=["DETAIL"]) - def test_find_unused_parameters_kwarg_debug_detail(self): - self._test_find_unused_parameters_kwarg() - - @requires_nccl() - @skip_if_lt_x_gpu(2) - @with_dist_debug_levels(levels=["INFO"]) - def test_find_unused_parameters_kwarg_debug_info(self): - self._test_find_unused_parameters_kwarg() - - @requires_nccl() - @skip_if_lt_x_gpu(2) - @with_dist_debug_levels(levels=["OFF"]) - def test_find_unused_parameters_kwarg_debug_off(self): - self._test_find_unused_parameters_kwarg() - - @requires_nccl() - @skip_if_lt_x_gpu(2) - @with_dist_debug_levels(levels=["DETAIL"]) - def test_find_unused_parameters_kwarg_grad_is_view_debug_detail(self): - self._test_find_unused_parameters_kwarg(gradient_as_bucket_view=True) - - @requires_nccl() - @skip_if_lt_x_gpu(2) - @with_dist_debug_levels(levels=["INFO"]) - def test_find_unused_parameters_kwarg_grad_is_view_debug_info(self): - self._test_find_unused_parameters_kwarg(gradient_as_bucket_view=True) - - @requires_nccl() - @skip_if_lt_x_gpu(2) - @with_dist_debug_levels(levels=["OFF"]) - def test_find_unused_parameters_kwarg_grad_is_view_debug_off(self): - self._test_find_unused_parameters_kwarg(gradient_as_bucket_view=True) - - def _test_global_local_unused_params_grad(self, gradient_as_bucket_view=False): - """ - By simulating a multi-task training, this test is to make sure: - 1) DDP does not touch the grad of globally unused parameters. - 2) DDP does update the grad of locally unused parameters. - """ - - class GlobalLocalUnusedParamModule(nn.Module): - def __init__(self): - super(GlobalLocalUnusedParamModule, self).__init__() - self.t0 = Task() - self.t1 = Task() - self.task_unused = Task() - - def task_parameters(self): - return (self.t0.p, self.t1.p, self.task_unused.p) - - def forward(self, x, rank): - return self.t0(x) if rank == 0 else self.t1(x) - - def run_and_verify_grad(model): - # Run forward - output = model(8, self.rank) - - # The grads of all parameters should be None at this point. - t0_p, t1_p, task_unused_p = model.module.task_parameters() - self.assertIsNone(t0_p.grad) - self.assertIsNone(t1_p.grad) - self.assertIsNone(task_unused_p.grad) - - # Run backward - output.mean().backward() - - # Now locally unused parameter should have grad updated on all ranks. - # However the globally unused parameter should still have None grad. 
- self.assertIsNotNone(t0_p.grad) - self.assertIsNotNone(t1_p.grad) - self.assertIsNone(task_unused_p.grad) - - store = c10d.FileStore(self.file_name, self.world_size) - process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size) - - # Test on CPU - cpu_model = DistributedDataParallel( - GlobalLocalUnusedParamModule().cpu(), - process_group=process_group, - find_unused_parameters=True, - gradient_as_bucket_view=gradient_as_bucket_view, - ) - run_and_verify_grad(cpu_model) - - # Test on GPU - device_id = gpus_for_rank(self.world_size)[self.rank][0] - gpu_model = DistributedDataParallel( - GlobalLocalUnusedParamModule().to(device_id), - device_ids=[device_id], - process_group=process_group, - find_unused_parameters=True, - gradient_as_bucket_view=gradient_as_bucket_view, - ) - run_and_verify_grad(gpu_model) - - @requires_gloo() - @skip_if_lt_x_gpu(2) - def test_global_local_unused_params_grad(self): - self._test_global_local_unused_params_grad() - - @requires_gloo() - @skip_if_lt_x_gpu(2) - def test_global_local_unused_params_grad_with_grad_is_view(self): - self._test_global_local_unused_params_grad(gradient_as_bucket_view=True) - - @requires_gloo() - @skip_if_lt_x_gpu(2) - def test_find_unused_parameters_when_unused_parameters_empty(self): - """ - An empty unused_parameters array does not imply find_unused_parameters = - false. This test makes sure that DDP allreduces unused parameters - accordingly where the forward pass in some process uses all parameters. - This unit test creates a module that uses all parameters in rank = 0, and - has unused parameters in other ranks. - """ - - class FindUnusedParamModule(nn.Module): - def __init__(self): - super(FindUnusedParamModule, self).__init__() - self.t0 = Task() - self.t1 = Task() - - def task_parameters(self): - return (self.t0.p, self.t1.p) - - def forward(self, x, rank): - return self.t1(self.t0(x)) if rank == 0 else self.t1(x) - - def run_and_verify_grad(model): - # Run forward - output = model(8, self.rank) - - # The grads of all parameters should be None at this point. - [self.assertIsNone(t_p.grad) for t_p in model.module.task_parameters()] - - # Run backward - output.mean().backward() - - # Now locally unused parameter should have grad updated on all ranks. - [self.assertIsNotNone(t_p.grad) for t_p in model.module.task_parameters()] - - store = c10d.FileStore(self.file_name, self.world_size) - process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size) - - # Test on CPU - cpu_model = DistributedDataParallel( - FindUnusedParamModule().cpu(), - process_group=process_group, - find_unused_parameters=True, - ) - run_and_verify_grad(cpu_model) - - # Test on GPU - device_id = gpus_for_rank(self.world_size)[self.rank][0] - gpu_model = DistributedDataParallel( - FindUnusedParamModule().to(device_id), - device_ids=[device_id], - process_group=process_group, - find_unused_parameters=True, - ) - run_and_verify_grad(gpu_model) - - def _test_multiple_outputs_multiple_backward(self, gradient_as_bucket_view=False): - """ - Note: this test can be sped up by only running it on a CPU module - once DistributedDataParallel supports them. 
- """ - store = c10d.FileStore(self.file_name, self.world_size) - process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) - - class MultipleOutputModule(nn.Module): - def __init__(self): - super(MultipleOutputModule, self).__init__() - - def define_module(): - return nn.Sequential( - nn.Linear(2, 10, bias=False), - nn.ReLU(), - nn.Linear(10, 4, bias=False), - nn.ReLU(), - ) - - self.module0 = define_module() - self.module1 = define_module() - - def forward(self, x): - return ( - F.softmax(self.module0(x), dim=1), - F.softmax(self.module1(x), dim=1), - ) - - device_id = gpus_for_rank(self.world_size)[self.rank][0] - model = DistributedDataParallel( - MultipleOutputModule().float().to(device_id), - device_ids=[device_id], - process_group=process_group, - gradient_as_bucket_view=gradient_as_bucket_view, - ) - - batch_size = 4 - criterion = nn.CrossEntropyLoss() - input = torch.rand([batch_size, 2], dtype=torch.float) - target = torch.LongTensor([random.randrange(4) for _ in range(batch_size)]).to( - device_id - ) - - # Compute loss and gradients for both outputs - output1, output2 = model(input) - loss1 = criterion(output1, target) - loss1.backward() - loss2 = criterion(output2, target) - loss2.backward() - - @requires_nccl() - @skip_if_lt_x_gpu(2) - def test_multiple_outputs_multiple_backward(self): - self._test_multiple_outputs_multiple_backward() - - @requires_nccl() - @skip_if_lt_x_gpu(2) - def test_multiple_outputs_multiple_backward_grad_is_view(self): - self._test_multiple_outputs_multiple_backward(gradient_as_bucket_view=True) - - @requires_nccl() - @skip_if_lt_x_gpu(2) - def test_no_grad(self): - """ - Note: this test can be sped up by only running it on a CPU module - once DistributedDataParallel supports them. - """ - store = c10d.FileStore(self.file_name, self.world_size) - process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) - - class NoGradModule(nn.Module): - def __init__(self): - super(NoGradModule, self).__init__() - self.fc1 = nn.Linear(2, 10, bias=False) - self.fc2 = nn.Linear(10, 4, bias=False) - self.relu = nn.ReLU() - - def forward(self, x): - x = self.relu(self.fc1(x)) - x = self.relu(self.fc2(x)) - return F.softmax(x, dim=1) - - device_id = gpus_for_rank(self.world_size)[self.rank][0] - model = DistributedDataParallel( - NoGradModule().float().to(device_id), - device_ids=[device_id], - process_group=process_group, - ) - - batch_size = 4 - input = torch.rand([batch_size, 2], dtype=torch.float) - - def check_no_grads(): - for p in model.parameters(): - self.assertTrue(p.requires_grad) - self.assertIsNone(p.grad) - - # After initialization, no parameter has their gradient set. - check_no_grads() - - # Run `forward` function with torch.no_grad() - with torch.no_grad(): - output = model(input) - self.assertTrue(isinstance(output, torch.Tensor)) - - # No parameter should have their gradient set. - check_no_grads() - - def _test_accumulate_gradients_no_sync( - self, num_iters=2, ddp_comm_hook=None, gradient_as_bucket_view=False - ): - """ - This is the recommended way to implement accumulate grads. - If ``ddp_comm_hook`` input was specified, it will also register that hook - to the ``ddp_model``. The hook fed into this function should not change - the resulting gradients. 
- """ - int_devices = gpus_for_rank(self.world_size)[self.rank][:1] - devices = [torch.device("cuda:" + str(i)) for i in int_devices] - store = c10d.FileStore(self.file_name, self.world_size) - process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) - global_batch_size = self.world_size - local_batch_size = len(devices) - - model, ddp_model, input, target = self._prepare_single_device_module( - process_group, devices, devices, global_batch_size, gradient_as_bucket_view - ) - - if ddp_comm_hook is not None: - ddp_model.register_comm_hook(process_group, ddp_comm_hook) - - def step_model(model, input, target): - model.train() - output = model(input) - loss = F.mse_loss(output, target.to(output.device)) - loss.backward() - - # ensure accumulate grads works with no_grad - with torch.no_grad(): - with ddp_model.no_sync(): - ddp_model.train() - ddp_model(input) - - # check two model parameters over num_iters iterations - for iteration in range(num_iters): - # single cpu/gpu training - step_model(model, input, target) - - ddp_input = input[ - self.rank * local_batch_size : (self.rank + 1) * local_batch_size - ] - ddp_target = target[ - self.rank * local_batch_size : (self.rank + 1) * local_batch_size - ] - - if iteration % num_iters == 0: - # accumulate grads locally - with ddp_model.no_sync(): - step_model(ddp_model, ddp_input, ddp_target) - else: - # sync grads - step_model(ddp_model, ddp_input, ddp_target) - - for i, j in zip(model.parameters(), ddp_model.parameters()): - if iteration % num_iters == 0: - self.assertNotEqual(i.grad, j.grad) - else: - self.assertEqual(i.grad, j.grad) - - # Shuffle the input so that DDP input is different - torch.manual_seed(1337 + iteration) - input = input[torch.randperm(global_batch_size)] - - @requires_nccl() - @skip_if_lt_x_gpu(2) - def test_accumulate_gradients_no_sync(self): - """ - Runs _test_accumulate_gradients_no_sync using default inputs - """ - self._test_accumulate_gradients_no_sync() - - @requires_nccl() - @skip_if_lt_x_gpu(2) - def test_accumulate_gradients_no_sync_grad_is_view(self): - """ - Runs _test_accumulate_gradients_no_sync using default inputs - """ - self._test_accumulate_gradients_no_sync(gradient_as_bucket_view=True) - - @requires_nccl() - @skip_if_lt_x_gpu(2) - def test_accumulate_gradients_no_sync_allreduce_hook(self): - """ - Runs multiple iterations on _test_accumulate_gradients_no_sync - using allreduce hook and validates whether future result was properly - passed as gradients in reducer. - """ - - def allreduce_hook( - process_group: object, bucket: dist.GradBucket - ) -> torch._C.Future: - tensors = [bucket.get_tensor() / self.world_size] - return process_group.allreduce(tensors).get_future() - - self._test_accumulate_gradients_no_sync( - num_iters=4, ddp_comm_hook=allreduce_hook - ) - - @requires_nccl() - @skip_if_lt_x_gpu(2) - def test_accumulate_gradients_no_sync_allreduce_with_then_hook(self): - """ - Runs multiple iterations on _test_accumulate_gradients_no_sync using allreduce - hook that also uses then callbacks. In first then callback result is multiplied - by 2, and the second callback divides the result by 2 * world_size. It validates - whether final result was properly passed as gradients in reducer. - """ - - def allreduce_with_then_hook( - process_group: object, bucket: dist.GradBucket - ) -> torch.futures.Future: - fut = process_group.allreduce([bucket.get_tensor()]).get_future() - - def mult(fut): - # Multiply the result by 2. 
- return [2 * t for t in fut.wait()] - - def div(fut): - # Divide the result by 2 * world_size. - return [t / (2 * self.world_size) for t in fut.wait()] - - return fut.then(mult).then(div) - - self._test_accumulate_gradients_no_sync( - num_iters=4, ddp_comm_hook=allreduce_with_then_hook - ) - - def _test_accumulate_gradients_module(self, gradient_as_bucket_view=False): - # This is NOT the recommended way to implement accumulating grads, but - # we would like to make sure DDP does not mess up with the underlying - # module. - int_devices = gpus_for_rank(self.world_size)[self.rank][:1] - devices = [torch.device("cuda:" + str(i)) for i in int_devices] - store = c10d.FileStore(self.file_name, self.world_size) - process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) - global_batch_size = self.world_size - - model, ddp_model, input, target = self._prepare_single_device_module( - process_group, devices, devices, global_batch_size, gradient_as_bucket_view - ) - - def step_model(model, input, target): - model.train() - output = model(input) - loss = F.mse_loss(output, target.to(output.device)) - loss.backward() - - # ensure accumulate grads works with no_grad - with torch.no_grad(): - ddp_model.train() - ddp_model.module(input) - - # Check two model parameters over 4 iterations. - # Use 4 iterations because we alternate between reducing and - # not reducing and want to make sure we switch both ways. - for iteration in range(4): - step_model(model, input, target) - - if iteration % 2 == 0: - # Skip gradients sync without calling prepare_for_backward - step_model( - ddp_model.module, - input[self.rank : (self.rank + 1)], - target[self.rank : (self.rank + 1)], - ) - for i, j in zip(model.parameters(), ddp_model.parameters()): - self.assertNotEqual(i.grad, j.grad) - else: - step_model( - ddp_model, - input[self.rank : (self.rank + 1)], - target[self.rank : (self.rank + 1)], - ) - for i, j in zip(model.parameters(), ddp_model.parameters()): - # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095 - self.assertEqualIgnoreType(i.grad, j.grad) - - # Shuffle the input so that DDP input is different - torch.manual_seed(1337 + iteration) - input = input[torch.randperm(global_batch_size)] - - @requires_nccl() - @skip_if_lt_x_gpu(2) - def test_accumulate_gradients_module(self): - self._test_accumulate_gradients_module() - - @requires_nccl() - @skip_if_lt_x_gpu(2) - def test_accumulate_gradients_module_with_grad_is_view(self): - self._test_accumulate_gradients_module(gradient_as_bucket_view=True) - - @requires_gloo() - def test_ignored_output(self): - """ - Test that the output of a model can be ignored and that there is no - implicit requirement that `backward` gets called. - """ - store = c10d.FileStore(self.file_name, self.world_size) - process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size) - - class IgnoredOutput(nn.Module): - def __init__(self): - super(IgnoredOutput, self).__init__() - self.fc1 = nn.Linear(2, 10, bias=False) - self.fc2 = nn.Linear(10, 4, bias=False) - self.relu = nn.ReLU() - - def forward(self, x): - x = self.relu(self.fc1(x)) - x = self.relu(self.fc2(x)) - return F.softmax(x, dim=1) - - model = DistributedDataParallel( - IgnoredOutput().float(), - process_group=process_group, - ) - - batch_size = 4 - criterion = nn.CrossEntropyLoss() - input = torch.rand([batch_size, 2], dtype=torch.float) - target = torch.LongTensor([random.randrange(4) for _ in range(batch_size)]) - - # Run a few iterations where we ignore the output. 
- for _ in range(4): - output = model(input) - del output - - # Run a few iterations where we use the output. - for _ in range(4): - output = model(input) - loss = criterion(output, target) - loss.backward() - - @requires_gloo() - def test_ignored_output_with_unused_parameters(self): - """ - Test that the output of a model can be ignored and that there is no - implicit requirement that `backward` gets called, if not all model - parameters participated in computing the model output. - """ - store = c10d.FileStore(self.file_name, self.world_size) - process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size) - - class IgnoredOutputWithUnusedParameters(nn.Module): - def __init__(self): - super(IgnoredOutputWithUnusedParameters, self).__init__() - self.fc1 = nn.Linear(2, 10, bias=False) - self.fc2 = nn.Linear(10, 4, bias=False) - self.fc3 = nn.Linear(4, 4, bias=False) - self.relu = nn.ReLU() - - def forward(self, x): - x = self.relu(self.fc1(x)) - x = self.relu(self.fc2(x)) - return F.softmax(x, dim=1) - - model = DistributedDataParallel( - IgnoredOutputWithUnusedParameters().float(), - process_group=process_group, - find_unused_parameters=True, - ) - - batch_size = 4 - criterion = nn.CrossEntropyLoss() - input = torch.rand([batch_size, 2], dtype=torch.float) - target = torch.LongTensor([random.randrange(4) for _ in range(batch_size)]) - - # Run a few iterations where we ignore the output. - for _ in range(4): - output = model(input) - del output - - # Run a few iterations where we use the output. - for _ in range(4): - output = model(input) - loss = criterion(output, target) - loss.backward() - - @requires_nccl() - @skip_if_lt_x_gpu(2) - def test_failure_recovery(self): - store = c10d.FileStore(self.file_name, self.world_size) - process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) - - # need to create a separate file for the recovered FileStore, because - # the original one will be deleted when destructing the first FileStore. 
- recovery_filename = self.file_name + "_recovery" - - if self.rank == 0: - # the file will be deleted by the recovered FileStore - open(recovery_filename, "w").close() - - # not necessary to run barrier here, as DDP will synchronize - - class TestModel(nn.Module): - def __init__(self): - super(TestModel, self).__init__() - self.fc1 = nn.Linear(2, 10, bias=False) - self.fc2 = nn.Linear(10, 4, bias=False) - self.relu = nn.ReLU() - - def forward(self, x): - x = self.relu(self.fc1(x)) - x = self.relu(self.fc2(x)) - return F.softmax(x, dim=1) - - device_id = gpus_for_rank(self.world_size)[self.rank][0] - model = TestModel().float().to(device_id) - ddp = DistributedDataParallel( - model, - device_ids=[device_id], - process_group=process_group, - ) - - batch_size = 4 - criterion = nn.CrossEntropyLoss() - input = torch.rand([batch_size, 2], dtype=torch.float) - target = torch.LongTensor([random.randrange(4) for _ in range(batch_size)]).to( - device_id - ) - - for _ in range(6): - output = ddp(input) - loss = criterion(output, target) - loss.backward() - - del ddp - del process_group - del store # this will delete self.file_name - - store = c10d.FileStore(recovery_filename, self.world_size) - process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) - ddp = DistributedDataParallel( - model, - device_ids=[device_id], - process_group=process_group, - ) - - input = torch.rand([batch_size, 2], dtype=torch.float) - target = torch.LongTensor([random.randrange(4) for _ in range(batch_size)]).to( - device_id - ) - for _ in range(6): - output = ddp(input) - loss = criterion(output, target) - loss.backward() - - @requires_nccl() - @skip_if_lt_x_gpu(2) - def test_pass_default_pg(self): - dist.init_process_group( - "nccl", - init_method=f"file://{self.file_name}", - world_size=self.world_size, - rank=self.rank, - ) - - default_pg = c10d.distributed_c10d._get_default_group() - dist.destroy_process_group(default_pg) - self.assertFalse(dist.is_initialized()) - - @requires_nccl() - @skip_if_lt_x_gpu(2) - def test_save_load_checkpoint(self): - dist.init_process_group( - "gloo", - init_method=f"file://{self.file_name}", - world_size=self.world_size, - rank=self.rank, - ) - - class TestModel(nn.Module): - def __init__(self): - super(TestModel, self).__init__() - self.fc1 = nn.Linear(2, 10, bias=False) - self.fc2 = nn.Linear(10, 4, bias=False) - self.relu = nn.ReLU() - - def forward(self, x): - x = self.relu(self.fc1(x)) - x = self.relu(self.fc2(x)) - return F.softmax(x, dim=1) - - def train_loop(model, optimizer, iterations): - for _ in range(iterations): - optimizer.zero_grad() - output = model(input) - loss = criterion(output, target) - loss.backward() - optimizer.step() - - device_id = gpus_for_rank(self.world_size)[self.rank][0] - - model_withload = TestModel().float().to(device_id) - model_withoutload = TestModel().float().to(device_id) - - ddp_withload = DistributedDataParallel( - model_withload, - device_ids=[device_id], - ) - ddp_withoutload = DistributedDataParallel( - model_withoutload, - device_ids=[device_id], - ) - - # ensure that all the three models start with the same set of parameters. 
By default they are randomized on construction - for p in ddp_withload.parameters(): - with torch.no_grad(): - p.zero_() - for p in model_withload.parameters(): - with torch.no_grad(): - p.zero_() - for p in ddp_withoutload.parameters(): - with torch.no_grad(): - p.zero_() - - batch_size = 4 - criterion = nn.CrossEntropyLoss() - - optimizer_withload = torch.optim.SGD(ddp_withload.parameters(), lr=0.001) - optimizer_non_ddp_withload = torch.optim.SGD(model_withload.parameters(), lr=0.001) - optimizer_withoutload = torch.optim.SGD(ddp_withoutload.parameters(), lr=0.001) - - input = torch.rand([batch_size, 2], dtype=torch.float).to(device_id) - target = torch.LongTensor([random.randrange(4) for _ in range(batch_size)]).to( - device_id - ) - - # run the model for 6 iterations, with a checkpoint in the middle - train_loop(ddp_withload, optimizer_withload, 3) - - # zero out parameters of both DDP and non-DDP models and reload them from the DDP state dict - checkpoint_path = tempfile.gettempdir() + "/model.checkpoint" - if self.rank == 0: - torch.save(ddp_withload.state_dict(), checkpoint_path) - - dist.barrier() - map_location = {"cuda:%d" % 0: "cuda:%d" % self.rank} - ddp_state_dict = torch.load(checkpoint_path, map_location=map_location) - - for model in [ddp_withload, model_withload]: - for p in ddp_withload.parameters(): - with torch.no_grad(): - p.zero_() - ddp_withload.load_state_dict(ddp_state_dict) - # the non-DDP model needs to first remove the prefix of "module." from the DDP state dict - torch.nn.modules.utils.consume_prefix_in_state_dict_if_present(ddp_state_dict, "module.") - model_withload.load_state_dict(ddp_state_dict) - - train_loop(ddp_withload, optimizer_withload, 3) - train_loop(model_withload, optimizer_non_ddp_withload, 3) - - # re-run the model with the same inputs for 6 iterations with no checkpoint - train_loop(ddp_withoutload, optimizer_withoutload, 6) - - for p_withload, p_withoutload, p_non_ddp_withload in zip( - ddp_withload.parameters(), ddp_withoutload.parameters(), model_withload.parameters() - ): - self.assertEqual(p_withload, p_withoutload) - self.assertEqual(p_non_ddp_withload, p_withoutload) - - def _run_and_verify_sparse_gradients(self, vanilla_model, ddp_model): - mult = 2 - batch_size = mult * self.world_size - criterion = nn.CrossEntropyLoss() - input = torch.randint(0, 10, [batch_size, 2]) - target = torch.randint(0, 10, [batch_size]) - - # Run with entire batch against single process version - criterion(vanilla_model(input), target).backward() - - # Run with partial batch against multi process version - partial_input = input.split(mult)[self.rank] - partial_target = target.split(mult)[self.rank] - criterion(ddp_model(partial_input), partial_target).backward() - - # Check that the gradients are sparse and identical - vanilla_parameter = next(vanilla_model.parameters()) - ddp_parameter = next(ddp_model.parameters()) - self.assertEqual(vanilla_parameter.grad, ddp_parameter.grad) - - def _test_sparse_gradients(self, gradient_as_bucket_view=False): - store = c10d.FileStore(self.file_name, self.world_size) - process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size) - - # Ensure initialized weights and inputs are identical across processes - torch.manual_seed(1337) - - vanilla_model = SparseGradientModule() - ddp_model = DistributedDataParallel( - copy.deepcopy(vanilla_model), - process_group=process_group, - gradient_as_bucket_view=gradient_as_bucket_view, - ) - - self._run_and_verify_sparse_gradients(vanilla_model, ddp_model) - - 
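# A small standalone sketch, assuming nothing beyond stock PyTorch, of the
# sparse-gradient property that _test_sparse_gradients relies on: an
# nn.EmbeddingBag built with sparse=True yields a sparse gradient touching only
# the looked-up rows, which DDP must reduce differently from dense buckets.
# Shapes and the variable names below are illustrative only.
import torch
import torch.nn as nn

emb = nn.EmbeddingBag(10, 4, sparse=True)
indices = torch.randint(0, 10, (3, 2))  # 3 bags of 2 indices each
emb(indices).sum().backward()

# The gradient is a sparse COO tensor rather than a dense (10, 4) matrix.
print(emb.weight.grad.is_sparse)  # True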
@requires_gloo() - def test_sparse_gradients(self): - self._test_sparse_gradients() - - @requires_gloo() - def test_sparse_gradients_grad_is_view(self): - self._test_sparse_gradients(gradient_as_bucket_view=True) - - def _test_grad_layout(self, replica_devices, layer_devs, local_batch_size): - store = c10d.FileStore(self.file_name, self.world_size) - process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) - - global_batch_size = local_batch_size * self.world_size - - # Carry out some trials with small buckets and some with big buckets. - bucketsizes = (0.000001, 25) - # Tuples of lists. Each list describes per-layer characteristics for one trial. - layer_formats = ( - [torch.contiguous_format] * 4, - [torch.channels_last] * 2 + [torch.contiguous_format] * 2, - [torch.channels_last] * 4, - ) - layer_dtypes = ( - [torch.float] * 4, - [torch.float] * 2 + [torch.half] * 2, - [torch.half] * 4, - ) - - input_dev = layer_devs[0] if isinstance(layer_devs, list) else layer_devs - target_dev = layer_devs[-1] if isinstance(layer_devs, list) else layer_devs - input = torch.randn( - (global_batch_size, 8, 8, 8), device=input_dev, dtype=torch.float - ) - target = torch.randn( - (global_batch_size, 8, 4, 4), device=target_dev, dtype=torch.float - ) - local_batch_start = self.rank * local_batch_size - local_batch_end = (self.rank + 1) * local_batch_size - - # Reducer.cpp sneakily creates one "initial bucket" that ignores the "bucket_cap_mb" - # argument. The following makes sure the initial bucket also complies. - @contextmanager - def first_bucket_size(ddp_bucket_mb): - old_DEFAULT_FIRST_BUCKET_BYTES = dist._DEFAULT_FIRST_BUCKET_BYTES - dist._DEFAULT_FIRST_BUCKET_BYTES = int(ddp_bucket_mb * 1.0e6) - try: - yield - finally: - dist._DEFAULT_FIRST_BUCKET_BYTES = old_DEFAULT_FIRST_BUCKET_BYTES - - with torch.backends.cudnn.flags( - enabled=True, deterministic=True, benchmark=False - ): - for formats, dtypes, bucketsize in product( - layer_formats, layer_dtypes, bucketsizes - ): - with first_bucket_size(bucketsize): - model_msg = ( - "rank = {} formats = {} dtypes = {} bucketsize = {} ".format( - self.rank, formats, dtypes, bucketsize - ) - ) - try: - m = ConvNet(layer_devs, formats, dtypes) - m_ddp = DistributedDataParallel( - copy.deepcopy(m), - device_ids=replica_devices, - process_group=process_group, - bucket_cap_mb=bucketsize, - ) - opt = torch.optim.SGD(m.parameters(), lr=0.1) - opt_ddp = torch.optim.SGD(m_ddp.parameters(), lr=0.1) - has_half = any(p.dtype is torch.half for p in m.parameters()) - tol = 1.0e-3 if has_half else 1.0e-5 - except BaseException: - # Prints case-specific debugging info to narrow down failing case. - print( - "Caught exception during model creation for " + model_msg, - flush=True, - ) - raise - # 3 iters: First iter creates grads, second iter retests after rebucketing, - # third iter tries zeroed grads. 
- for it in range(3): - iter_msg = "iter = {} ".format(it) + model_msg - named_msg = iter_msg - try: - F.mse_loss(m(input).float(), target).backward() - F.mse_loss( - m_ddp(input[local_batch_start:local_batch_end]).float(), - target[local_batch_start:local_batch_end], - ).backward() - for i, ((layer_name, m_child), m_ddp_child) in enumerate( - zip(m.named_children(), m_ddp.module.children()) - ): - named_msg = layer_name + ".weight" + " " + iter_msg - self.assertTrue( - m_child.weight.grad.is_contiguous( - memory_format=formats[i] - ), - named_msg, - ) - self.assertTrue( - m_ddp_child.weight.grad.is_contiguous( - memory_format=formats[i] - ), - named_msg, - ) - for j, ((param_name, p), p_ddp) in enumerate( - zip( - m_child.named_parameters(), - m_ddp_child.parameters(), - ) - ): - named_msg = ( - layer_name + "." + param_name + " " + iter_msg - ) - self.assertEqual( - p.grad, p_ddp.grad, rtol=tol, atol=tol - ) - opt.step() - opt_ddp.step() - if it == 0: - for p, p_ddp in zip(m.parameters(), m_ddp.parameters()): - p.grad = None - p_ddp.grad = None - else: - m.zero_grad() - m_ddp.zero_grad() - except BaseException: - # Makes sure we still get info if an error occurred somewhere other than the asserts. - print( - "Caught exception during iterations at " + named_msg, - flush=True, - ) - raise - - @requires_nccl() - @skip_if_lt_x_gpu(2) - @skip_if_rocm - def test_grad_layout_1devicemodule_1replicaperprocess(self): - dev0 = torch.device("cuda:" + str(gpus_for_rank(self.world_size)[self.rank][0])) - # Tells DDP to use just one device. - replica_devices = [dev0] - # Tells _test_grad_layout to construct ConvNet with all layers on this process's first assigned device. - layer_devs = dev0 - local_batch_size = 8 - self._test_grad_layout(replica_devices, layer_devs, local_batch_size) - - @requires_nccl() - @skip_if_lt_x_gpu(4) - @skip_if_rocm - def test_grad_layout_2devicemodule(self): - int_devices = gpus_for_rank(self.world_size)[self.rank][:2] - dev0 = torch.device("cuda:" + str(int_devices[0])) - dev1 = torch.device("cuda:" + str(int_devices[1])) - # DDP's default behavior for a multi-device module is "don't replicate." - replica_devices = None - # Tells _test_grad_layout to constructs this process's ConvNet on 2 devices, with 2 layers on each device. - layer_devs = [dev0] * 2 + [dev1] * 2 - local_batch_size = 8 - self._test_grad_layout(replica_devices, layer_devs, local_batch_size) - - @requires_nccl() - @skip_if_lt_x_gpu(2) - def test_param_layout_mismatch_error(self): - store = c10d.FileStore(self.file_name, self.world_size) - process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) - - dev0 = torch.device("cuda:" + str(gpus_for_rank(self.world_size)[self.rank][0])) - layer_devs = dev0 - layer_formats = ( - [torch.contiguous_format] * 4 - if self.rank == 0 - else [torch.channels_last] * 4 - ) - layer_dtypes = [torch.float] * 4 - - m = ConvNet(layer_devs, layer_formats, layer_dtypes) - if self.rank == 0: - m_ddp = DistributedDataParallel( - m, device_ids=[dev0], process_group=process_group - ) - else: - with self.assertRaisesRegex( - RuntimeError, - ".* appears not to match strides of the same param in process 0", - ): - m_ddp = DistributedDataParallel( - m, device_ids=[dev0], process_group=process_group - ) - - @requires_gloo() - def test_ddp_comm_hook_future_passing_cpu(self): - """ - This unit test verifies whether the Future object is passed properly. - The callback function creates a Future object and sets a value to it. 
- """ - store = c10d.FileStore(self.file_name, self.world_size) - process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size) - - # Test on CPU - cpu_model = DistributedDataParallel( - ModuleForDdpCommHook().cpu(), process_group=process_group - ) - - # Register DDP Communication Hook - cpu_model.register_comm_hook(None, self._simple_hook) - - # check whether the grads are equal to what then callback returns. - # without the comm_hook, result would be 0.25 * torch.ones(2, 2). - self._run_and_verify_hook(cpu_model, 8, 2 * torch.ones(2, 2)) - - def _gpu_model_with_ddp_comm_hook( - self, process_group, hook=None, gradient_as_bucket_view=False, state=None - ): - device_id = gpus_for_rank(self.world_size)[self.rank][0] - gpu_model = DistributedDataParallel( - ModuleForDdpCommHook().to(device_id), - device_ids=[device_id], - process_group=process_group, - gradient_as_bucket_view=gradient_as_bucket_view, - ) - - # Register a DDP communication hook if any. - if hook is not None: - gpu_model.register_comm_hook(state, hook) - - return gpu_model - - def _gpu_model_with_builtin_ddp_comm_hook( - self, process_group, hook=None, gradient_as_bucket_view=False - ): - device_id = gpus_for_rank(self.world_size)[self.rank][0] - gpu_model = DistributedDataParallel( - ModuleForDdpCommHook().to(device_id), - device_ids=[device_id], - process_group=process_group, - gradient_as_bucket_view=gradient_as_bucket_view, - ) - - # Register a built-in DDP communication hook if defined - if hook is not None: - gpu_model._register_builtin_comm_hook(hook) - - return gpu_model - - def _run_and_verify_hook(self, model, input, expected_grad): - # Run forward - output = model(input, self.rank) - - # Run backward - output.mean().backward() - - [self.assertEqual(p.grad, expected_grad) for p in model.parameters()] - - def _simple_hook( - self, state: object, bucket: dist.GradBucket - ) -> torch.futures.Future: - fut = torch.futures.Future() - fut.set_result([torch.ones_like(bucket.get_tensor())]) - - def fut_then(fut): - # Add ones to fut's result. - return [t + torch.ones_like(t) for t in fut.value()] - - return fut.then(fut_then) - - @requires_gloo() - @skip_if_lt_x_gpu(2) - def test_ddp_comm_hook_future_passing_gpu_gloo(self): - """ - This unit test verifies whether the Future object is passed properly using gloo backend. - The hook callback function creates a Future object and sets a value to it. - """ - store = c10d.FileStore(self.file_name, self.world_size) - process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size) - - # Get GPU model with simple_hook registered. - gpu_model = self._gpu_model_with_ddp_comm_hook(process_group, self._simple_hook) - - # check whether the grads are equal to what simple_hook's then callback returns. - # without the comm_hook, result would be 0.25 * torch.ones(2, 2). - self._run_and_verify_hook(gpu_model, 8, 2 * torch.ones(2, 2)) - - @requires_nccl() - @skip_if_lt_x_gpu(2) - def test_ddp_comm_hook_future_passing_gpu_nccl(self): - """ - This unit test verifies whether the Future object is passed properly using nccl backend. - The hook callback function creates a Future object and sets a value to it. - """ - store = c10d.FileStore(self.file_name, self.world_size) - process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) - - # Get GPU model with simple_hook registered. - gpu_model = self._gpu_model_with_ddp_comm_hook(process_group, self._simple_hook) - - # check whether the grads are equal to what simple_hook's then callback returns. 
- # without the comm_hook, result would be 0.25 * torch.ones(2, 2). - self._run_and_verify_hook(gpu_model, 8, 2 * torch.ones(2, 2)) - - def _test_ddp_comm_hook_allreduce_hook_nccl(self, gradient_as_bucket_view=False): - """ - This unit test verifies whether a DDP communication hook that just calls - allreduce gives the same result with the case of no hook registered. - Without the then callback, the future_value in reducer is no longer - a PyObject, and this unit test verifies future_value is properly checked. - """ - store = c10d.FileStore(self.file_name, self.world_size) - process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) - - def allreduce_hook(state: object, bucket: dist.GradBucket) -> torch._C.Future: - tensors = [bucket.get_tensor() / self.world_size] - return process_group.allreduce(tensors).get_future() - - # Get GPU model with allreduce_hook registered. - gpu_model = self._gpu_model_with_ddp_comm_hook( - process_group, allreduce_hook, gradient_as_bucket_view - ) - - # check whether the grads are equal to what DDP without hook would return. - self._run_and_verify_hook(gpu_model, 8, 0.25 * torch.ones(2, 2)) - - def _test_default_ddp_comm_hooks_nccl(self, gradient_as_bucket_view=False): - """ - This unit test verifies whether default Python DDP communication hooks ALLREDUCE and FP16_COMPRESS - can give the same result with the case of no hook registered. - """ - store = c10d.FileStore(self.file_name, self.world_size) - process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) - - # For these default DDP comm hooks, the only state is process group. - state = process_group - for hook in [default.allreduce_hook, default.fp16_compress_hook]: - # Get GPU model with the hook registered. - # The first arg 'process_group' is used for initializing the test environment, - # so it cannot be replaced by 'state', although they have the same value. - gpu_model = self._gpu_model_with_ddp_comm_hook( - process_group, hook, gradient_as_bucket_view, state - ) - - # check whether the grads are equal to what DDP without hook would return. - self._run_and_verify_hook(gpu_model, 8, 0.25 * torch.ones(2, 2)) - - def _test_fp16_compress_wrapper(self, gradient_as_bucket_view=False): - """ - This unit test verifies whether wrapping the ALLREDUCE and POWER_SGD hooks with - the FP16_WRAPPER can give the same result as when there is no hook registered. - """ - store = c10d.FileStore(self.file_name, self.world_size) - process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) - powerSGD_state = powerSGD.PowerSGDState(process_group=process_group) - - hook_args = [(powerSGD.powerSGD_hook, powerSGD_state), (default.allreduce_hook, process_group)] - - for hook, state in hook_args: - gpu_model = self._gpu_model_with_ddp_comm_hook( - process_group, - default.fp16_compress_wrapper(hook), - gradient_as_bucket_view, - state - ) - - # check whether the grads are equal to what DDP without hook would return. - self._run_and_verify_hook(gpu_model, 8, 0.25 * torch.ones(2, 2)) - - def _test_powerSGD_ddp_comm_hook_nccl(self, gradient_as_bucket_view=False): - """ - This unit test verifies whether Python DDP communication hook POWER_SGD - can give the same result with the case of no hook registered. - """ - store = c10d.FileStore(self.file_name, self.world_size) - process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) - - # Get GPU model with the hook registered. - # Test the hook with different algorithmic configs. 
- for use_error_feedback, warm_start in product([True, False], [True, False]): - state = powerSGD.PowerSGDState( - process_group=process_group, - matrix_approximation_rank=1, - use_error_feedback=use_error_feedback, - warm_start=warm_start, - ) - for hook in [powerSGD.powerSGD_hook, powerSGD.batched_powerSGD_hook]: - gpu_model = self._gpu_model_with_ddp_comm_hook( - process_group, hook, gradient_as_bucket_view, state - ) - - # check whether the grads are equal to what DDP without hook would return. - self._run_and_verify_hook(gpu_model, 8, 0.25 * torch.ones(2, 2)) - - def _test_builtin_ddp_comm_hooks_nccl(self, gradient_as_bucket_view=False): - """ - This unit test verifies whether built-in C++ DDP communication hooks ALLREDUCE and FP16_COMPRESS - can give the same result with the case of no hook registered. - """ - store = c10d.FileStore(self.file_name, self.world_size) - process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) - - for comm_hook_type in [ - dist.BuiltinCommHookType.ALLREDUCE, - dist.BuiltinCommHookType.FP16_COMPRESS, - ]: - # Get GPU model with the built-in communication hook. - gpu_model = self._gpu_model_with_builtin_ddp_comm_hook( - process_group, comm_hook_type, gradient_as_bucket_view - ) - - # check whether the grads are equal to what DDP without hook would return. - self._run_and_verify_hook(gpu_model, 8, 0.25 * torch.ones(2, 2)) - - @requires_nccl() - @skip_if_lt_x_gpu(2) - def test_ddp_comm_hook_allreduce_hook_nccl(self): - self._test_ddp_comm_hook_allreduce_hook_nccl() - - @requires_nccl() - @skip_if_lt_x_gpu(2) - def test_default_ddp_comm_hooks_nccl(self): - self._test_default_ddp_comm_hooks_nccl() - - @requires_nccl() - @skip_if_lt_x_gpu(2) - def test_fp16_compress_wrapper_nccl(self): - self._test_fp16_compress_wrapper() - - @requires_nccl() - @skip_if_lt_x_gpu(2) - def test_builtin_ddp_comm_hooks_nccl(self): - self._test_builtin_ddp_comm_hooks_nccl() - - @requires_nccl() - @skip_if_lt_x_gpu(2) - def test_powerSGD_ddp_comm_hook_nccl(self): - self._test_powerSGD_ddp_comm_hook_nccl() - - @requires_nccl() - @skip_if_lt_x_gpu(2) - def test_ddp_comm_hook_allreduce_hook_nccl_grad_is_view(self): - self._test_ddp_comm_hook_allreduce_hook_nccl(gradient_as_bucket_view=True) - - def test_invalid_powerSGD_state(self): - for start_powerSGD_iter, use_error_feedback, warm_start in product( - [0, 1], [True, False], [True, False] - ): - if not use_error_feedback and not warm_start: - continue - with self.assertRaisesRegex( - ValueError, - "Expect `start_powerSGD_iter` > 1 if `use_error_feedback` or `warm_start` is enabled, " - "because PowerSGD can only be applied after the first two iterations in DDP.", - ): - state = powerSGD.PowerSGDState( - process_group=None, - matrix_approximation_rank=1, - start_powerSGD_iter=start_powerSGD_iter, - use_error_feedback=use_error_feedback, - warm_start=warm_start, - ) - - @requires_nccl() - @skip_if_lt_x_gpu(2) - def test_default_ddp_comm_hooks_nccl_is_view(self): - self._test_default_ddp_comm_hooks_nccl(gradient_as_bucket_view=True) - - @requires_nccl() - @skip_if_lt_x_gpu(2) - def test_fp16_compress_wrapper_is_view(self): - self._test_fp16_compress_wrapper(gradient_as_bucket_view=True) - - @requires_nccl() - @skip_if_lt_x_gpu(2) - def test_builtin_ddp_comm_hooks_nccl_grad_is_view(self): - self._test_builtin_ddp_comm_hooks_nccl(gradient_as_bucket_view=True) - - @requires_nccl() - @skip_if_lt_x_gpu(2) - def test_powerSGD_ddp_comm_hook_nccl_grad_is_view(self): - 
self._test_powerSGD_ddp_comm_hook_nccl(gradient_as_bucket_view=True) - - @requires_nccl() - @skip_if_lt_x_gpu(2) - def test_ddp_comm_hook_allreduce_with_then_hook_nccl(self): - """ - This unit test verifies whether a DDP communication hook that calls allreduce and then - multiplies the result by ten and divides by two gives the expected result. - """ - store = c10d.FileStore(self.file_name, self.world_size) - process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) - - def allreduce_with_then_hook( - state: object, bucket: dist.GradBucket - ) -> torch.futures.Future: - tensors = [bucket.get_tensor() / self.world_size] - fut = process_group.allreduce(tensors).get_future() - - def mult(fut): - # Multiply the result by 10. - return [10 * t for t in fut.value()] - - def div(fut): - # Divide the result by 2. - return [0.5 * t for t in fut.value()] - - return fut.then(mult).then(div) - - # Get GPU model with allreduce_with_then_hook registered. - gpu_model = self._gpu_model_with_ddp_comm_hook( - process_group, allreduce_with_then_hook - ) - - # check whether the grads are equal to what allreduce returns multuplied by 5. - # without the comm_hook, result would be still 0.25 * torch.ones(2, 2). - self._run_and_verify_hook(gpu_model, 8, 1.25 * torch.ones(2, 2)) - - @requires_gloo() - def test_ddp_invalid_comm_hook_init(self): - """ - This unit test makes sure that register_comm_hook properly checks the format - of hook defined by user. The Python hook must be callable. This test also - checks whether bucket annotation checked properly if defined. - """ - store = c10d.FileStore(self.file_name, self.world_size) - process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size) - - model = DistributedDataParallel( - ModuleForDdpCommHook(), process_group=process_group - ) - - with self.assertRaisesRegex(TypeError, "Communication hook must be callable."): - model.register_comm_hook(state=None, hook=1) - - with self.assertRaisesRegex( - ValueError, "bucket annotation should be dist.GradBucket." - ): - - def comm_hook(state: object, bucket: int) -> torch.futures.Future: - return torch.futures.Future() - - model.register_comm_hook(state=None, hook=comm_hook) - - @requires_gloo() - def test_ddp_invalid_comm_hook_return_type(self): - """ - This test checks whether return annotation checked properly if defined. It also - checks whether an internal error is thrown if return type is incorrect and user - hasn't specified any return type annotation. - """ - store = c10d.FileStore(self.file_name, self.world_size) - process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size) - - model = DistributedDataParallel( - ModuleForDdpCommHook(), process_group=process_group - ) - - with self.assertRaisesRegex( - ValueError, - "Communication hook: return annotation should be torch.futures.Future or torch._C.Future.", - ): - - def comm_hook(state: object, bucket: dist.GradBucket) -> int: - return torch.futures.Future() - - model.register_comm_hook(state=None, hook=comm_hook) - - with self.assertRaisesRegex( - RuntimeError, - "callback must return a torch.futures.Future or torch._C.Future object, but got", - ): - - def comm_hook(state: object, bucket: dist.GradBucket): - return 1 - - model.register_comm_hook(state=None, hook=comm_hook) - - # Run forward - output = model(8, self.rank) - - # Run backward - output.mean().backward() - - @requires_gloo() - def test_ddp_comm_hook_register_just_once(self): - """ - DDP communication hook can only be registered once. 
This test validates whether - the error is thrown properly when register_comm_hook is called more than once. - """ - store = c10d.FileStore(self.file_name, self.world_size) - process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size) - - model = DistributedDataParallel( - ModuleForDdpCommHook(), process_group=process_group - ) - - def dummy_hook(state, bucket): - fut = torch.futures.Future() - fut.set_result([bucket.get_tensor()]) - return fut - - model.register_comm_hook(None, dummy_hook) - - with self.assertRaisesRegex( - RuntimeError, - "register_comm_hook or register_builtin_comm_hook can only be called once.", - ): - model.register_comm_hook(None, dummy_hook) - - @requires_gloo() - def test_ddp_comm_hook_sparse_gradients(self): - """ - Runs "test_sparse_gradients" unit test with DDP communication hook. We define a - simple hook that does allreduce and works with gloo backend for this test. - """ - store = c10d.FileStore(self.file_name, self.world_size) - process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size) - - # Ensure initialized weights and inputs are identical across processes - torch.manual_seed(1337) - - vanilla_model = SparseGradientModule() - ddp_model = DistributedDataParallel( - copy.deepcopy(vanilla_model), - process_group=process_group, - ) - - # "get_future" API does not support gloo backend, see GH Issue #42048. - # Instead, we wait for an allreduce work, and write its result to a Future. - def allreduce_hook_gloo( - state: object, bucket: dist.GradBucket - ) -> torch.futures.Future: - # Prepare allreduced grad bucket tensors by running an async work. - work = process_group.allreduce([bucket.get_tensor()]) - work.wait() - - fut = torch.futures.Future() - fut.set_result([bucket.get_tensor() / self.world_size]) - return fut - - ddp_model.register_comm_hook(None, allreduce_hook_gloo) - - self._run_and_verify_sparse_gradients(vanilla_model, ddp_model) - - class AcceptsParam(torch.nn.Module): - def __init__(self, p, factor): - super().__init__() - self.a = p - self.f = factor - - def forward(self, input): - return input + self.a * self.f - - @requires_nccl() - @skip_if_lt_x_gpu(2) - def test_ddp_weight_sharing(self): - store = c10d.FileStore(self.file_name, self.world_size) - process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) - - size = 2048 * 2048 - dev = self.rank - world = self.world_size - - p = torch.nn.Parameter(torch.randn(size, requires_grad=True)) - - for try_set_to_none, use_bucket_view in product((False, True), (False, True)): - m = torch.nn.Sequential(self.AcceptsParam(p, dev + 1), - self.AcceptsParam(p, dev + 1)).cuda(dev) - - m = torch.nn.parallel.DistributedDataParallel(m, - bucket_cap_mb=1, - gradient_as_bucket_view=use_bucket_view, - device_ids=[dev], - process_group=process_group) - - for i in range(3): - m.zero_grad(set_to_none=try_set_to_none) - m(1).sum().backward() - - # Each param value is multiplied by "rank + 1" twice in forward, so the grad - # values produced by a particular rank should be 2. * (rank + 1). - # Summing these over ranks and dividing by world size gives the expected result: - analytic = torch.full_like(p, 2. * (world * (world + 1.) / 2.) / world, device=dev) - for name, p in m.named_parameters(): - self.assertEqual(p.grad, analytic, "mismatch at " + name + ".grad for " + - "set_to_none = {}, use_bucket_view = {}".format(try_set_to_none, - use_bucket_view)) - - # A list of tests for ddp with activation checkpointing - # when gradient_as_bucket_view=True, False. 
- # Most of the tests are referred to - # https://github.com/facebookresearch/fairscale/blob/master/tests/nn/pipe/test_checkpoint_ddp.py - class CheckpointOnceModule(nn.Module): - def __init__(self): - super().__init__() - self.l1 = nn.Linear(2000, 2000) - self.l2 = nn.Linear(2000, 2000) - - def forward(self, inp): - x = self.l1(inp) - x = checkpoint(self.l2, x) - return x - - class CheckpointTwiceModule(CheckpointOnceModule): - def __init__(self): - super().__init__() - - def forward(self, inp): - x = self.l1(inp) - x = checkpoint(self.l2, x) - x = checkpoint(self.l2, x) - return x - - def _test_ddp_checkpointing(self, checkpoint_once, process_group, use_bucket_view, find_unused_parameters=False): - # to reprodce the same training results - torch.cuda.set_device(self.rank) - torch.manual_seed(31415) - if checkpoint_once: - model = self.CheckpointOnceModule().cuda() - else: - model = self.CheckpointTwiceModule().cuda() - model = nn.parallel.DistributedDataParallel(model, - bucket_cap_mb=1, - gradient_as_bucket_view=use_bucket_view, - device_ids=[self.rank], - process_group=process_group, - find_unused_parameters=find_unused_parameters) - input_tensor = torch.rand((64, 2000), device="cuda", requires_grad=True) - output_tensor = model(input_tensor) - output_tensor.sum().backward() - return model - - # DDP works as expect when layer is checkpointed only once - @requires_nccl() - @unittest.skip("TODO: Test is always failing - https://github.com/pytorch/pytorch/issues/55071") - def test_ddp_checkpointing_once(self): - store = c10d.FileStore(self.file_name, self.world_size) - process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) - for use_bucket_view in (True, False): - model = self._test_ddp_checkpointing(checkpoint_once=True, - process_group=process_group, - use_bucket_view=use_bucket_view) - norm = 0.0 - for p in model.parameters(): - self.assertTrue(p.grad is not None) - norm += p.grad.norm().item() - assert numpy.allclose(norm, 78053), norm - - # DDP will fail when there are unused_parameters in the model - @requires_nccl() - @skip_if_lt_x_gpu(2) - def test_ddp_checkpointing_unused_params(self): - store = c10d.FileStore(self.file_name, self.world_size) - process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) - for use_bucket_view in (True, False): - with self.assertRaisesRegex( - RuntimeError, - "Expected to mark a variable ready only once.", - ): - model = self._test_ddp_checkpointing(checkpoint_once=True, - process_group=process_group, - use_bucket_view=use_bucket_view, - find_unused_parameters=True) - - # DDP will fail when the same layer is checkponted twice - @requires_nccl() - @skip_if_lt_x_gpu(2) - def test_ddp_checkpointing_twice(self): - store = c10d.FileStore(self.file_name, self.world_size) - process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) - for use_bucket_view in (True, False): - with self.assertRaisesRegex( - RuntimeError, - "Expected to mark a variable ready only once.", - ): - model = self._test_ddp_checkpointing(checkpoint_once=False, - process_group=process_group, - use_bucket_view=use_bucket_view, - find_unused_parameters=True) - - # DDP works as expected if there is weight sharing among layers - @requires_nccl() - @unittest.skip("TODO: Test is always failing - https://github.com/pytorch/pytorch/issues/55071") - def test_ddp_checkpointing_weight_sharing(self): - store = c10d.FileStore(self.file_name, self.world_size) - process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) - 
torch.cuda.set_device(self.rank) - for use_bucket_view in (True, False): - torch.manual_seed(31415) - l1 = nn.Linear(2000, 2000) - l2 = nn.Linear(2000, 2000) - l1.weight = l2.weight - model = nn.Sequential(l1, l2).cuda() - model = nn.parallel.DistributedDataParallel(model, - bucket_cap_mb=1, - gradient_as_bucket_view=use_bucket_view, - device_ids=[self.rank], - process_group=process_group) - input_tensor = torch.rand((64, 2000), device="cuda", requires_grad=True) - output_tensor = checkpoint(model, input_tensor) - output_tensor.sum().backward() - norm = 0.0 - for p in model.parameters(): - self.assertTrue(p.grad is not None) - norm += p.grad.norm().item() - assert numpy.allclose(norm, 57004), norm - - -class ReducerModule(nn.Module): - def __init__(self): - super(ReducerModule, self).__init__() - self.fc1 = nn.Linear(2, 10, bias=False) - self.fc2 = nn.Linear(10, 4, bias=False) - self.fc3 = nn.Linear(4, 4, bias=False) - self.relu = nn.ReLU() - - def forward(self, x, use_fc3=True): - x = self.relu(self.fc1(x)).float() - x = self.relu(self.fc2(x)).float() - if use_fc3: - x = self.fc3(x).float() - return F.softmax(x, dim=1) - - -@requires_gloo() -class ReducerTest(TestCase): - def setUp(self): - self.file = tempfile.NamedTemporaryFile(delete=False) - self.store = c10d.FileStore(self.file.name, 1) - self.process_group = c10d.ProcessGroupGloo(self.store, 0, 1) - - def test_single_dtype_single_bucket(self): - model = ReducerModule() - parameters = list(model.parameters()) - buckets = [list(range(len(parameters)))] - dist.Reducer([parameters], buckets, self.process_group) - - def _create_mixed_precision_model(self): - model = ReducerModule() - model.float() - model.fc1.double() - return model - - def test_multi_dtype_single_bucket(self): - model = self._create_mixed_precision_model() - - # Raise if there are multiple types per bucket. - # In this case we create one bucket for all parameters. 
- with self.assertRaises(RuntimeError): - parameters = [list(model.parameters())] - buckets = [list(range(len(parameters[0])))] - dist.Reducer(parameters, buckets, self.process_group) - - def test_multi_dtype_multi_bucket(self): - model = self._create_mixed_precision_model() - parameters = [list(model.parameters())] - group_by_dtype = groupby( - range(len(parameters[0])), key=lambda i: parameters[0][i].dtype - ) - buckets = [list(indices) for _, indices in group_by_dtype] - dist.Reducer(parameters, buckets, self.process_group) - - def _create_reducer_for_models(self, models, find_unused_parameters=False): - parameters = [list(model.parameters()) for model in models] - group_by_dtype = groupby( - range(len(parameters[0])), key=lambda i: parameters[0][i].dtype - ) - buckets = [list(indices) for _, indices in group_by_dtype] - return dist.Reducer( - parameters, - buckets, - self.process_group, - find_unused_parameters=find_unused_parameters, - ) - - def test_reducer_no_multi_replicas(self): - num_replicas = 2 - models = [self._create_mixed_precision_model() for _ in range(num_replicas)] - with self.assertRaisesRegex( - RuntimeError, - "Expected exactly one model replica.", - ): - reducer = self._create_reducer_for_models(models) - - def test_forward_backward(self): - batch_size = 10 - model = self._create_mixed_precision_model() - reducer = self._create_reducer_for_models([model]) - reducer.prepare_for_forward() - loss = nn.CrossEntropyLoss() - input = torch.rand([batch_size, 2], dtype=torch.double) - target = torch.LongTensor([random.randrange(4) for _ in range(batch_size)]) - output = loss(model(input), target) - reducer.prepare_for_backward(output) - output.backward() - - def test_forward_backward_unused_parameters(self): - batch_size = 10 - model = self._create_mixed_precision_model() - reducer = self._create_reducer_for_models([model], find_unused_parameters=True) - reducer.prepare_for_forward() - loss = nn.CrossEntropyLoss() - input = torch.rand([batch_size, 2], dtype=torch.double) - target = torch.LongTensor([random.randrange(4) for _ in range(batch_size)]) - output = loss(model(input, use_fc3=False), target) - - # Check that the grad of fc3 is not set. - self.assertEqual(None, model.fc3.weight.grad) - - # Compute and accumulate gradients. - reducer.prepare_for_backward(output) - output.backward() - - # The reducer will have marked the grad of fc3 as ready, because - # it doesn't show up in the autograd graph of `output`. Since fc3.weight - # is considered being globally unused, it will be kept untouched as None. - self.assertEqual(None, model.fc3.weight.grad) - - def test_forward_backward_optimizer(self): - batch_size = 10 - model = self._create_mixed_precision_model() - reducer = self._create_reducer_for_models([model], find_unused_parameters=True) - reducer.prepare_for_forward() - loss = nn.CrossEntropyLoss() - optimizer = torch.optim.Adam(model.parameters()) - for i in range(3): - input = torch.rand([batch_size, 2], dtype=torch.double) - target = torch.LongTensor([random.randrange(4) for _ in range(batch_size)]) - - # The `zero_grad` function calls `detach_` and `zero_` on the grad - # tensors of model parameters. If we tried to set the grad tensors - # to a view of the reducer's bucket tensors, this would blow up. - optimizer.zero_grad() - - # Unused parameter only in the first iteration. 
- output = loss(model(input, use_fc3=(i > 0)), target) - reducer.prepare_for_backward(output) - output.backward() - optimizer.step() - - -class ComputeBucketAssignmentTest(TestCase): - def test_single_limit_single_dtype(self): - tensors = [ - torch.empty([100], dtype=torch.float), - torch.empty([200], dtype=torch.float), - torch.empty([100], dtype=torch.float), - torch.empty([50], dtype=torch.float), - ] - result = dist._compute_bucket_assignment_by_size(tensors, [400]) - self.assertEqual([[0], [1], [2], [3]], result) - - def test_single_limit_multi_dtype(self): - tensors = [ - torch.empty([50], dtype=torch.float), - torch.empty([25], dtype=torch.double), - torch.empty([50], dtype=torch.float), - torch.empty([25], dtype=torch.double), - torch.empty([50], dtype=torch.float), - torch.empty([25], dtype=torch.double), - ] - result = dist._compute_bucket_assignment_by_size(tensors, [400]) - self.assertEqual([[0, 2], [1, 3], [4], [5]], result) - - def test_multi_limit_single_dtype(self): - tensors = [ - torch.empty([10], dtype=torch.float), - torch.empty([10], dtype=torch.float), - torch.empty([10], dtype=torch.float), - torch.empty([10], dtype=torch.float), - ] - result = dist._compute_bucket_assignment_by_size(tensors, [40, 80]) - self.assertEqual([[0], [1, 2], [3]], result) - - def test_multi_limit_multi_dtype(self): - tensors = [ - torch.empty([50], dtype=torch.float), - torch.empty([25], dtype=torch.double), - torch.empty([50], dtype=torch.float), - torch.empty([25], dtype=torch.double), - torch.empty([50], dtype=torch.float), - torch.empty([25], dtype=torch.double), - ] - result = dist._compute_bucket_assignment_by_size(tensors, [200, 400]) - self.assertEqual([[0], [1], [2, 4], [3, 5]], result) - - -@unittest.skipIf( - TEST_WITH_TSAN, - "TSAN is not fork-safe since we're forking in a multi-threaded environment", -) -class NcclErrorHandlingTest(MultiProcessTestCase): - def setUp(self): - super(NcclErrorHandlingTest, self).setUp() - # Need to skip return code checking for these tests since the child - # processes don't exit cleanly. - self.skip_return_code_checks = [ - self.test_nccl_errors_blocking_abort.__wrapped__, - self.test_nccl_errors_blocking_sigkill.__wrapped__, - self.test_nccl_errors_blocking_sigterm.__wrapped__, - self.test_nccl_errors_blocking_nonzero_exit.__wrapped__, - ] - # NCCL_BLOCKING_WAIT overrides NCCL_ASYNC_ERROR_HANDLING hence tests - # that use NCCL_BLOCKING_WAIT will test it as expected. - os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "1" - self._fork_processes() - - def tearDown(self): - super(NcclErrorHandlingTest, self).tearDown() - try: - os.remove(self.file_name) - except OSError: - pass - - @property - def op_timeout_sec(self): - return 1 - - @property - def world_size(self): - return 3 - - @property - def blocking_wait_error_msg(self): - return "Caught collective operation timeout" - - def _run_all_reduce(self, pg): - pg.allreduce(torch.rand(10).cuda(self.rank)) - - @requires_nccl() - @requires_nccl_version(2400, "Need NCCL 2.4+ for error checking") - @skip_if_lt_x_gpu(3) - @skip_if_rocm - def test_nccl_errors_nonblocking(self): - # Note: we unset and restore NCCL_ASYNC_ERROR_HANDLING for this test - # since test_c10d runs with async error handling by default, but this - # tests behavior when it is not enabled. 
- prev_nccl_async_error_handling = os.environ.get("NCCL_ASYNC_ERROR_HANDLING", None) - os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "0" - store = c10d.FileStore(self.file_name, self.world_size) - process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) - process_group.allreduce(torch.rand(10).cuda(self.rank)) - if self.rank == 0: - # This allreduce does not block Python thread as allreduce enqueues - # the cuda operation, and then wait only blocks the current cuda - # stream. - work = process_group.allreduce(torch.rand(10).cuda(self.rank)) - work.wait() - - # Now the work scheduled next should hang forever since the previous - # allreduce will never complete. - t = threading.Thread(target=self._run_all_reduce, args=(process_group,)) - t.daemon = True - t.start() - t.join(int(get_timeout(self.id()) / 5)) - self.assertTrue(t.is_alive()) - - if prev_nccl_async_error_handling is not None: - os.environ["NCCL_ASYNC_ERROR_HANDLING"] = prev_nccl_async_error_handling - - def _test_nccl_errors_blocking(self, func): - store = c10d.FileStore(self.file_name, self.world_size) - process_group = c10d.ProcessGroupNCCL( - store, - self.rank, - self.world_size, - timeout=timedelta(seconds=self.op_timeout_sec), - ) - process_group.allreduce(torch.rand(10).cuda(self.rank)) - if self.rank == 0: - work = process_group.allreduce(torch.rand(10).cuda(self.rank)) - with self.assertRaisesRegex(RuntimeError, self.blocking_wait_error_msg): - # Operation would time out in blocking mode. - work.wait() - # Run some GPU operations to make sure cuda has not gotten stuck. - # It was observed cuda could get stuck if NCCL communicators were - # not properly aborted before throwing RuntimeError. - a = torch.rand(10).cuda(self.rank) - elif self.rank == 1: - # Clean up structures (ex: files for FileStore before going down) - del process_group - func() - else: - # Wait for timeout - time.sleep(2 * self.op_timeout_sec) - - # Now verify communicators on this rank have been aborted by the watchdog thread. 
- self._wait_for_comm_abort(process_group) - - @with_nccl_blocking_wait - @requires_nccl() - @requires_nccl_version(2400, "Need NCCL 2.4+ for error checking") - @skip_if_lt_x_gpu(3) - @skip_if_rocm - def test_nccl_errors_blocking_clean_exit(self): - self._test_nccl_errors_blocking(lambda: sys.exit(0)) - - @with_nccl_blocking_wait - @requires_nccl() - @requires_nccl_version(2400, "Need NCCL 2.4+ for error checking") - @skip_if_lt_x_gpu(3) - @skip_if_rocm - def test_nccl_errors_blocking_nonzero_exit(self): - self._test_nccl_errors_blocking(lambda: sys.exit(1)) - - @with_nccl_blocking_wait - @requires_nccl() - @requires_nccl_version(2400, "Need NCCL 2.4+ for error checking") - @skip_if_lt_x_gpu(3) - @skip_if_rocm - def test_nccl_errors_blocking_abort(self): - self._test_nccl_errors_blocking(lambda: os.abort()) - - @with_nccl_blocking_wait - @requires_nccl() - @requires_nccl_version(2400, "Need NCCL 2.4+ for error checking") - @skip_if_lt_x_gpu(3) - @skip_if_rocm - def test_nccl_errors_blocking_sigkill(self): - self._test_nccl_errors_blocking(lambda: os.kill(os.getpid(), signal.SIGKILL)) - - @with_nccl_blocking_wait - @requires_nccl() - @requires_nccl_version(2400, "Need NCCL 2.4+ for error checking") - @skip_if_lt_x_gpu(3) - @skip_if_rocm - def test_nccl_errors_blocking_sigterm(self): - self._test_nccl_errors_blocking(lambda: os.kill(os.getpid(), signal.SIGTERM)) - - @with_nccl_blocking_wait - @requires_nccl() - @requires_nccl_version(2400, "Need NCCL 2.4+ for error checking") - @skip_if_lt_x_gpu(3) - def test_nccl_blocking_wait_with_barrier(self): - store = c10d.FileStore(self.file_name, self.world_size) - process_group = c10d.ProcessGroupNCCL( - store, - self.rank, - self.world_size, - timeout=timedelta(seconds=self.op_timeout_sec), - ) - process_group.barrier().wait() - if self.rank == 0: - with self.assertRaisesRegex(RuntimeError, self.blocking_wait_error_msg): - # This should timeout - process_group.barrier().wait() - - def _run_invalid_nccl_blocking_wait_env(self, val): - os.environ["NCCL_BLOCKING_WAIT"] = val - store = c10d.FileStore(self.file_name, self.world_size) - with self.assertRaises(RuntimeError): - process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) - - @requires_nccl() - @skip_if_lt_x_gpu(3) - def test_invalid_nccl_blocking_wait_env(self): - self._run_invalid_nccl_blocking_wait_env("abc") - self._run_invalid_nccl_blocking_wait_env("-1") - self._run_invalid_nccl_blocking_wait_env("2147483647") - self._run_invalid_nccl_blocking_wait_env("4294967295") - - def _wait_for_comm_abort(self, process_group): - """ - Waits for the watchdog thread to abort communicators for the process group. - """ - while True: - try: - process_group.allreduce(torch.rand(10).cuda(self.rank)) - except Exception as e: - if "NCCL communicator was aborted" in str(e): - return - else: - raise e - time.sleep(1) - - @with_nccl_blocking_wait - @requires_nccl() - @skip_if_lt_x_gpu(3) - def test_nccl_timeout(self): - store = c10d.FileStore(self.file_name, self.world_size) - - # Initialize process_group. - timeout = 1 - process_group = c10d.ProcessGroupNCCL( - store, self.rank, self.world_size, timeout=timedelta(seconds=timeout) - ) - process_group.allreduce(torch.rand(10).cuda(self.rank)).wait() - - if self.rank == 0: - # This should timeout in about 1 second. - start = time.time() - # Watchdog may abort timed out work resulting in NCCL error instead of operation timed out. 
- with self.assertRaisesRegex(RuntimeError, self.blocking_wait_error_msg): - process_group.allreduce(torch.rand(10).cuda(self.rank)).wait() - else: - # Sleep to ensure timeout. - time.sleep(2 * timeout) - - self._wait_for_comm_abort(process_group) - - -@unittest.skipIf( - TEST_WITH_TSAN, - "TSAN is not fork-safe since we're forking in a multi-threaded environment", -) -class CommTest(MultiProcessTestCase): - def setUp(self): - super(CommTest, self).setUp() - # NCCL_BLOCKING_WAIT overrides NCCL_ASYNC_ERROR_HANDLING hence tests - # that use NCCL_BLOCKING_WAIT will test it as expected. - os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "1" - if sys.platform == "win32": - self._spawn_processes() - else: - self._fork_processes() - - def tearDown(self): - super(CommTest, self).tearDown() - try: - os.remove(self.file_name) - except OSError: - pass - - @property - def op_timeout_sec(self): - return 1 - - @property - def world_size(self): - return 2 - - def _test_broadcast_coalesced(self, process_group, device, root_rank): - half = torch.float16 - - # No support for float16 for CPU tensors - if device == torch.device("cpu"): - half = torch.float32 - - target = torch.arange(60, dtype=half, device=device).chunk(5) - target += torch.arange(60, dtype=torch.float32, device=device).chunk(5) - target += torch.arange(60, dtype=half, device=device).chunk(5) - target += torch.arange(60, dtype=torch.float64, device=device).chunk(5) - target += torch.arange(60, dtype=half, device=device).chunk(5) - target += torch.arange(60, dtype=torch.float32, device=device).chunk(5) - - # The tensors to pass to broadcast are idential to the target - # only on the process that is the root of the broadcast. - if self.rank == root_rank: - tensors = list(tensor.clone() for tensor in target) - else: - tensors = list(torch.zeros_like(tensor) for tensor in target) - - if self.rank != root_rank: - self.assertNotEqual(tensors, target) - - c10d._broadcast_coalesced( - process_group, tensors, buffer_size=256, src=root_rank - ) - - if self.rank != root_rank: - self.assertEqual(tensors, target) - - @requires_nccl() - @skip_if_lt_x_gpu(2) - def test_broadcast_coalesced_nccl(self): - store = c10d.FileStore(self.file_name, self.world_size) - process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) - device = torch.device("cuda:%d" % self.rank) - ranks = [0, 1] - for root_rank in ranks: - self._test_broadcast_coalesced(process_group, device, root_rank) - - @requires_gloo() - @skip_if_lt_x_gpu(2) - def test_broadcast_coalesced_gloo_cuda(self): - store = c10d.FileStore(self.file_name, self.world_size) - options = c10d.ProcessGroupGloo._Options() - options._devices = [create_device(interface=LOOPBACK)] - process_group = c10d.ProcessGroupGloo( - store, self.rank, self.world_size, options - ) - device = torch.device("cuda:%d" % self.rank) - ranks = list(range(self.world_size)) - for root_rank in ranks: - self._test_broadcast_coalesced(process_group, device, root_rank) - - @requires_gloo() - def test_broadcast_coalesced_gloo_cpu(self): - store = c10d.FileStore(self.file_name, self.world_size) - options = c10d.ProcessGroupGloo._Options() - options._devices = [create_device(interface=LOOPBACK)] - process_group = c10d.ProcessGroupGloo( - store, self.rank, self.world_size, options - ) - device = torch.device("cpu") - ranks = list(range(self.world_size)) - for root_rank in ranks: - self._test_broadcast_coalesced(process_group, device, root_rank) - - def _test_sequence_num_set_default_pg(self, backend): - store = c10d.FileStore(self.file_name, 
self.world_size) - dist.init_process_group( - backend, - world_size=self.world_size, - rank=self.rank, - store=store, - ) - - default_pg = c10d.distributed_c10d._get_default_group() - seq_num = default_pg._get_sequence_number_for_group() - obj_list = [None for _ in range(dist.get_world_size())] - dist.all_gather_object(obj_list, seq_num) - self.assertEqual(len(set(obj_list)), 1) - - @requires_gloo() - @skip_if_lt_x_gpu(2) - def test_sequence_num_set_default_pg_gloo(self): - self._test_sequence_num_set_default_pg(backend="gloo") - - @requires_nccl() - @skip_if_lt_x_gpu(2) - def test_sequence_num_set_default_pg_nccl(self): - torch.cuda.set_device(self.rank) - self._test_sequence_num_set_default_pg(backend="nccl") - - def _test_sequence_num_set_new_group(self, backend): - store = c10d.FileStore(self.file_name, self.world_size) - dist.init_process_group( - backend, - world_size=self.world_size, - rank=self.rank, - store=store, - ) - - subgroup = dist.new_group([0, 1]) - subgroup_seq = subgroup._get_sequence_number_for_group() - obj_list = [None for _ in range(dist.get_world_size())] - dist.all_gather_object(obj_list, subgroup_seq) - self.assertEqual(len(set(obj_list)), 1) - - @requires_gloo() - @skip_if_lt_x_gpu(2) - def test_sequence_num_set_gloo_new_group(self): - self._test_sequence_num_set_new_group(backend="gloo") - - @requires_nccl() - @skip_if_lt_x_gpu(2) - def test_sequence_num_set_nccl_new_group(self): - torch.cuda.set_device(self.rank) - self._test_sequence_num_set_new_group(backend="nccl") - - @requires_nccl() - @skip_if_lt_x_gpu(2) - def test_pass_nccl_options_high_priority_stream(self): - pg_opts = c10d.ProcessGroupNCCL.Options() - pg_opts.is_high_priority_stream = True - - store = c10d.FileStore(self.file_name, self.world_size) - # Test init_process_group accepts options - dist.init_process_group( - "nccl", - world_size=self.world_size, - rank=self.rank, - store=store, - pg_options=pg_opts - ) - - # Test with new_group - pg = c10d.new_group([0, 1], pg_options=pg_opts) - # test if the process group constructed with high priority stream - self.assertTrue(pg.options.is_high_priority_stream) - # test the process group works as expected - t = torch.tensor([self.rank + 1] * 10).cuda(self.rank) - pg.allreduce(t).wait() - expected_tensor = torch.tensor([3] * 10).cuda(self.rank) - self.assertEqual(expected_tensor, t) - - @requires_nccl() - @skip_if_lt_x_gpu(4) - def test_nccl_barrier(self): - store = c10d.FileStore(self.file_name, self.world_size) - c10d.init_process_group( - backend="nccl", rank=self.rank, world_size=self.world_size, store=store - ) - - t = torch.tensor([self.rank + 1] * 10).cuda(2 * self.rank) - c10d.all_reduce(t) - expected_tensor = torch.tensor([3] * 10).cuda(2 * self.rank) - self.assertEqual(expected_tensor, t) - - # Test with new_group - pg = c10d.new_group([0, 1]) - t = torch.tensor([self.rank + 1] * 10).cuda(2 * self.rank) - pg.allreduce(t).wait() - self.assertEqual(expected_tensor, t) - - pg = c10d.new_group([0]) - if self.rank == 0: - t = torch.tensor([self.rank + 1] * 10).cuda(2 * self.rank) - expected_tensor = torch.tensor([self.rank + 1] * 10).cuda(2 * self.rank) - pg.allreduce(t).wait() - self.assertEqual(expected_tensor, t) - - pg = c10d.new_group([1]) - if self.rank == 1: - t = torch.tensor([self.rank + 1] * 10).cuda(2 * self.rank) - expected_tensor = torch.tensor([self.rank + 1] * 10).cuda(2 * self.rank) - pg.allreduce(t).wait() - self.assertEqual(expected_tensor, t) - - @requires_nccl() - @skip_if_lt_x_gpu(4) - def test_nccl_barrier_timeout(self): - 
store = c10d.FileStore(self.file_name, self.world_size) - if self.rank == 0: - with self.assertRaisesRegex( - RuntimeError, "Timed out initializing process group" - ): - c10d.init_process_group( - backend="nccl", - rank=self.rank, - world_size=self.world_size, - store=store, - timeout=timedelta(seconds=1), - ) - - @requires_nccl() - @skip_if_lt_x_gpu(4) - def test_nccl_barrier_timeout_new_group(self): - store = c10d.FileStore(self.file_name, self.world_size) - c10d.init_process_group( - backend="nccl", - rank=self.rank, - world_size=self.world_size, - store=store, - timeout=timedelta(seconds=1), - ) - - if self.rank == 0: - with self.assertRaisesRegex( - RuntimeError, "Timed out initializing process group" - ): - c10d.new_group([0, 1], timeout=timedelta(seconds=1)) - - with self.assertRaisesRegex( - RuntimeError, "Timed out initializing process group" - ): - c10d.new_group([0], timeout=timedelta(seconds=1)) - - @requires_nccl() - @skip_if_lt_x_gpu(4) - def test_nccl_barrier_timeout_new_group_non_member(self): - store = c10d.FileStore(self.file_name, self.world_size) - c10d.init_process_group( - backend="nccl", - rank=self.rank, - world_size=self.world_size, - store=store, - timeout=timedelta(seconds=1), - ) - - if self.rank == 1: - with self.assertRaisesRegex( - RuntimeError, "Timed out initializing process group" - ): - c10d.new_group([0, 1], timeout=timedelta(seconds=1)) - - with self.assertRaisesRegex( - RuntimeError, "Timed out initializing process group" - ): - c10d.new_group([0], timeout=timedelta(seconds=1)) - - @requires_nccl() - @skip_if_lt_x_gpu(2) - def test_nccl_barrier_device_ids(self): - store = c10d.FileStore(self.file_name, self.world_size) - c10d.init_process_group( - backend="nccl", rank=self.rank, world_size=self.world_size, store=store - ) - - c10d.barrier(device_ids=[self.rank]) - - @requires_nccl() - @skip_if_lt_x_gpu(2) - def test_nccl_barrier_device_ids_function_argument(self): - store = c10d.FileStore(self.file_name, self.world_size) - c10d.init_process_group( - backend="nccl", rank=self.rank, world_size=self.world_size, store=store - ) - - with self.assertRaisesRegex(RuntimeError, "Invalid function argument"): - c10d.barrier(device_ids=self.rank) - - @requires_gloo() - def test_gloo_barrier_device_ids(self): - store = c10d.FileStore(self.file_name, self.world_size) - c10d.init_process_group( - backend="gloo", rank=self.rank, world_size=self.world_size, store=store - ) - - with self.assertRaisesRegex(RuntimeError, "device_ids not supported"): - c10d.barrier(device_ids=[self.rank]) - - def test_distributed_debug_mode(self): - # Default should be off - default_debug_mode = dist._get_debug_mode() - self.assertEqual(default_debug_mode, dist._DistributedDebugLevel.OFF) - mapping = { - "OFF": dist._DistributedDebugLevel.OFF, - "INFO": dist._DistributedDebugLevel.INFO, - "DETAIL": dist._DistributedDebugLevel.DETAIL, - } - invalid_debug_modes = ["foo", 0, 1, -1] - - for mode in mapping.keys(): - os.environ["TORCH_DISTRIBUTED_DEBUG"] = str(mode) - set_debug_mode = dist._get_debug_mode() - self.assertEqual( - set_debug_mode, - mapping[mode], - f"Expected {mode} to map to {mapping[mode]} but got {set_debug_mode}", - ) - - for mode in invalid_debug_modes: - os.environ["TORCH_DISTRIBUTED_DEBUG"] = str(mode) - with self.assertRaisesRegex(RuntimeError, "to be one of"): - dist._get_debug_mode() - -if __name__ == "__main__": - assert ( - not torch.cuda._initialized - ), "test_distributed must not have initialized CUDA context on main process" - - run_tests() diff --git 
a/test/distributed/test_c10d_common.py b/test/distributed/test_c10d_common.py
new file mode 100644
index 00000000000..d2c11e5ee13
--- /dev/null
+++ b/test/distributed/test_c10d_common.py
@@ -0,0 +1,970 @@
+import copy
+import os
+import random
+import sys
+import tempfile
+import threading
+import time
+import traceback
+import unittest
+from datetime import timedelta
+from itertools import product
+from sys import platform
+
+import torch
+import torch.distributed as c10d
+
+if not c10d.is_available():
+    print("c10d not available, skipping tests", file=sys.stderr)
+    sys.exit(0)
+
+import torch.distributed as dist
+import torch.distributed.algorithms.ddp_comm_hooks.powerSGD_hook as powerSGD
+import torch.multiprocessing as mp
+import torch.nn.functional as F
+import torch.testing._internal.common_utils as common
+from torch import nn
+from torch._six import string_classes
+from torch.nn.parallel import DistributedDataParallel
+from torch.testing._internal.common_distributed import (
+    MultiProcessTestCase,
+    skip_if_win32,
+    create_tcp_store
+)
+from torch.testing._internal.common_utils import (
+    TestCase,
+    load_tests,
+    run_tests,
+    retry_on_connect_failures,
+    ADDRESS_IN_USE,
+    CONNECT_TIMEOUT,
+    TEST_WITH_TSAN,
+    IS_WINDOWS,
+)
+
+# load_tests from common_utils is used to automatically filter tests for
+# sharding on sandcastle. This line silences flake warnings
+load_tests = load_tests
+
+if platform == "darwin":
+    LOOPBACK = "lo0"
+else:
+    LOOPBACK = "lo"
+
+DEFAULT_HOSTNAME = "localhost"
+
+
+def gpus_for_rank(world_size):
+    """Multigpu tests are designed to simulate multi-node training with
+    multiple GPUs on each node. The NCCL backend requires an equal number of
+    GPUs in each process. On a single node, all visible GPUs are evenly
+    divided into subsets, and each process uses only its own subset.
+    """
+    visible_devices = list(range(torch.cuda.device_count()))
+    gpus_per_process = torch.cuda.device_count() // world_size
+    gpus_for_rank = []
+    for rank in range(world_size):
+        gpus_for_rank.append(
+            visible_devices[rank * gpus_per_process: (rank + 1) * gpus_per_process]
+        )
+    return gpus_for_rank
+
+
+class StoreTestBase(object):
+    def _create_store(self, i):
+        raise RuntimeError("not implemented")
+
+    def _test_set_get(self, fs):
+        fs.add("key", 1)
+        fs.add("key", 2)
+        fs.add("key", 3)
+        fs.set("key0", "value0")
+        fs.add("key3", 1)
+        fs.set("key1", "value1")
+        fs.add("key3", 2)
+        fs.set("key2", "value2")
+        fs.add("key3", 3)
+        fs.add("key3", 4)
+        fs.add("key3", 5)
+        fs.add("key3", 6)
+        self.assertEqual(fs.num_keys(), self.num_keys_total)
+        self.assertEqual(b"6", fs.get("key"))
+        self.assertEqual(b"value0", fs.get("key0"))
+        self.assertEqual(b"value1", fs.get("key1"))
+        self.assertEqual(b"value2", fs.get("key2"))
+        self.assertEqual(b"21", fs.get("key3"))
+
+    def test_set_get(self):
+        self._test_set_get(self._create_store())
+
+    def test_compare_set(self):
+        store = self._create_store()
+        missing_key_result = store.compare_set("key0", "wrong_old_value", "new_value0")
+        self.assertEqual(b"wrong_old_value", missing_key_result)
+
+        store.set("key0", "value0")
+        self.assertEqual(b"value0", store.get("key0"))
+        old_value_result = store.compare_set("key0", "wrong_old_value", "new_value0")
+        self.assertEqual(b"value0", old_value_result)
+        self.assertEqual(b"value0", store.get("key0"))
+        new_value_result = store.compare_set("key0", "value0", "new_value0")
+        self.assertEqual(b"new_value0", new_value_result)
+        self.assertEqual(b"new_value0", store.get("key0"))
+
+    # This is the number of keys used in test_set_get. Adding this as a class
+    # property instead of hardcoding in the test since some Store
+    # implementations will have differing number of keys. In the base case,
+    # there will be 5 keys: key, key0, key1, key2, key3.
+    @property
+    def num_keys_total(self):
+        return 5
+
+
+class FileStoreTest(TestCase, StoreTestBase):
+    def setUp(self):
+        super(FileStoreTest, self).setUp()
+        self.file = tempfile.NamedTemporaryFile(delete=False)
+
+    def _create_store(self):
+        store = c10d.FileStore(self.file.name, 1)
+        store.set_timeout(timedelta(seconds=300))
+        return store
+
+
+@skip_if_win32()
+class HashStoreTest(TestCase, StoreTestBase):
+    def setUp(self):
+        super(HashStoreTest, self).setUp()
+
+    def _create_store(self):
+        store = c10d.HashStore()
+        store.set_timeout(timedelta(seconds=300))
+        return store
+
+
+class PrefixFileStoreTest(TestCase, StoreTestBase):
+    def setUp(self):
+        super(PrefixFileStoreTest, self).setUp()
+        self.file = tempfile.NamedTemporaryFile(delete=False)
+        self.filestore = c10d.FileStore(self.file.name, 1)
+        self.prefix = "test_prefix"
+        self.filestore.set_timeout(timedelta(seconds=300))
+
+    def _create_store(self):
+        return c10d.PrefixStore(self.prefix, self.filestore)
+
+
+class TCPStoreTest(TestCase, StoreTestBase):
+    def _create_store(self):
+        store = create_tcp_store()
+        store.set_timeout(timedelta(seconds=300))
+        return store
+
+    def test_address_already_in_use(self):
+        if sys.platform == "win32":
+            err_msg_reg = "Only one usage of each socket address*"
+        else:
+            err_msg_reg = "^Address already in use$"
+        with self.assertRaisesRegex(RuntimeError, err_msg_reg):
+            addr = DEFAULT_HOSTNAME
+            port = common.find_free_port()
+
+            # Use noqa to silence flake8.
+            # Need to store in an unused variable here to ensure the first
+            # object is not destroyed before the second object is created.
+            store1 = c10d.TCPStore(addr, port, 1, True)  # noqa: F841
+            store2 = c10d.TCPStore(addr, port, 1, True)  # noqa: F841
+
+    # The TCPStore has 6 keys in test_set_get. It contains the 5 keys added by
+    # the user and one additional key used to coordinate all the workers.
+    @property
+    def num_keys_total(self):
+        return 6
+
+    def _test_numkeys_delkeys(self, fs):
+        # We start off with one init key in the store to coordinate workers
+        self.assertEqual(fs.num_keys(), 1)
+        fs.add("key", 1)
+        fs.add("key", 2)
+        fs.add("key", 3)
+        fs.set("key0", "value0")
+        fs.add("key3", 1)
+        fs.set("key1", "value1")
+        self.assertEqual(fs.num_keys(), 5)
+        fs.delete_key("key")
+        self.assertEqual(fs.num_keys(), 4)
+        fs.set_timeout(timedelta(seconds=2))
+        with self.assertRaises(RuntimeError):
+            fs.get("key")
+        fs.delete_key("key0")
+        fs.delete_key("key3")
+        self.assertEqual(fs.num_keys(), 2)
+        fs.set("key4", "value2")
+        self.assertEqual(fs.num_keys(), 3)
+        self.assertEqual(b"value1", fs.get("key1"))
+        self.assertEqual(b"value2", fs.get("key4"))
+
+    def test_numkeys_delkeys(self):
+        self._test_numkeys_delkeys(self._create_store())
+
+    def _create_client(self, index, addr, port, world_size, messages):
+        try:
+            client_store = dist.TCPStore(addr, port, world_size, timeout=timedelta(seconds=10))
+            self.assertEqual("value".encode(), client_store.get("key"))
+            client_store.set(f"new_key{index}", f"new_value{index}")
+            self.assertEqual(f"next_value{index}".encode(),
+                             client_store.compare_set(f"new_key{index}", f"new_value{index}", f"next_value{index}"))
+        except Exception:
+            messages.put('Caught exception: \n{}exiting process with exit code: {}'
+                         .format(traceback.format_exc(), MultiProcessTestCase.TEST_ERROR_EXIT_CODE))
+            sys.exit(MultiProcessTestCase.TEST_ERROR_EXIT_CODE)
+
+    def _multi_worker_helper(self, world_size):
+        addr = DEFAULT_HOSTNAME
+        server_store = create_tcp_store(addr, world_size, wait_for_workers=False)
+        server_store.set("key", "value")
+        port = server_store.port
+        messages = mp.Queue()
+        processes = []
+        num_proccesses = random.randint(3, 5) if world_size == -1 else world_size
+        for i in range(num_proccesses):
+            p = mp.Process(target=self._create_client, args=(i, addr, port, world_size, messages))
+            processes.append(p)
+            p.start()
+        for p in processes:
+            p.join()
+        error_message = ""
+        while not messages.empty():
+            error_message += messages.get() + "\n"
+        if any([p.exitcode != 0 for p in processes]):
+            raise RuntimeError(error_message)
+
+    @unittest.skipIf(
+        IS_WINDOWS, "Skip test for windows due to multiprocessing library error when using windows spawn"
+    )
+    def test_multi_worker_with_fixed_world_size(self):
+        self._multi_worker_helper(5)
+
+    @unittest.skipIf(
+        IS_WINDOWS, "Skip test for windows due to multiprocessing library error when using windows spawn"
+    )
+    def test_multi_worker_with_nonfixed_world_size(self):
+        self._multi_worker_helper(-1)
+
+
+class PrefixTCPStoreTest(TestCase, StoreTestBase):
+    def setUp(self):
+        super(PrefixTCPStoreTest, self).setUp()
+        self.tcpstore = create_tcp_store()
+        self.prefix = "test_prefix"
+        self.tcpstore.set_timeout(timedelta(seconds=300))
+
+    def _create_store(self):
+        return c10d.PrefixStore(self.prefix, self.tcpstore)
+
+    # The PrefixTCPStore has 6 keys in test_set_get. It contains the 5 keys
+    # added by the user and one additional key used to coordinate all the
+    # workers.
+    @property
+    def num_keys_total(self):
+        return 6
+
+
+class MyPythonStore(c10d.Store):
+    def __init__(self):
+        super(MyPythonStore, self).__init__()
+        self.store = dict()
+
+    def set(self, key, value):
+        if not isinstance(key, string_classes):
+            raise AssertionError("Expected set to be called with string key")
+        if type(value) is not bytes:
+            raise AssertionError("Expected set to be called with bytes value")
+        self.store[key] = value
+
+    def get(self, key):
+        value = self.store.get(key, b"")
+        if type(value) is not bytes:
+            raise AssertionError("Expected get to return bytes value")
+        return value
+
+    def add(self, key, value):
+        new = int(self.store.get(key, 0)) + value
+        self.set(key, bytes(str(new).encode("utf-8")))
+        return new
+
+
+class PythonStoreTest(TestCase):
+    def setUp(self):
+        super(PythonStoreTest, self).setUp()
+
+    def test_set_get(self):
+        # If we were to inherit from StoreTestBase and try to use
+        # its test_set_get function, we would exercise the Python
+        # API directly, instead of going through the C++ trampoline.
+        # We care about testing the C++ trampoline, so run the
+        # equivalent of StoreTestBase.test_set_get from C++.
+        # See `torch/csrc/distributed/c10d/init.cpp` for the definition
+        # of this test function.
+        c10d._test_python_store(MyPythonStore())
+
+
+class RendezvousTest(TestCase):
+    def test_unknown_handler(self):
+        with self.assertRaisesRegex(RuntimeError, "^No rendezvous handler"):
+            c10d.rendezvous("invalid://")
+
+
+class RendezvousEnvTest(TestCase):
+    @retry_on_connect_failures
+    def test_nominal(self):
+        os.environ["WORLD_SIZE"] = "1"
+        os.environ["MASTER_ADDR"] = "127.0.0.1"
+        os.environ["MASTER_PORT"] = str(common.find_free_port())
+
+        # Single rank
+        os.environ["RANK"] = "0"
+        gen0 = c10d.rendezvous("env://")
+        store0, rank0, size0 = next(gen0)
+        self.assertEqual(0, rank0)
+        self.assertEqual(1, size0)
+
+        store0.set("key0", "value0")
+
+        # check with get
+        self.assertEqual(b"value0", store0.get("key0"))
+
+
+class RendezvousFileTest(TestCase):
+    def test_common_errors(self):
+        with self.assertRaisesRegex(ValueError, "path missing"):
+            gen = c10d.rendezvous("file://?rank=0&world_size=1")
+            next(gen)
+        with self.assertRaisesRegex(ValueError, "rank parameter missing"):
+            gen = c10d.rendezvous("file:///tmp/foo?world_size=1")
+            next(gen)
+        with self.assertRaisesRegex(ValueError, "size parameter missing"):
+            gen = c10d.rendezvous("file:///tmp/foo?rank=0")
+            next(gen)
+
+    def test_nominal(self):
+        with tempfile.NamedTemporaryFile(delete=False) as file:
+            url = f'file:///{file.name.replace(os.path.sep, "/")}?world_size=2'
+            gen0 = c10d.rendezvous(url + "&rank=0")
+            store0, rank0, size0 = next(gen0)
+            self.assertEqual(0, rank0)
+            self.assertEqual(2, size0)
+            gen1 = c10d.rendezvous(url + "&rank=1")
+            store1, rank1, size1 = next(gen1)
+            self.assertEqual(1, rank1)
+            self.assertEqual(2, size1)
+
+            # Set value on both stores
+            store0.set("key0", "value0")
+            store1.set("key1", "value1")
+
+            # Cross check with get
+            self.assertEqual(b"value0", store1.get("key0"))
+            self.assertEqual(b"value1", store0.get("key1"))
+
+
+@skip_if_win32()
+class RendezvousTCPTest(TestCase):
+    def create_tcp_url(self):
+        addr = DEFAULT_HOSTNAME
+        port = common.find_free_port()
+        url = "tcp://%s:%d?world_size=%d" % (addr, port, 1)
+        return url
+
+    def test_common_errors(self):
+        with self.assertRaisesRegex(ValueError, "port number missing"):
+            gen = c10d.rendezvous("tcp://127.0.0.1?rank=0&world_size=1")
+            next(gen)
+        with self.assertRaisesRegex(ValueError, "rank parameter missing"):
+
gen = c10d.rendezvous("tcp://127.0.0.1:23456?world_size=1") + next(gen) + with self.assertRaisesRegex(ValueError, "size parameter missing"): + gen = c10d.rendezvous("tcp://127.0.0.1:23456?rank=0") + next(gen) + + @retry_on_connect_failures + def test_nominal(self): + url = self.create_tcp_url() + gen0 = c10d.rendezvous(url + "&rank=0") + store0, rank0, size0 = next(gen0) + self.assertEqual(0, rank0) + self.assertEqual(1, size0) + + # Set value on the single store + store0.set("key0", "value0") + + # check with get + self.assertEqual(b"value0", store0.get("key0")) + + @retry_on_connect_failures(connect_errors=(CONNECT_TIMEOUT, ADDRESS_IN_USE)) + def test_tcp_store_timeout_set(self): + url = self.create_tcp_url() + test_store_timeout = timedelta(seconds=10) + gen0 = c10d.rendezvous(url + "&rank=0", timeout=test_store_timeout) + store0, rank0, size0 = next(gen0) + # this should time out in 10s. If the timeout passed into rendezvous was + # not respected, it will take much longer to timeout. + start = time.time() + with self.assertRaisesRegex(RuntimeError, "Timeout"): + store0.get("nonexistant key") + + end = time.time() + time_diff = end - start + self.assertGreater(test_store_timeout.seconds * 10, time_diff) + + +class AbstractTimeoutTest(object): + def _test_store_timeout(self, backend, init_method, c2p): + try: + c10d.distributed_c10d.init_process_group( + backend=backend, + init_method=init_method, + world_size=1, + rank=0, + timeout=timedelta(seconds=1), + ) + default_store = c10d.distributed_c10d._get_default_store() + tik = time.time() + with self.assertRaisesRegex(RuntimeError, "Timeout"): + default_store.get("nonexistent key") + tok = time.time() + c10d.destroy_process_group() + c2p.append(float(tok - tik)) + except RuntimeError as e: + # catch "Address already in use" error and report it to the main + # thread + c2p.append(e) + + def _init_methods(self): + f = tempfile.NamedTemporaryFile(delete=False) + if sys.platform == "win32": + yield "file:///%s" % f.name.replace("\\", "/") + f.close() + else: + yield "file://%s" % f.name + f.close() + yield "tcp://127.0.0.1:%d" % common.find_free_port() + + def _test_default_store_timeout(self, backend): + for init_method in self._init_methods(): + c2p = [] + t = threading.Thread( + target=self._test_store_timeout, args=(backend, init_method, c2p) + ) + t.daemon = True + t.start() + t.join(5) + + self.assertEqual(1, len(c2p)) + if isinstance(c2p[0], float): + # waiting time should be 1s, use 3s to rule out false alarm + self.assertGreater(3, c2p[0]) + elif isinstance(c2p[0], RuntimeError): + # let @retry_on_connect_failures handle the error + raise c2p[0] + else: + raise RuntimeError("Unexpected type {}".format(type(c2p[0]))) + + +class Net(nn.Module): + def __init__(self): + super(Net, self).__init__() + self.fc1 = nn.Linear(2, 10, bias=False) + self.fc2 = nn.Linear(10, 50, bias=False) + self.fc3 = nn.Linear(50, 4, bias=False) + self.relu = nn.ReLU() + + def forward(self, x): + x = self.relu(self.fc1(x)) + x = self.relu(self.fc2(x)) + x = self.fc3(x) + return F.softmax(x, dim=1) + + +class DoubleGpuNet(nn.Module): + def __init__(self, gpus): + super(DoubleGpuNet, self).__init__() + self.fc1 = nn.Linear(2, 10, bias=False).to(gpus[0]) + self.fc2 = nn.Linear(10, 50, bias=False).to(gpus[1]) + self.fc3 = nn.Linear(50, 4, bias=False).to(gpus[1]) + self.relu = nn.ReLU() + self.no_grad_param = nn.Parameter( + torch.tensor([2, 2]).long(), requires_grad=False + ).to(gpus[0]) + + def forward(self, x): + dev0 = self.fc1.weight.device + dev1 = 
self.fc2.weight.device + x = self.relu(self.fc1(x.to(dev0))) + x = self.relu(self.fc2(x.to(dev1))) + x = self.fc3(x) + return F.softmax(x, dim=1).to(dev0) + + +class QuadraGpuNet(nn.Module): + def __init__(self, gpus): + super(QuadraGpuNet, self).__init__() + self.fc1 = nn.Linear(2, 10, bias=False).to(gpus[0]) + self.fc2 = nn.Linear(10, 50, bias=False).to(gpus[1]) + self.fc3 = nn.Linear(50, 4, bias=False).to(gpus[2]) + self.fc4 = nn.Linear(4, 4, bias=False).to(gpus[3]) + self.relu = nn.ReLU() + self.no_grad_param = nn.Parameter( + torch.tensor([2, 2]).long(), requires_grad=False + ).to(gpus[0]) + + def forward(self, x): + dev0 = self.fc1.weight.device + dev1 = self.fc2.weight.device + dev2 = self.fc3.weight.device + dev3 = self.fc4.weight.device + x = self.relu(self.fc1(x.to(dev0))) + x = self.relu(self.fc2(x.to(dev1))) + x = self.relu(self.fc3(x.to(dev2))) + x = self.fc4(x.to(dev3)) + return F.softmax(x, dim=1).to(dev0) + + +class ConvNet(nn.Module): + def __init__(self, gpus, layouts, dtypes): + super(ConvNet, self).__init__() + self.dtypes = dtypes + if isinstance(gpus, list): + self.layer_gpus = gpus + else: + gpus = [gpus] * 4 + self.conv0 = torch.nn.Conv2d(8, 16, (2, 2)).to( + device=gpus[0], memory_format=layouts[0], dtype=dtypes[0] + ) + self.conv1 = torch.nn.Conv2d(16, 32, (2, 2)).to( + device=gpus[1], memory_format=layouts[1], dtype=dtypes[1] + ) + self.conv2 = torch.nn.Conv2d(32, 16, (2, 2)).to( + device=gpus[2], memory_format=layouts[2], dtype=dtypes[2] + ) + self.conv3 = torch.nn.Conv2d(16, 8, (2, 2)).to( + device=gpus[3], memory_format=layouts[3], dtype=dtypes[3] + ) + + def forward(self, x): + x = x.to(self.dtypes[0]) + # Could say + # x = self.conv0(x).to(device=self.conv1.weight.device, dtype=self.dtypes[1]) + # etc. But I don't want to appeal to the weights' devices directly, because part of this test's purpose + # is to verify weights are where expected if the model gets replicated. 
+ gpus = self.layer_gpus if hasattr(self, "layer_gpus") else [x.device] * 4 + x = self.conv0(x).to(device=gpus[1], dtype=self.dtypes[1]) + x = self.conv1(x).to(device=gpus[2], dtype=self.dtypes[2]) + x = self.conv2(x).to(device=gpus[3], dtype=self.dtypes[3]) + return self.conv3(x) + + +class Task(nn.Module): + def __init__(self): + super().__init__() + self.p = nn.Parameter(torch.ones(2, 2)) + + def forward(self, x): + return self.p + x + + +class ModuleForDdpCommHook(nn.Module): + def __init__(self): + super().__init__() + self.t0 = Task() + + def forward(self, x, rank): + return self.t0(x + rank) + + +class SparseGradientModule(nn.Module): + def __init__(self): + super(SparseGradientModule, self).__init__() + self.embedding = nn.EmbeddingBag(10, 10, sparse=True) + + def forward(self, x): + return F.softmax(self.embedding(x), dim=1) + + +class AbstractDistributedDataParallelTest(object): + def tearDown(self): + # DistributedDataParallel test doesn't seem to call FileStore destructor + # TODO: investigate this test and the test is known to have issues + # Use this hack to remove files for that test + try: + os.remove(self.file_name) + except OSError: + pass + + @property + def world_size(self): + return 2 + + def _prepare_single_device_module( + self, + process_group, + devices, + device_ids, + global_batch_size, + gradient_as_bucket_view=False, + ): + model = Net() + device = devices[0] if devices else torch.device("cuda:%d" % self.rank) + ddp_model = DistributedDataParallel( + copy.deepcopy(model).to(device), + device_ids=device_ids, + process_group=process_group, + bucket_cap_mb=0.001, + gradient_as_bucket_view=gradient_as_bucket_view, + ) + + model.to(device) + + input = torch.randn(global_batch_size, 2).to(device) + target = torch.randn(global_batch_size, 4).to(device) + + return model, ddp_model, input, target + + def _prepare_multi_device_module( + self, + process_group, + devices, + device_ids, + global_batch_size, + gradient_as_bucket_view=False, + ): + self.assertTrue( + len(devices) == 2 or len(devices) == 4, + "unexpected devices for ddp tests {}".format(devices), + ) + if len(devices) == 2: + model = DoubleGpuNet(devices) + elif len(devices) == 4: + model = QuadraGpuNet(devices) + + ddp_model = DistributedDataParallel( + copy.deepcopy(model), + device_ids=device_ids, + process_group=process_group, + bucket_cap_mb=0.001, + gradient_as_bucket_view=gradient_as_bucket_view, + ) + + input = torch.randn(global_batch_size, 2).cuda(devices[0]) + target = torch.randn(global_batch_size, 4) + + return model, ddp_model, input, target + + def _test_ddp_with_process_group( + self, + process_group, + devices, + device_ids, + multi_device=False, + gradient_as_bucket_view=False, + ): + """ + Note: we pass down `device_ids` all the way to DistributedDataParallel + as part of the test. Below you find tests that either use a list of + integers, a list of `torch.Device` instances, or an empty list. + The `devices` argument is used to control placement of the model and + must always be specified as list of `torch.Device` instances. 
+ """ + local_batch_size = 1 if devices is None else len(devices) + global_batch_size = self.world_size * local_batch_size + + if multi_device: + model, ddp_model, input, target = self._prepare_multi_device_module( + process_group, + devices, + device_ids, + global_batch_size, + gradient_as_bucket_view, + ) + ddp_logging_data = ddp_model.get_ddp_logging_data() + self.assertTrue(ddp_logging_data.is_multi_device_module) + else: + model, ddp_model, input, target = self._prepare_single_device_module( + process_group, + devices, + device_ids, + global_batch_size, + gradient_as_bucket_view, + ) + ddp_logging_data = ddp_model.get_ddp_logging_data() + self.assertFalse(ddp_logging_data.is_multi_device_module) + + def step_model(model, input, target): + model.train() + output = model(input) + loss = F.mse_loss(output, target.to(output.device)) + loss.backward() + + def update_parameters(model): + for param in model.parameters(): + with torch.no_grad(): + param -= param.grad + param.grad = None + + # check two model parameters over 2 iterations + for iteration in range(2): + # single cpu/gpu training + step_model(model, input, target) + + # DDP training, DDP scatters subsets of input_cpu to nodes/GPUs + step_model( + ddp_model, + input[ + self.rank * local_batch_size: (self.rank + 1) * local_batch_size + ], + target[ + self.rank * local_batch_size: (self.rank + 1) * local_batch_size + ], + ) + + # Update weights and run a second iteration to shake out errors + update_parameters(model) + update_parameters(ddp_model) + self.assertEqual( + len(list(model.parameters())), len(list(ddp_model.parameters())) + ) + for i, j in zip(model.parameters(), ddp_model.parameters()): + self.assertEqual(i, j) + + # Shuffle the input so that DDP input is different + torch.manual_seed(1337 + iteration) + input = input[torch.randperm(global_batch_size)] + + def _gpu_model_with_ddp_comm_hook( + self, process_group, hook=None, gradient_as_bucket_view=False, state=None + ): + device_id = gpus_for_rank(self.world_size)[self.rank][0] + gpu_model = DistributedDataParallel( + ModuleForDdpCommHook().to(device_id), + device_ids=[device_id], + process_group=process_group, + gradient_as_bucket_view=gradient_as_bucket_view, + ) + + # Register a DDP communication hook if any. + if hook is not None: + gpu_model.register_comm_hook(state, hook) + + return gpu_model + + def _gpu_model_with_builtin_ddp_comm_hook( + self, process_group, hook=None, gradient_as_bucket_view=False + ): + device_id = gpus_for_rank(self.world_size)[self.rank][0] + gpu_model = DistributedDataParallel( + ModuleForDdpCommHook().to(device_id), + device_ids=[device_id], + process_group=process_group, + gradient_as_bucket_view=gradient_as_bucket_view, + ) + + # Register a built-in DDP communication hook if defined + if hook is not None: + gpu_model._register_builtin_comm_hook(hook) + + return gpu_model + + def _run_and_verify_hook(self, model, input, expected_grad): + # Run forward + output = model(input, self.rank) + + # Run backward + output.mean().backward() + + [self.assertEqual(p.grad, expected_grad) for p in model.parameters()] + + def _simple_hook( + self, state: object, bucket: dist.GradBucket + ) -> torch.futures.Future: + fut = torch.futures.Future() + fut.set_result([torch.ones_like(bucket.get_tensor())]) + + def fut_then(fut): + # Add ones to fut's result. 
+ return [t + torch.ones_like(t) for t in fut.value()] + + return fut.then(fut_then) + + +@unittest.skipIf( + TEST_WITH_TSAN, + "TSAN is not fork-safe since we're forking in a multi-threaded environment", +) +class DistributedDataParallelTest(AbstractDistributedDataParallelTest, MultiProcessTestCase): + + def setUp(self): + super(DistributedDataParallelTest, self).setUp() + if sys.platform == "win32": + self._spawn_processes() + else: + self._fork_processes() + + def test_invalid_powerSGD_state(self): + for start_powerSGD_iter, use_error_feedback, warm_start in product( + [0, 1], [True, False], [True, False] + ): + if not use_error_feedback and not warm_start: + continue + with self.assertRaisesRegex( + ValueError, + "Expect `start_powerSGD_iter` > 1 if `use_error_feedback` or `warm_start` is enabled, " + "because PowerSGD can only be applied after the first two iterations in DDP.", + ): + state = powerSGD.PowerSGDState( + process_group=None, + matrix_approximation_rank=1, + start_powerSGD_iter=start_powerSGD_iter, + use_error_feedback=use_error_feedback, + warm_start=warm_start, + ) + + +class ComputeBucketAssignmentTest(TestCase): + def test_single_limit_single_dtype(self): + tensors = [ + torch.empty([100], dtype=torch.float), + torch.empty([200], dtype=torch.float), + torch.empty([100], dtype=torch.float), + torch.empty([50], dtype=torch.float), + ] + result = dist._compute_bucket_assignment_by_size(tensors, [400]) + self.assertEqual([[0], [1], [2], [3]], result) + + def test_single_limit_multi_dtype(self): + tensors = [ + torch.empty([50], dtype=torch.float), + torch.empty([25], dtype=torch.double), + torch.empty([50], dtype=torch.float), + torch.empty([25], dtype=torch.double), + torch.empty([50], dtype=torch.float), + torch.empty([25], dtype=torch.double), + ] + result = dist._compute_bucket_assignment_by_size(tensors, [400]) + self.assertEqual([[0, 2], [1, 3], [4], [5]], result) + + def test_multi_limit_single_dtype(self): + tensors = [ + torch.empty([10], dtype=torch.float), + torch.empty([10], dtype=torch.float), + torch.empty([10], dtype=torch.float), + torch.empty([10], dtype=torch.float), + ] + result = dist._compute_bucket_assignment_by_size(tensors, [40, 80]) + self.assertEqual([[0], [1, 2], [3]], result) + + def test_multi_limit_multi_dtype(self): + tensors = [ + torch.empty([50], dtype=torch.float), + torch.empty([25], dtype=torch.double), + torch.empty([50], dtype=torch.float), + torch.empty([25], dtype=torch.double), + torch.empty([50], dtype=torch.float), + torch.empty([25], dtype=torch.double), + ] + result = dist._compute_bucket_assignment_by_size(tensors, [200, 400]) + self.assertEqual([[0], [1], [2, 4], [3, 5]], result) + + +class AbstractCommTest(object): + + @property + def op_timeout_sec(self): + return 1 + + @property + def world_size(self): + return 2 + + def _test_sequence_num_set_default_pg(self, backend): + store = c10d.FileStore(self.file_name, self.world_size) + dist.init_process_group( + backend, + world_size=self.world_size, + rank=self.rank, + store=store, + ) + + default_pg = c10d.distributed_c10d._get_default_group() + seq_num = default_pg._get_sequence_number_for_group() + obj_list = [None for _ in range(dist.get_world_size())] + dist.all_gather_object(obj_list, seq_num) + self.assertEqual(len(set(obj_list)), 1) + + def _test_sequence_num_set_new_group(self, backend): + store = c10d.FileStore(self.file_name, self.world_size) + dist.init_process_group( + backend, + world_size=self.world_size, + rank=self.rank, + store=store, + ) + + subgroup = 
dist.new_group([0, 1]) + subgroup_seq = subgroup._get_sequence_number_for_group() + obj_list = [None for _ in range(dist.get_world_size())] + dist.all_gather_object(obj_list, subgroup_seq) + self.assertEqual(len(set(obj_list)), 1) + + +@unittest.skipIf( + TEST_WITH_TSAN, + "TSAN is not fork-safe since we're forking in a multi-threaded environment", +) +class CommTest(AbstractCommTest, MultiProcessTestCase): + + def setUp(self): + super(CommTest, self).setUp() + if sys.platform == "win32": + self._spawn_processes() + else: + self._fork_processes() + + def tearDown(self): + super(CommTest, self).tearDown() + try: + os.remove(self.file_name) + except OSError: + pass + + def test_distributed_debug_mode(self): + # Default should be off + default_debug_mode = dist._get_debug_mode() + self.assertEqual(default_debug_mode, dist._DistributedDebugLevel.OFF) + mapping = { + "OFF": dist._DistributedDebugLevel.OFF, + "INFO": dist._DistributedDebugLevel.INFO, + "DETAIL": dist._DistributedDebugLevel.DETAIL, + } + invalid_debug_modes = ["foo", 0, 1, -1] + + for mode in mapping.keys(): + os.environ["TORCH_DISTRIBUTED_DEBUG"] = str(mode) + set_debug_mode = dist._get_debug_mode() + self.assertEqual( + set_debug_mode, + mapping[mode], + f"Expected {mode} to map to {mapping[mode]} but got {set_debug_mode}", + ) + + for mode in invalid_debug_modes: + os.environ["TORCH_DISTRIBUTED_DEBUG"] = str(mode) + with self.assertRaisesRegex(RuntimeError, "to be one of"): + dist._get_debug_mode() + + +if __name__ == "__main__": + assert ( + not torch.cuda._initialized + ), "test_distributed must not have initialized CUDA context on main process" + + run_tests() diff --git a/test/distributed/test_c10d_gloo.py b/test/distributed/test_c10d_gloo.py new file mode 100644 index 00000000000..b53654f787c --- /dev/null +++ b/test/distributed/test_c10d_gloo.py @@ -0,0 +1,2072 @@ +import copy +import logging +import math +import operator +import os +import random +import sys +import tempfile +import unittest +from functools import reduce +from itertools import groupby + +import torch +import torch.distributed as c10d + +if not c10d.is_available(): + print("c10d not available, skipping tests", file=sys.stderr) + sys.exit(0) + +import torch.distributed as dist +import torch.nn.functional as F +import torch.testing._internal.common_utils as common +from torch import nn +from torch.nn.parallel import DistributedDataParallel +from torch.testing._internal.common_distributed import ( + MultiProcessTestCase, + requires_gloo, + skip_if_lt_x_gpu, + simple_sparse_reduce_tests, + skip_if_win32, + create_device, +) +from torch.testing._internal.common_utils import ( + TestCase, + run_tests, + retry_on_connect_failures, + TEST_WITH_TSAN, +) +import test_c10d_common +from test_c10d_common import LOOPBACK, gpus_for_rank, Task, ModuleForDdpCommHook, SparseGradientModule + + +def simple_reduce_tests(rank, world_size): + tests = [ + ( + c10d.ReduceOp.SUM, + torch.tensor([rank + 1.0]), + torch.tensor([float(world_size * (world_size + 1) / 2)]), + ), + ( + c10d.ReduceOp.PRODUCT, + torch.tensor([rank + 1.0]), + torch.tensor([float(math.factorial(world_size))]), + ), + ( + c10d.ReduceOp.MIN, + torch.tensor([rank + 1.0]), + torch.tensor([1.0]), + ), + ( + c10d.ReduceOp.MAX, + torch.tensor([rank + 1.0]), + torch.tensor([world_size]), + ), + ] + + # Generate tests for BAND. + # The bit that is set changes in every iteration to check + # that the output changes accordingly. 
+ for i in range(4): + vin = rank | (1 << i) + vout = 1 << i + tests.append( + ( + c10d.ReduceOp.BAND, + torch.tensor([vin], dtype=torch.int32), + torch.tensor([vout], dtype=torch.int32), + ), + ) + + # Generate tests for BOR. + # These emulate a larger world size per iteration by having every + # rank contribute multiple values that are pre-OR'ed. + for i in range(1, 5): + vin = reduce(operator.or_, [rank * i + j for j in range(i)]) + vout = reduce(operator.or_, range(world_size * i)) + tests.append( + ( + c10d.ReduceOp.BOR, + torch.tensor([vin], dtype=torch.int32), + torch.tensor([vout], dtype=torch.int32), + ), + ) + + # Generate tests for XOR. + # These emulate a larger world size per iteration by having every + # rank contribute multiple values that are pre-XOR'ed. + for i in range(1, 5): + vin = reduce(operator.xor, [rank * i + j for j in range(i)]) + vout = reduce(operator.xor, range(world_size * i)) + tests.append( + ( + c10d.ReduceOp.BXOR, + torch.tensor([vin], dtype=torch.int32), + torch.tensor([vout], dtype=torch.int32), + ), + ) + + return tests + + +def simple_coalesced_reduce_tests(rank, world_size): + return [ + ( + c10d.ReduceOp.SUM, + [torch.tensor([rank + 1]), torch.tensor([(rank + 1) ** 2])], + [ + torch.tensor([float(world_size * (world_size + 1) / 2)]), + torch.tensor( + [float(world_size * (world_size + 1) * (2 * world_size + 1) / 6)] + ), + ], + ), + ( + c10d.ReduceOp.PRODUCT, + [torch.tensor([rank + 1.0]), torch.tensor([rank + 2.0])], + [ + torch.tensor([float(math.factorial(world_size))]), + torch.tensor([float(math.factorial(world_size + 1))]), + ], + ), + ( + c10d.ReduceOp.MIN, + [torch.tensor([rank + x]) for x in [0.0, 1.0]], + [torch.tensor([0.0]), torch.tensor([1.0])], + ), + ( + c10d.ReduceOp.MAX, + [torch.tensor([rank + x]) for x in [1.0, 2.0]], + [torch.tensor([world_size]), torch.tensor([world_size + 1.0])], + ), + ] + + +def simple_multi_input_reduce_tests(rank, world_size): + return [ + ( + c10d.ReduceOp.SUM, + [torch.tensor([2 * rank + 0.0]), torch.tensor([2 * rank + 1.0])], + torch.tensor([float(world_size * (2 * world_size - 1))]), + ), + ( + c10d.ReduceOp.PRODUCT, + [torch.tensor([2 * rank + 1.0]), torch.tensor([2 * rank + 2.0])], + torch.tensor([float(math.factorial(2 * world_size))]), + ), + ( + c10d.ReduceOp.MIN, + [torch.tensor([2 * rank + 1.0]), torch.tensor([2 * rank + 2.0])], + torch.tensor([1.0]), + ), + ( + c10d.ReduceOp.MAX, + [torch.tensor([2 * rank + 1.0]), torch.tensor([2 * rank + 2.0])], + torch.tensor([2 * world_size]), + ), + ] + + +class RendezvousEnvTest(TestCase): + @retry_on_connect_failures + def test_logging_init(self): + os.environ["WORLD_SIZE"] = "1" + os.environ["MASTER_ADDR"] = "127.0.0.1" + os.environ["MASTER_PORT"] = str(common.find_free_port()) + os.environ["RANK"] = "0" + + previous_handlers = logging.root.handlers + + c10d.init_process_group(backend="gloo", init_method="env://") + + current_handlers = logging.root.handlers + self.assertEqual(len(previous_handlers), len(current_handlers)) + for current, previous in zip(current_handlers, previous_handlers): + self.assertEqual(current, previous) + + c10d.destroy_process_group() + + +class TimeoutTest(test_c10d_common.AbstractTimeoutTest, TestCase): + @requires_gloo() + @retry_on_connect_failures + def test_default_store_timeout_gloo(self): + self._test_default_store_timeout("gloo") + + +@requires_gloo() +@unittest.skipIf( + TEST_WITH_TSAN, + "TSAN is not fork-safe since we're forking in a multi-threaded environment", +) +class ProcessGroupGlooTest(MultiProcessTestCase): + 
def setUp(self): + super(ProcessGroupGlooTest, self).setUp() + + # For Windows platform, Python does not support fork, change it to spawn here. + if sys.platform == "win32": + self._spawn_processes() + else: + self._fork_processes() + + def opts(self, threads=2): + opts = c10d.ProcessGroupGloo._Options() + opts._timeout = 5.0 + opts._devices = [create_device(interface=LOOPBACK)] + opts._threads = threads + return opts + + def test_multi_device_constructor(self): + store = c10d.FileStore(self.file_name, self.world_size) + opts = c10d.ProcessGroupGloo._Options() + opts._timeout = 5.0 + opts._devices = [ + create_device(interface=LOOPBACK), + create_device(interface=LOOPBACK), + ] + pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, opts) + + # Execute 2x the number of operations to ensure we use every device. + for work in [pg.allreduce(torch.ones(i + 1)) for i in range(4)]: + work.wait() + + def test_empty_tensors(self): + store = c10d.FileStore(self.file_name, self.world_size) + pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts()) + + xs = [torch.FloatTensor([])] + pg.broadcast(xs).wait() + self.assertEqual(0, xs[0].numel()) + + def test_broadcast_checks(self): + store = c10d.FileStore(self.file_name, self.world_size) + pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts()) + + t1 = torch.zeros([1], dtype=torch.float32) + t2 = torch.zeros([1], dtype=torch.float64) + t3 = torch.zeros([2], dtype=torch.float32) + + with self.assertRaisesRegex(ValueError, "invalid root rank"): + opts = c10d.BroadcastOptions() + opts.rootRank = -1 + opts.rootTensor = 0 + pg.broadcast([t1], opts) + + with self.assertRaisesRegex(ValueError, "invalid root rank"): + opts = c10d.BroadcastOptions() + opts.rootRank = self.world_size + opts.rootTensor = 0 + pg.broadcast([t1], opts) + + with self.assertRaisesRegex(ValueError, "invalid root tensor"): + opts = c10d.BroadcastOptions() + opts.rootRank = self.rank + opts.rootTensor = -1 + pg.broadcast([t1], opts) + + with self.assertRaisesRegex(ValueError, "invalid root tensor"): + opts = c10d.BroadcastOptions() + opts.rootRank = self.rank + opts.rootTensor = 1 + pg.broadcast([t1], opts) + + with self.assertRaisesRegex(ValueError, "invalid root tensor"): + opts = c10d.BroadcastOptions() + opts.rootRank = self.rank + opts.rootTensor = 0 + pg.broadcast([], opts) + + with self.assertRaisesRegex(ValueError, "invalid tensor type"): + opts = c10d.BroadcastOptions() + opts.rootRank = self.rank + opts.rootTensor = 0 + pg.broadcast([t1, t2], opts) + + with self.assertRaisesRegex(ValueError, "invalid tensor size"): + opts = c10d.BroadcastOptions() + opts.rootRank = self.rank + opts.rootTensor = 0 + pg.broadcast([t1, t3], opts) + + def _test_broadcast_basics(self, fn): + store = c10d.FileStore(self.file_name, self.world_size) + pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts()) + + def broadcast(xs, rootRank, rootTensor): + opts = c10d.BroadcastOptions() + opts.rootRank = rootRank + opts.rootTensor = rootTensor + work = pg.broadcast(xs, opts) + work.wait() + + # Every rank is root once + for i in range(self.world_size): + # Run with 1 input tensor + x = fn(torch.tensor([self.rank])) + broadcast([x], i, 0) + # TODO(#38095): Replace assertEqualIgnoreType. 
See issue #38095 + self.assertEqualIgnoreType(torch.tensor([i]), x) + + # Run with 2 input tensors + num = 2 + for j in range(num): + xs = [ + fn(torch.tensor([self.rank * num + 0.0])), + fn(torch.tensor([self.rank * num + 1.0])), + ] + + broadcast(xs, i, j) + # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095 + self.assertEqualIgnoreType(torch.tensor([i * num + j]), xs[0]) + # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095 + self.assertEqualIgnoreType(torch.tensor([i * num + j]), xs[1]) + + # Test overloaded convenience function + x = torch.tensor([self.rank + 1.0]) + work = pg.broadcast(x, root=0) + work.wait() + self.assertEqual(torch.tensor([1.0]), x) + + def test_broadcast_basics(self): + self._test_broadcast_basics(lambda t: t.clone()) + + @skip_if_lt_x_gpu(2) + def test_broadcast_basics_cuda(self): + self._test_broadcast_basics(lambda t: t.clone().cuda()) + + def _test_broadcast_stress(self, inputs): + store = c10d.FileStore(self.file_name, self.world_size) + pg = c10d.ProcessGroupGloo( + store, self.rank, self.world_size, self.opts(threads=8) + ) + work_handles = [ + pg.broadcast(inputs[i], root=(i % self.world_size)) + for i in range(len(inputs)) + ] + for i, work_handle in enumerate(work_handles): + work_handle.wait() + self.assertEqual( + torch.tensor([(i * self.world_size) + (i % self.world_size)]), + inputs[i], + msg=("Mismatch in iteration %d" % i), + ) + + def test_broadcast_stress(self): + inputs = [torch.tensor([i * self.world_size + self.rank]) for i in range(1000)] + self._test_broadcast_stress(inputs) + + @skip_if_lt_x_gpu(2) + def test_broadcast_stress_cuda(self): + inputs = [ + torch.tensor([i * self.world_size + self.rank]).cuda() for i in range(1000) + ] + self._test_broadcast_stress(inputs) + + def test_allreduce_checks(self): + store = c10d.FileStore(self.file_name, self.world_size) + pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts()) + + t1 = torch.zeros([1], dtype=torch.float32) + t2 = torch.zeros([1], dtype=torch.float64) + t3 = torch.zeros([2], dtype=torch.float32) + + with self.assertRaisesRegex(ValueError, "requires non-empty tensor list"): + opts = c10d.AllreduceOptions() + pg.allreduce([], opts) + + with self.assertRaisesRegex(ValueError, "invalid tensor type"): + opts = c10d.AllreduceOptions() + pg.allreduce([t1, t2], opts) + + with self.assertRaisesRegex(ValueError, "invalid tensor size"): + opts = c10d.AllreduceOptions() + pg.allreduce([t1, t3], opts) + + def _test_allreduce_basics(self, fn): + store = c10d.FileStore(self.file_name, self.world_size) + pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts()) + + # Single input tests + tests = simple_reduce_tests(self.rank, self.world_size) + for (op, input, output) in tests: + opts = c10d.AllreduceOptions() + opts.reduceOp = op + tensor = fn(input) + work = pg.allreduce([tensor], opts) + work.wait() + # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095 + self.assertEqualIgnoreType(output, tensor) + + # Multi input tests + tests = simple_multi_input_reduce_tests(self.rank, self.world_size) + for (op, inputs, output) in tests: + opts = c10d.AllreduceOptions() + opts.reduceOp = op + tensors = [fn(input) for input in inputs] + work = pg.allreduce(tensors, opts) + work.wait() + for tensor in tensors: + # TODO(#38095): Replace assertEqualIgnoreType. 
See issue #38095 + self.assertEqualIgnoreType(output, tensor) + + # Test overloaded convenience function (defaults to using sum) + x = fn(torch.tensor([self.rank + 1.0])) + work = pg.allreduce(x) + work.wait() + self.assertEqual( + torch.tensor([float(self.world_size * (self.world_size + 1) / 2)]), x + ) + + def test_allreduce_basics(self): + self._test_allreduce_basics(lambda t: t.clone()) + + @skip_if_lt_x_gpu(2) + def test_allreduce_basics_cuda(self): + self._test_allreduce_basics(lambda t: t.clone().cuda()) + + def _test_allreduce_stress(self, inputs): + store = c10d.FileStore(self.file_name, self.world_size) + pg = c10d.ProcessGroupGloo( + store, self.rank, self.world_size, self.opts(threads=8) + ) + work_handles = [pg.allreduce(inputs[i]) for i in range(len(inputs))] + for i, work_handle in enumerate(work_handles): + work_handle.wait() + # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095 + self.assertEqualIgnoreType( + torch.tensor( + [ + (i * self.world_size) + + (self.world_size * (self.world_size - 1) / 2) + ] + ), + inputs[i], + msg=("Mismatch in iteration %d" % i), + ) + + def test_allreduce_stress(self): + inputs = [torch.tensor([i + self.rank]) for i in range(1000)] + self._test_allreduce_stress(inputs) + + @skip_if_lt_x_gpu(2) + def test_allreduce_stress_cuda(self): + inputs = [torch.tensor([i + self.rank]).cuda() for i in range(1000)] + self._test_allreduce_stress(inputs) + + def test_allreduce_coalesced_checks(self): + store = c10d.FileStore(self.file_name, self.world_size) + pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts()) + + t1 = torch.zeros(1, dtype=torch.float32) + t2 = torch.zeros(1, dtype=torch.float64) + t3 = torch.sparse_coo_tensor([[0]], [1], size=(1,)) + + with self.assertRaisesRegex(ValueError, "requires non-empty tensor list"): + opts = c10d.AllreduceCoalescedOptions() + pg.allreduce_coalesced([], opts) + + with self.assertRaisesRegex(ValueError, "tensors must all have the same type"): + opts = c10d.AllreduceCoalescedOptions() + pg.allreduce_coalesced([t1, t2], opts) + + with self.assertRaisesRegex(ValueError, "invalid tensor layout at index"): + opts = c10d.AllreduceCoalescedOptions() + pg.allreduce_coalesced([t1, t3], opts) + + with self.assertRaisesRegex(ValueError, "unsupported layout"): + opts = c10d.AllreduceCoalescedOptions() + pg.allreduce_coalesced([t3, t3.clone()], opts) + + @skip_if_lt_x_gpu(1) + def test_allreduce_coalesced_checks_cuda(self): + store = c10d.FileStore(self.file_name, self.world_size) + pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts()) + + t1 = torch.zeros(1, dtype=torch.float32) + + with self.assertRaisesRegex(ValueError, "unsupported device type"): + opts = c10d.AllreduceCoalescedOptions() + pg.allreduce_coalesced([t1.cuda(), t1.cuda()], opts) + + def _test_allreduce_coalesced_basics(self, fn): + store = c10d.FileStore(self.file_name, self.world_size) + pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts()) + + test_cases = simple_coalesced_reduce_tests(self.rank, self.world_size) + for op, inputs, outputs in test_cases: + opts = c10d.AllreduceCoalescedOptions() + opts.reduceOp = op + tensors = [fn(x) for x in inputs] + work = pg.allreduce_coalesced(tensors, opts) + work.wait() + for result_tensor, expected in zip(tensors, outputs): + # TODO(#38095): Replace assertEqualIgnoreType. 
See issue #38095 + self.assertEqualIgnoreType(result_tensor, expected) + + def test_allreduce_coalesced_basics(self): + self._test_allreduce_coalesced_basics(lambda t: t.clone()) + + def _test_allreduce_coalesced_stress(self, inputs): + store = c10d.FileStore(self.file_name, self.world_size) + pg = c10d.ProcessGroupGloo( + store, self.rank, self.world_size, self.opts(threads=8) + ) + work_handles = [pg.allreduce_coalesced(input) for input in inputs] + for i, work_handle in enumerate(work_handles): + work_handle.wait() + # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095 + self.assertEqualIgnoreType( + 2 + * [ + torch.tensor( + [ + (i * self.world_size) + + (self.world_size * (self.world_size - 1) / 2) + ] + ) + ], + inputs[i], + msg="Mismatch in interation {}".format(i), + ) + + def test_allreduce_coalesced_stress(self): + inputs = [2 * [torch.tensor([i + self.rank])] for i in range(1000)] + self._test_allreduce_coalesced_stress(inputs) + + def test_sparse_allreduce_checks(self): + store = c10d.FileStore(self.file_name, self.world_size) + pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts()) + + t1 = torch.zeros([1]) + t2 = torch.sparse_coo_tensor([[0]], [1], size=(2,)) + t3 = torch.sparse_coo_tensor([[0]], [1], size=(4,)) + + with self.assertRaisesRegex(ValueError, "requires non-empty tensor list"): + opts = c10d.AllreduceOptions() + pg.allreduce([], opts) + + with self.assertRaisesRegex(ValueError, "invalid tensor layout"): + opts = c10d.AllreduceOptions() + pg.allreduce([t1, t2], opts) + + with self.assertRaisesRegex(ValueError, "invalid tensor size"): + opts = c10d.AllreduceOptions() + pg.allreduce([t2, t3], opts) + + # Sparse allreduce only works with c10d.ReduceOp.SUM. + for op in [c10d.ReduceOp.PRODUCT, c10d.ReduceOp.MIN, c10d.ReduceOp.MAX]: + with self.assertRaisesRegex(ValueError, "unsupported reduction operation"): + opts = c10d.AllreduceOptions() + opts.reduceOp = op + pg.allreduce([t3], opts) + + def _test_sparse_allreduce_basics(self, fn): + store = c10d.FileStore(self.file_name, self.world_size) + pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts()) + + for num_inputs_per_rank in [1, 2]: + tests = simple_sparse_reduce_tests( + self.rank, self.world_size, num_inputs=num_inputs_per_rank + ) + for (inputs, outputs) in tests: + tensors = [fn(input) for input in inputs] + work = pg.allreduce(tensors) + work.wait() + self.assertEqual(tensors, outputs) + self.assertEqual(work.result(), outputs) + + def test_sparse_allreduce_basics(self): + self._test_sparse_allreduce_basics(lambda t: t) + + @skip_if_lt_x_gpu(2) + def test_sparse_allreduce_basics_cuda(self): + self._test_sparse_allreduce_basics(lambda t: t.clone().cuda()) + + def test_scatter_checks(self): + store = c10d.FileStore(self.file_name, self.world_size) + pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts()) + + t1 = torch.zeros([1], dtype=torch.float32) + t2 = torch.zeros([1], dtype=torch.float64) + t3 = torch.zeros([2], dtype=torch.float32) + + with self.assertRaisesRegex(ValueError, "invalid root rank"): + opts = c10d.ScatterOptions() + opts.rootRank = -1 + pg.scatter([t1], [], opts) + + with self.assertRaisesRegex(ValueError, "invalid root rank"): + opts = c10d.ScatterOptions() + opts.rootRank = self.world_size + pg.scatter([t1], [], opts) + + with self.assertRaisesRegex( + ValueError, "requires a single-element output tensor list" + ): + opts = c10d.ScatterOptions() + opts.rootRank = 0 + pg.scatter([], [], opts) + + with 
self.assertRaisesRegex( + ValueError, "requires a single-element output tensor list" + ): + opts = c10d.ScatterOptions() + opts.rootRank = 0 + pg.scatter([t1, t1], [], opts) + + with self.assertRaisesRegex(ValueError, "requires a single-element input list"): + opts = c10d.ScatterOptions() + opts.rootRank = self.rank + pg.scatter([t1], [], opts) + + with self.assertRaisesRegex(ValueError, "requires a single-element input list"): + opts = c10d.ScatterOptions() + opts.rootRank = self.rank + pg.scatter([t1], [[t1] * self.world_size, [t1] * self.world_size], opts) + + desired_list_size = self.world_size + incorrect_list_size = self.world_size - 1 + err_str = "Incorrect input list size {}. Input list size should be {}" + with self.assertRaisesRegex( + ValueError, err_str.format(incorrect_list_size, desired_list_size) + ): + opts = c10d.ScatterOptions() + opts.rootRank = self.rank + pg.scatter([t1], [[t1] * incorrect_list_size], opts) + + incorrect_list_size = self.world_size + 1 + with self.assertRaisesRegex( + ValueError, err_str.format(incorrect_list_size, desired_list_size) + ): + opts = c10d.ScatterOptions() + opts.rootRank = self.rank + pg.scatter([t1], [[t1] * incorrect_list_size], opts) + + with self.assertRaisesRegex(ValueError, "invalid tensor type"): + opts = c10d.ScatterOptions() + opts.rootRank = self.rank + pg.scatter([t1], [[t2] * self.world_size], opts) + + with self.assertRaisesRegex(ValueError, "invalid tensor size"): + opts = c10d.ScatterOptions() + opts.rootRank = self.rank + pg.scatter([t1], [[t3] * self.world_size], opts) + + with self.assertRaisesRegex(ValueError, "requires empty input on non-root"): + opts = c10d.ScatterOptions() + opts.rootRank = (self.rank + 1) % self.world_size + pg.scatter([t1], [[t1] * self.world_size], opts) + + def _test_scatter_basics(self, fn): + store = c10d.FileStore(self.file_name, self.world_size) + pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts()) + + # Preallocate tensors for input/output + input = [fn(torch.tensor([self.rank])) for _ in range(self.world_size)] + outputs = [fn(torch.tensor([-1])) for _ in range(self.world_size)] + + # Take turns being the scatter root and accumulate work items + work = [] + for i in range(self.world_size): + opts = c10d.ScatterOptions() + opts.rootRank = i + if i == self.rank: + work.append(pg.scatter([outputs[i]], [input], opts)) + else: + work.append(pg.scatter([outputs[i]], [], opts)) + + # Wait for work to complete + for i in range(self.world_size): + work[i].wait() + self.assertEqual(torch.tensor([i]), outputs[i]) + + def test_scatter_basics(self): + self._test_scatter_basics(lambda t: t.clone()) + + @skip_if_lt_x_gpu(2) + def test_scatter_basics_cuda(self): + self._test_scatter_basics(lambda t: t.clone().cuda()) + + def _test_scatter_stress(self, inputs, fn): + store = c10d.FileStore(self.file_name, self.world_size) + pg = c10d.ProcessGroupGloo( + store, self.rank, self.world_size, self.opts(threads=8) + ) + outputs = [ + [fn(torch.tensor([-1])) for _ in range(self.world_size)] + for _ in range(len(inputs)) + ] + work_handles = [] + for i in range(len(inputs)): + for root in range(self.world_size): + opts = c10d.ScatterOptions() + opts.rootRank = root + if root == self.rank: + work = pg.scatter( + [outputs[i][root]], [[fn(e) for e in inputs[i]]], opts + ) + else: + work = pg.scatter([outputs[i][root]], [], opts) + work_handles.append(work) + + for i, work_handle in enumerate(work_handles): + work_handle.wait() + iter = i // self.world_size + root = i % self.world_size + + 
self.assertEqual( + torch.tensor([iter + root]), + outputs[iter][root], + msg=("Mismatch in iteration %d for rank %d" % (iter, root)), + ) + + def test_scatter_stress(self): + inputs = [ + [torch.tensor([i + self.rank]) for _ in range(self.world_size)] + for i in range(1000) + ] + self._test_scatter_stress(inputs, lambda t: t.clone()) + + @unittest.skip("Test is flaky, see https://github.com/pytorch/pytorch/issues/15963") + @skip_if_lt_x_gpu(2) + def test_scatter_stress_cuda(self): + inputs = [ + [torch.tensor([i + self.rank]) for _ in range(self.world_size)] + for i in range(1000) + ] + self._test_scatter_stress(inputs, lambda t: t.clone().cuda()) + + def test_gather_checks(self): + store = c10d.FileStore(self.file_name, self.world_size) + pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts()) + + t1 = torch.zeros([1], dtype=torch.float32) + t2 = torch.zeros([1], dtype=torch.float64) + t3 = torch.zeros([2], dtype=torch.float32) + + with self.assertRaisesRegex(ValueError, "invalid root rank"): + opts = c10d.GatherOptions() + opts.rootRank = -1 + pg.gather([], [t1], opts) + + with self.assertRaisesRegex(ValueError, "invalid root rank"): + opts = c10d.GatherOptions() + opts.rootRank = self.world_size + pg.gather([], [t1], opts) + + with self.assertRaisesRegex( + ValueError, "requires a single-element input tensor list" + ): + opts = c10d.GatherOptions() + opts.rootRank = 0 + pg.gather([], [], opts) + + with self.assertRaisesRegex( + ValueError, "requires a single-element input tensor list" + ): + opts = c10d.GatherOptions() + opts.rootRank = 0 + pg.gather([], [t1, t1], opts) + + with self.assertRaisesRegex( + ValueError, "requires a single-element output list" + ): + opts = c10d.GatherOptions() + opts.rootRank = self.rank + pg.gather([], [t1], opts) + + with self.assertRaisesRegex( + ValueError, "requires a single-element output list" + ): + opts = c10d.GatherOptions() + opts.rootRank = self.rank + pg.gather([[t1] * self.world_size, [t1] * self.world_size], [t1], opts) + + desired_list_size = self.world_size + incorrect_list_size = self.world_size - 1 + err_str = "Incorrect output list size {}. 
Output list size should be {}" + with self.assertRaisesRegex( + ValueError, err_str.format(incorrect_list_size, desired_list_size) + ): + opts = c10d.GatherOptions() + opts.rootRank = self.rank + pg.gather([[t1] * incorrect_list_size], [t1], opts) + + incorrect_list_size = self.world_size + 1 + with self.assertRaisesRegex( + ValueError, err_str.format(incorrect_list_size, desired_list_size) + ): + opts = c10d.GatherOptions() + opts.rootRank = self.rank + pg.gather([[t1] * incorrect_list_size], [t1], opts) + + with self.assertRaisesRegex(ValueError, "invalid tensor type"): + opts = c10d.GatherOptions() + opts.rootRank = self.rank + pg.gather([[t2] * self.world_size], [t1], opts) + + with self.assertRaisesRegex(ValueError, "invalid tensor size"): + opts = c10d.GatherOptions() + opts.rootRank = self.rank + pg.gather([[t3] * self.world_size], [t1], opts) + + with self.assertRaisesRegex(ValueError, "requires empty output on non-root"): + opts = c10d.GatherOptions() + opts.rootRank = (self.rank + 1) % self.world_size + pg.gather([[t1] * self.world_size], [t1], opts) + + def _test_gather_basics(self, fn): + store = c10d.FileStore(self.file_name, self.world_size) + pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts()) + + # Preallocate tensors for input/output + input = [fn(torch.tensor([self.rank]))] + outputs = [fn(torch.tensor([-1])) for _ in range(self.world_size)] + + # Take turns being the gather root and accumulate work items + work = [] + for i in range(self.world_size): + opts = c10d.GatherOptions() + opts.rootRank = i + if i == self.rank: + work.append(pg.gather([outputs], input, opts)) + else: + work.append(pg.gather([], input, opts)) + + # Wait for work to complete + expected = [torch.tensor([rank]) for rank in range(self.world_size)] + for i in range(self.world_size): + work[i].wait() + if i == self.rank: + self.assertEqual(expected, outputs) + + def test_gather_basics(self): + self._test_gather_basics(lambda t: t.clone()) + + @skip_if_lt_x_gpu(2) + def test_gather_basics_cuda(self): + self._test_gather_basics(lambda t: t.clone().cuda()) + + def _test_gather_stress(self, inputs, fn): + store = c10d.FileStore(self.file_name, self.world_size) + pg = c10d.ProcessGroupGloo( + store, self.rank, self.world_size, self.opts(threads=8) + ) + work_handles = [] + outputs = [ + [[fn(torch.tensor([-1])) for _ in range(self.world_size)]] + for _ in range(len(inputs)) + ] + expected_outputs = [ + [[torch.tensor([i + j]) for j in range(self.world_size)]] + for i in range(len(inputs)) + ] + for i in range(len(inputs)): + for root in range(self.world_size): + opts = c10d.GatherOptions() + opts.rootRank = root + if root == self.rank: + work = pg.gather(outputs[i], [fn(inputs[i])], opts) + else: + work = pg.gather([], [fn(inputs[i])], opts) + work_handles.append(work) + + for i, work_handle in enumerate(work_handles): + work_handle.wait() + iter = i // self.world_size + root = i % self.world_size + if root == self.rank: + self.assertEqual( + expected_outputs[iter], + outputs[iter], + msg=("Mismatch in iteration %d for root %d" % (iter, root)), + ) + + def test_gather_stress(self): + inputs = [torch.tensor([i + self.rank]) for i in range(1000)] + self._test_gather_stress(inputs, lambda t: t.clone()) + + @skip_if_lt_x_gpu(2) + def test_gather_stress_cuda(self): + inputs = [torch.tensor([i + self.rank]).cuda() for i in range(1000)] + self._test_gather_stress(inputs, lambda t: t.clone().cuda()) + + def test_allgather_checks(self): + store = c10d.FileStore(self.file_name, self.world_size) 
+ pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts()) + + t1 = torch.zeros([1], dtype=torch.float32) + t2 = torch.zeros([1], dtype=torch.float64) + t3 = torch.zeros([2], dtype=torch.float32) + + with self.assertRaisesRegex(ValueError, "requires non-empty input tensor list"): + pg.allgather([], []) + + with self.assertRaisesRegex( + ValueError, "requires input/output tensor lists to have the same length" + ): + pg.allgather([], [t1]) + + with self.assertRaisesRegex( + ValueError, "requires input/output tensor lists to have the same length" + ): + pg.allgather([[t1] * self.world_size, [t1] * self.world_size], [t1]) + + with self.assertRaisesRegex(ValueError, "invalid output tensor list"): + pg.allgather([[t1] * (self.world_size - 1)], [t1]) + + with self.assertRaisesRegex(ValueError, "invalid output tensor list"): + pg.allgather([[t1] * (self.world_size + 1)], [t1]) + + with self.assertRaisesRegex(ValueError, "invalid tensor type"): + pg.allgather( + [[t1, t1] * (self.world_size), [t1, t1] * (self.world_size)], [t1, t2] + ) + + with self.assertRaisesRegex(ValueError, "invalid tensor size"): + pg.allgather( + [[t1, t1] * (self.world_size), [t1, t1] * (self.world_size)], [t1, t3] + ) + + with self.assertRaisesRegex(ValueError, "invalid tensor type"): + pg.allgather([([t1, t2] * (self.world_size))[: self.world_size]], [t1]) + + with self.assertRaisesRegex(ValueError, "invalid tensor size"): + pg.allgather([([t1, t3] * (self.world_size))[: self.world_size]], [t1]) + + def _test_allgather_basics(self, fn): + store = c10d.FileStore(self.file_name, self.world_size) + pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts()) + + # Run with N input tensor per rank + for n in [1, 2, 3]: + input = [fn(torch.tensor([n * self.rank + i])) for i in range(n)] + output = [ + [fn(torch.tensor([-1])) for _ in range(n * self.world_size)] + for _ in range(n) + ] + expected_output = [ + [torch.tensor([i]) for i in range(n * self.world_size)] + for _ in range(n) + ] + work = pg.allgather(output, input) + work.wait() + self.assertEqual(expected_output, output) + + def test_allgather_basics(self): + self._test_allgather_basics(lambda t: t.clone()) + + @skip_if_lt_x_gpu(2) + def test_allgather_basics_cuda(self): + self._test_allgather_basics(lambda t: t.clone().cuda()) + + def _test_allgather_stress(self, inputs, fn): + store = c10d.FileStore(self.file_name, self.world_size) + pg = c10d.ProcessGroupGloo( + store, self.rank, self.world_size, self.opts(threads=8) + ) + work_handles = [] + outputs = [ + [[fn(torch.tensor([-1])) for _ in range(self.world_size)]] + for _ in range(len(inputs)) + ] + expected_outputs = [ + [[torch.tensor([i + j]) for j in range(self.world_size)]] + for i in range(len(inputs)) + ] + for i in range(len(inputs)): + work = pg.allgather(outputs[i], [fn(inputs[i])]) + work_handles.append(work) + + for i, work_handle in enumerate(work_handles): + work_handle.wait() + self.assertEqual( + expected_outputs[i], + outputs[i], + msg=("Mismatch in iteration %d" % i), + ) + + def test_allgather_stress(self): + inputs = [torch.tensor([i + self.rank]) for i in range(1000)] + self._test_allgather_stress(inputs, lambda t: t.clone()) + + @skip_if_lt_x_gpu(2) + def test_allgather_stress_cuda(self): + inputs = [torch.tensor([i + self.rank]).cuda() for i in range(1000)] + self._test_allgather_stress(inputs, lambda t: t.clone().cuda()) + + def test_allgather_coalesced_checks(self): + store = c10d.FileStore(self.file_name, self.world_size) + pg = c10d.ProcessGroupGloo(store, 
self.rank, self.world_size, self.opts()) + dummy_input = [torch.zeros([1], dtype=torch.float32)] + dummy_output_lists = [ + [torch.zeros([1], dtype=torch.float32)] for _ in range(self.world_size) + ] + + # One of output tensors does not match input list. + dummy_output_lists[0] = [torch.zeros([0], dtype=torch.float32)] + with self.assertRaisesRegex( + ValueError, "invalid size of output tensor at index 0" + ): + c10d.all_gather_coalesced(dummy_output_lists, dummy_input, pg) + + # One of output tensors does not match input list. + dummy_output_lists[0] = [torch.zeros([1], dtype=torch.float64)] + with self.assertRaisesRegex(ValueError, "invalid tensor type at index 0"): + c10d.all_gather_coalesced(dummy_output_lists, dummy_input, pg) + + # Output lists have too many elements + dummy_output_lists = [ + [torch.zeros([1], dtype=torch.float32)] for _ in range(self.world_size + 1) + ] + with self.assertRaisesRegex( + ValueError, "output lists should be equal to world size" + ): + c10d.all_gather_coalesced(dummy_output_lists, dummy_input, pg) + + # Output is not a list of lists. + dummy_output_lists = [torch.zeros([0], dtype=torch.float32)] + with self.assertRaisesRegex( + RuntimeError, "Invalid function argument.*output_tensor_lists" + ): + c10d.all_gather_coalesced(dummy_output_lists, dummy_input, pg) + + def test_reduce_checks(self): + store = c10d.FileStore(self.file_name, self.world_size) + pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts()) + + t1 = torch.zeros([1], dtype=torch.float32) + + with self.assertRaisesRegex(ValueError, "invalid root rank"): + opts = c10d.ReduceOptions() + opts.rootRank = -1 + opts.rootTensor = 0 + pg.reduce([t1], opts) + + with self.assertRaisesRegex(ValueError, "invalid root rank"): + opts = c10d.ReduceOptions() + opts.rootRank = self.world_size + opts.rootTensor = 0 + pg.reduce([t1], opts) + + with self.assertRaisesRegex(ValueError, "invalid root tensor"): + opts = c10d.ReduceOptions() + opts.rootRank = self.rank + opts.rootTensor = 1 + pg.reduce([t1], opts) + + with self.assertRaisesRegex( + ValueError, "requires a single-element tensor list" + ): + opts = c10d.ReduceOptions() + opts.rootRank = self.rank + opts.rootTensor = 0 + pg.reduce([t1, t1], opts) + + def _test_reduce_basics(self, fn): + store = c10d.FileStore(self.file_name, self.world_size) + pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts()) + for (op, input, output) in simple_reduce_tests(self.rank, self.world_size): + for root in range(self.world_size): + opts = c10d.ReduceOptions() + opts.reduceOp = op + opts.rootRank = root + tmp = fn(input) + work = pg.reduce([tmp], opts) + work.wait() + if root == self.rank: + # TODO(#38095): Replace assertEqualIgnoreType. 
See issue #38095 + self.assertEqualIgnoreType(output, tmp) + + def test_reduce_basics(self): + self._test_reduce_basics(lambda t: t.clone()) + + @skip_if_lt_x_gpu(2) + def test_reduce_basics_cuda(self): + self._test_reduce_basics(lambda t: t.clone().cuda()) + + def _test_reduce_stress(self, inputs): + store = c10d.FileStore(self.file_name, self.world_size) + pg = c10d.ProcessGroupGloo( + store, self.rank, self.world_size, self.opts(threads=8) + ) + work_handles = [] + outputs = [] + for i in range(len(inputs)): + for root in range(self.world_size): + opts = c10d.ReduceOptions() + opts.rootRank = root + tmp = inputs[i].clone() + outputs.append(tmp) + work = pg.reduce([tmp], opts) + work_handles.append(work) + + for i, work_handle in enumerate(work_handles): + work_handle.wait() + iter = i // self.world_size + root = i % self.world_size + if root == self.rank: + # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095 + self.assertEqualIgnoreType( + torch.tensor( + [ + (iter * self.world_size) + + (self.world_size * (self.world_size - 1) / 2) + ] + ), + outputs[i], + msg=("Mismatch in iteration %d with root rank %d" % (iter, root)), + ) + + def test_reduce_stress(self): + inputs = [torch.tensor([i + self.rank]) for i in range(1000)] + self._test_reduce_stress(inputs) + + @skip_if_lt_x_gpu(2) + def test_reduce_stress_cuda(self): + inputs = [torch.tensor([i + self.rank]).cuda() for i in range(1000)] + self._test_reduce_stress(inputs) + + def test_send_recv_all_to_all(self): + store = c10d.FileStore(self.file_name, self.world_size) + pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts()) + + # Preallocate tensors for input/output + inputs = [torch.tensor([self.rank]) for _ in range(self.world_size)] + outputs = [torch.tensor([-1]) for _ in range(self.world_size)] + + # Issue sends + send_work = [] + for i in range(self.world_size): + if i == self.rank: + continue + send_work.append(pg.send([inputs[i]], i, 0)) + + # Issue recvs + recv_work = [] + for i in range(self.world_size): + if i == self.rank: + continue + recv_work.append(pg.recv([outputs[i]], i, 0)) + + # Wait for sends to complete + for work in send_work: + work.wait() + self.assertTrue(work.is_completed()) + + # Wait for recvs to complete + for work in recv_work: + work.wait() + self.assertTrue(work.is_completed()) + + # Test that every output other than our own contains the respective rank + for i in range(self.world_size): + if i == self.rank: + continue + self.assertEqual(torch.tensor([i]), outputs[i]) + + def test_barrier_implies_wait(self): + store = c10d.FileStore(self.file_name, self.world_size) + pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts()) + + # Kick off allreduce operations + size = (100, 100) + num = 16 + tensors = [torch.full(size, float(i)) for i in range(num)] + for tensor in tensors: + # Note: leak the returned work handle + pg.allreduce(tensor) + + # Barrier should ensure all previous work has completed + pg.barrier().wait() + + for i, tensor in enumerate(tensors): + self.assertEqual(torch.full(size, float(i * self.world_size)), tensor) + + @skip_if_win32() + def test_round_robin(self): + num_process_groups = 2 + store = c10d.FileStore(self.file_name, self.world_size) + pg = c10d._round_robin_process_groups( + [ + c10d.ProcessGroupGloo( + c10d.PrefixStore(str(i), store), self.rank, self.world_size, self.opts() + ) + for i in range(num_process_groups) + ] + ) + + # Run a few collectives so that we have called each process group + for _ in range(num_process_groups + 
1): + tensor = torch.full([100, 100], float(self.rank)) + pg.broadcast(tensor, root=0).wait() + self.assertEqual(torch.full([100, 100], 0.0), tensor) + + @skip_if_win32() + def test_round_robin_create_destroy(self): + store = c10d.FileStore(self.file_name, self.world_size) + + def create(num, prefix): + return c10d._round_robin_process_groups( + [ + c10d.ProcessGroupGloo( + c10d.PrefixStore("%s/%d" % (prefix, i), store), + self.rank, + self.world_size, + self.opts() + ) + for i in range(num) + ] + ) + + # Run create/use/destroy twice + for i in range(2): + num_process_groups = 2 + pg = create(num=num_process_groups, prefix=i) + for _ in range(3): + tensor = torch.ones([10, 10]) + pg.allreduce(tensor).wait() + self.assertEqual(torch.full([10, 10], float(self.world_size)), tensor) + del pg + + +@unittest.skipIf( + TEST_WITH_TSAN, + "TSAN is not fork-safe since we're forking in a multi-threaded environment", +) +class DistributedDataParallelTest(test_c10d_common.AbstractDistributedDataParallelTest, MultiProcessTestCase): + + def setUp(self): + super(DistributedDataParallelTest, self).setUp() + if sys.platform == "win32": + self._spawn_processes() + else: + self._fork_processes() + + def _test_gloo_backend( + self, devices, device_ids, multi_device=False, gradient_as_bucket_view=False + ): + store = c10d.FileStore(self.file_name, self.world_size) + options = c10d.ProcessGroupGloo._Options() + options._devices = [create_device(interface=LOOPBACK)] + process_group = c10d.ProcessGroupGloo( + store, self.rank, self.world_size, options + ) + self._test_ddp_with_process_group( + process_group, devices, device_ids, multi_device, gradient_as_bucket_view + ) + + @requires_gloo() + def test_gloo_backend_cpu_module(self): + self._test_gloo_backend([torch.device("cpu")], None) + + @requires_gloo() + def test_gloo_backend_cpu_module_grad_is_view(self): + self._test_gloo_backend([torch.device("cpu")], None, gradient_as_bucket_view=True) + + @requires_gloo() + @skip_if_lt_x_gpu(2) + def test_gloo_backend_1gpu_module_device_ids_integer_list(self): + int_devices = gpus_for_rank(self.world_size)[self.rank][:1] + devices = [torch.device("cuda:" + str(i)) for i in int_devices] + self._test_gloo_backend(devices, int_devices) + + @requires_gloo() + @skip_if_lt_x_gpu(2) + def test_gloo_backend_1gpu_module_device_ids_torch_device_list(self): + int_devices = gpus_for_rank(self.world_size)[self.rank][:1] + devices = [torch.device("cuda:" + str(i)) for i in int_devices] + self._test_gloo_backend(devices, devices) + + @requires_gloo() + @skip_if_lt_x_gpu(4) + def test_gloo_backend_2gpu_module(self): + int_devices = gpus_for_rank(self.world_size)[self.rank][:2] + devices = [torch.device("cuda:" + str(i)) for i in int_devices] + self._test_gloo_backend(devices, None, multi_device=True) + + @requires_gloo() + @skip_if_lt_x_gpu(8) + def test_gloo_backend_4gpu_module(self): + int_devices = gpus_for_rank(self.world_size)[self.rank][:4] + devices = [torch.device("cuda:" + str(i)) for i in int_devices] + self._test_gloo_backend(devices, None, multi_device=True) + + def _test_global_local_unused_params_grad(self, gradient_as_bucket_view=False): + """ + By simulating a multi-task training, this test is to make sure: + 1) DDP does not touch the grad of globally unused parameters. + 2) DDP does update the grad of locally unused parameters. 
+ """ + + class GlobalLocalUnusedParamModule(nn.Module): + def __init__(self): + super(GlobalLocalUnusedParamModule, self).__init__() + self.t0 = Task() + self.t1 = Task() + self.task_unused = Task() + + def task_parameters(self): + return (self.t0.p, self.t1.p, self.task_unused.p) + + def forward(self, x, rank): + return self.t0(x) if rank == 0 else self.t1(x) + + def run_and_verify_grad(model): + # Run forward + output = model(8, self.rank) + + # The grads of all parameters should be None at this point. + t0_p, t1_p, task_unused_p = model.module.task_parameters() + self.assertIsNone(t0_p.grad) + self.assertIsNone(t1_p.grad) + self.assertIsNone(task_unused_p.grad) + + # Run backward + output.mean().backward() + + # Now locally unused parameter should have grad updated on all ranks. + # However the globally unused parameter should still have None grad. + self.assertIsNotNone(t0_p.grad) + self.assertIsNotNone(t1_p.grad) + self.assertIsNone(task_unused_p.grad) + + store = c10d.FileStore(self.file_name, self.world_size) + process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size) + + # Test on CPU + cpu_model = DistributedDataParallel( + GlobalLocalUnusedParamModule().cpu(), + process_group=process_group, + find_unused_parameters=True, + gradient_as_bucket_view=gradient_as_bucket_view, + ) + run_and_verify_grad(cpu_model) + + # Test on GPU + device_id = gpus_for_rank(self.world_size)[self.rank][0] + gpu_model = DistributedDataParallel( + GlobalLocalUnusedParamModule().to(device_id), + device_ids=[device_id], + process_group=process_group, + find_unused_parameters=True, + gradient_as_bucket_view=gradient_as_bucket_view, + ) + run_and_verify_grad(gpu_model) + + @requires_gloo() + @skip_if_lt_x_gpu(2) + def test_global_local_unused_params_grad(self): + self._test_global_local_unused_params_grad() + + @requires_gloo() + @skip_if_lt_x_gpu(2) + def test_global_local_unused_params_grad_with_grad_is_view(self): + self._test_global_local_unused_params_grad(gradient_as_bucket_view=True) + + @requires_gloo() + @skip_if_lt_x_gpu(2) + def test_find_unused_parameters_when_unused_parameters_empty(self): + """ + An empty unused_parameters array does not imply find_unused_parameters = + false. This test makes sure that DDP allreduces unused parameters + accordingly where the forward pass in some process uses all parameters. + This unit test creates a module that uses all parameters in rank = 0, and + has unused parameters in other ranks. + """ + + class FindUnusedParamModule(nn.Module): + def __init__(self): + super(FindUnusedParamModule, self).__init__() + self.t0 = Task() + self.t1 = Task() + + def task_parameters(self): + return (self.t0.p, self.t1.p) + + def forward(self, x, rank): + return self.t1(self.t0(x)) if rank == 0 else self.t1(x) + + def run_and_verify_grad(model): + # Run forward + output = model(8, self.rank) + + # The grads of all parameters should be None at this point. + [self.assertIsNone(t_p.grad) for t_p in model.module.task_parameters()] + + # Run backward + output.mean().backward() + + # Now locally unused parameter should have grad updated on all ranks. 
+ [self.assertIsNotNone(t_p.grad) for t_p in model.module.task_parameters()] + + store = c10d.FileStore(self.file_name, self.world_size) + process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size) + + # Test on CPU + cpu_model = DistributedDataParallel( + FindUnusedParamModule().cpu(), + process_group=process_group, + find_unused_parameters=True, + ) + run_and_verify_grad(cpu_model) + + # Test on GPU + device_id = gpus_for_rank(self.world_size)[self.rank][0] + gpu_model = DistributedDataParallel( + FindUnusedParamModule().to(device_id), + device_ids=[device_id], + process_group=process_group, + find_unused_parameters=True, + ) + run_and_verify_grad(gpu_model) + + @requires_gloo() + def test_ignored_output(self): + """ + Test that the output of a model can be ignored and that there is no + implicit requirement that `backward` gets called. + """ + store = c10d.FileStore(self.file_name, self.world_size) + process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size) + + class IgnoredOutput(nn.Module): + def __init__(self): + super(IgnoredOutput, self).__init__() + self.fc1 = nn.Linear(2, 10, bias=False) + self.fc2 = nn.Linear(10, 4, bias=False) + self.relu = nn.ReLU() + + def forward(self, x): + x = self.relu(self.fc1(x)) + x = self.relu(self.fc2(x)) + return F.softmax(x, dim=1) + + model = DistributedDataParallel( + IgnoredOutput().float(), + process_group=process_group, + ) + + batch_size = 4 + criterion = nn.CrossEntropyLoss() + input = torch.rand([batch_size, 2], dtype=torch.float) + target = torch.LongTensor([random.randrange(4) for _ in range(batch_size)]) + + # Run a few iterations where we ignore the output. + for _ in range(4): + output = model(input) + del output + + # Run a few iterations where we use the output. + for _ in range(4): + output = model(input) + loss = criterion(output, target) + loss.backward() + + @requires_gloo() + def test_ignored_output_with_unused_parameters(self): + """ + Test that the output of a model can be ignored and that there is no + implicit requirement that `backward` gets called, if not all model + parameters participated in computing the model output. + """ + store = c10d.FileStore(self.file_name, self.world_size) + process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size) + + class IgnoredOutputWithUnusedParameters(nn.Module): + def __init__(self): + super(IgnoredOutputWithUnusedParameters, self).__init__() + self.fc1 = nn.Linear(2, 10, bias=False) + self.fc2 = nn.Linear(10, 4, bias=False) + self.fc3 = nn.Linear(4, 4, bias=False) + self.relu = nn.ReLU() + + def forward(self, x): + x = self.relu(self.fc1(x)) + x = self.relu(self.fc2(x)) + return F.softmax(x, dim=1) + + model = DistributedDataParallel( + IgnoredOutputWithUnusedParameters().float(), + process_group=process_group, + find_unused_parameters=True, + ) + + batch_size = 4 + criterion = nn.CrossEntropyLoss() + input = torch.rand([batch_size, 2], dtype=torch.float) + target = torch.LongTensor([random.randrange(4) for _ in range(batch_size)]) + + # Run a few iterations where we ignore the output. + for _ in range(4): + output = model(input) + del output + + # Run a few iterations where we use the output. 
+ for _ in range(4): + output = model(input) + loss = criterion(output, target) + loss.backward() + + def _run_and_verify_sparse_gradients(self, vanilla_model, ddp_model): + mult = 2 + batch_size = mult * self.world_size + criterion = nn.CrossEntropyLoss() + input = torch.randint(0, 10, [batch_size, 2]) + target = torch.randint(0, 10, [batch_size]) + + # Run with entire batch against single process version + criterion(vanilla_model(input), target).backward() + + # Run with partial batch against multi process version + partial_input = input.split(mult)[self.rank] + partial_target = target.split(mult)[self.rank] + criterion(ddp_model(partial_input), partial_target).backward() + + # Check that the gradients are sparse and identical + vanilla_parameter = next(vanilla_model.parameters()) + ddp_parameter = next(ddp_model.parameters()) + self.assertEqual(vanilla_parameter.grad, ddp_parameter.grad) + + @requires_gloo() + @skip_if_lt_x_gpu(2) + def test_save_load_checkpoint(self): + dist.init_process_group( + "gloo", + init_method=f"file://{self.file_name}", + world_size=self.world_size, + rank=self.rank, + ) + + class TestModel(nn.Module): + def __init__(self): + super(TestModel, self).__init__() + self.fc1 = nn.Linear(2, 10, bias=False) + self.fc2 = nn.Linear(10, 4, bias=False) + self.relu = nn.ReLU() + + def forward(self, x): + x = self.relu(self.fc1(x)) + x = self.relu(self.fc2(x)) + return F.softmax(x, dim=1) + + def train_loop(model, optimizer, iterations): + for _ in range(iterations): + optimizer.zero_grad() + output = model(input) + loss = criterion(output, target) + loss.backward() + optimizer.step() + + device_id = gpus_for_rank(self.world_size)[self.rank][0] + + model_withload = TestModel().float().to(device_id) + model_withoutload = TestModel().float().to(device_id) + + ddp_withload = DistributedDataParallel( + model_withload, + device_ids=[device_id], + ) + ddp_withoutload = DistributedDataParallel( + model_withoutload, + device_ids=[device_id], + ) + + # ensure that all the three models start with the same set of parameters. 
By default they are randomized on construction + for p in ddp_withload.parameters(): + with torch.no_grad(): + p.zero_() + for p in model_withload.parameters(): + with torch.no_grad(): + p.zero_() + for p in ddp_withoutload.parameters(): + with torch.no_grad(): + p.zero_() + + batch_size = 4 + criterion = nn.CrossEntropyLoss() + + optimizer_withload = torch.optim.SGD(ddp_withload.parameters(), lr=0.001) + optimizer_non_ddp_withload = torch.optim.SGD(model_withload.parameters(), lr=0.001) + optimizer_withoutload = torch.optim.SGD(ddp_withoutload.parameters(), lr=0.001) + + input = torch.rand([batch_size, 2], dtype=torch.float).to(device_id) + target = torch.LongTensor([random.randrange(4) for _ in range(batch_size)]).to( + device_id + ) + + # run the model for 6 iterations, with a checkpoint in the middle + train_loop(ddp_withload, optimizer_withload, 3) + + # zero out parameters of both DDP and non-DDP models and reload them from the DDP state dict + checkpoint_path = tempfile.gettempdir() + "/model.checkpoint" + if self.rank == 0: + torch.save(ddp_withload.state_dict(), checkpoint_path) + + dist.barrier() + map_location = {"cuda:%d" % 0: "cuda:%d" % self.rank} + ddp_state_dict = torch.load(checkpoint_path, map_location=map_location) + + for model in [ddp_withload, model_withload]: + for p in ddp_withload.parameters(): + with torch.no_grad(): + p.zero_() + ddp_withload.load_state_dict(ddp_state_dict) + # the non-DDP model needs to first remove the prefix of "module." from the DDP state dict + torch.nn.modules.utils.consume_prefix_in_state_dict_if_present(ddp_state_dict, "module.") + model_withload.load_state_dict(ddp_state_dict) + + train_loop(ddp_withload, optimizer_withload, 3) + train_loop(model_withload, optimizer_non_ddp_withload, 3) + + # re-run the model with the same inputs for 6 iterations with no checkpoint + train_loop(ddp_withoutload, optimizer_withoutload, 6) + + for p_withload, p_withoutload, p_non_ddp_withload in zip( + ddp_withload.parameters(), ddp_withoutload.parameters(), model_withload.parameters() + ): + self.assertEqual(p_withload, p_withoutload) + self.assertEqual(p_non_ddp_withload, p_withoutload) + + def _test_sparse_gradients(self, gradient_as_bucket_view=False): + store = c10d.FileStore(self.file_name, self.world_size) + process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size) + + # Ensure initialized weights and inputs are identical across processes + torch.manual_seed(1337) + + vanilla_model = SparseGradientModule() + ddp_model = DistributedDataParallel( + copy.deepcopy(vanilla_model), + process_group=process_group, + gradient_as_bucket_view=gradient_as_bucket_view, + ) + + self._run_and_verify_sparse_gradients(vanilla_model, ddp_model) + + @requires_gloo() + def test_sparse_gradients(self): + self._test_sparse_gradients() + + @requires_gloo() + def test_sparse_gradients_grad_is_view(self): + self._test_sparse_gradients(gradient_as_bucket_view=True) + + @requires_gloo() + def test_ddp_comm_hook_future_passing_cpu(self): + """ + This unit test verifies whether the Future object is passed properly. + The callback function creates a Future object and sets a value to it. 
+        """
+        store = c10d.FileStore(self.file_name, self.world_size)
+        process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size)
+
+        # Test on CPU
+        cpu_model = DistributedDataParallel(
+            ModuleForDdpCommHook().cpu(), process_group=process_group
+        )
+
+        # Register DDP Communication Hook
+        cpu_model.register_comm_hook(None, self._simple_hook)
+
+        # Check whether the grads are equal to what the `then` callback returns.
+        # Without the comm_hook, the result would be 0.25 * torch.ones(2, 2).
+        self._run_and_verify_hook(cpu_model, 8, 2 * torch.ones(2, 2))
+
+    def _gpu_model_with_ddp_comm_hook(
+        self, process_group, hook=None, gradient_as_bucket_view=False, state=None
+    ):
+        device_id = gpus_for_rank(self.world_size)[self.rank][0]
+        gpu_model = DistributedDataParallel(
+            ModuleForDdpCommHook().to(device_id),
+            device_ids=[device_id],
+            process_group=process_group,
+            gradient_as_bucket_view=gradient_as_bucket_view,
+        )
+
+        # Register a DDP communication hook, if one was provided.
+        if hook is not None:
+            gpu_model.register_comm_hook(state, hook)
+
+        return gpu_model
+
+    @requires_gloo()
+    @skip_if_lt_x_gpu(2)
+    def test_ddp_comm_hook_future_passing_gpu_gloo(self):
+        """
+        This unit test verifies whether the Future object is passed properly using the gloo backend.
+        The hook callback function creates a Future object and sets a value to it.
+        """
+        store = c10d.FileStore(self.file_name, self.world_size)
+        process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size)
+
+        # Get GPU model with simple_hook registered.
+        gpu_model = self._gpu_model_with_ddp_comm_hook(process_group, self._simple_hook)
+
+        # Check whether the grads are equal to what simple_hook's `then` callback returns.
+        # Without the comm_hook, the result would be 0.25 * torch.ones(2, 2).
+        self._run_and_verify_hook(gpu_model, 8, 2 * torch.ones(2, 2))
+
+    @requires_gloo()
+    def test_ddp_invalid_comm_hook_init(self):
+        """
+        This unit test makes sure that register_comm_hook properly checks the format
+        of the hook defined by the user. The Python hook must be callable. This test
+        also checks whether the bucket annotation is validated properly, if defined.
+        """
+        store = c10d.FileStore(self.file_name, self.world_size)
+        process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size)
+
+        model = DistributedDataParallel(
+            ModuleForDdpCommHook(), process_group=process_group
+        )
+
+        with self.assertRaisesRegex(TypeError, "Communication hook must be callable."):
+            model.register_comm_hook(state=None, hook=1)
+
+        with self.assertRaisesRegex(
+            ValueError, "bucket annotation should be dist.GradBucket."
+        ):
+            def comm_hook(state: object, bucket: int) -> torch.futures.Future:
+                return torch.futures.Future()
+
+            model.register_comm_hook(state=None, hook=comm_hook)
+
+    @requires_gloo()
+    def test_ddp_invalid_comm_hook_return_type(self):
+        """
+        This test checks whether the return annotation is validated properly, if defined.
+        It also checks whether an internal error is thrown if the return type is incorrect
+        and the user hasn't specified any return type annotation.
+ """ + store = c10d.FileStore(self.file_name, self.world_size) + process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size) + + model = DistributedDataParallel( + ModuleForDdpCommHook(), process_group=process_group + ) + + with self.assertRaisesRegex( + ValueError, + "Communication hook: return annotation should be torch.futures.Future or torch._C.Future.", + ): + def comm_hook(state: object, bucket: dist.GradBucket) -> int: + return torch.futures.Future() + + model.register_comm_hook(state=None, hook=comm_hook) + + with self.assertRaisesRegex( + RuntimeError, + "callback must return a torch.futures.Future or torch._C.Future object, but got", + ): + def comm_hook(state: object, bucket: dist.GradBucket): + return 1 + + model.register_comm_hook(state=None, hook=comm_hook) + + # Run forward + output = model(8, self.rank) + + # Run backward + output.mean().backward() + + @requires_gloo() + def test_ddp_comm_hook_register_just_once(self): + """ + DDP communication hook can only be registered once. This test validates whether + the error is thrown properly when register_comm_hook is called more than once. + """ + store = c10d.FileStore(self.file_name, self.world_size) + process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size) + + model = DistributedDataParallel( + ModuleForDdpCommHook(), process_group=process_group + ) + + def dummy_hook(state, bucket): + fut = torch.futures.Future() + fut.set_result([bucket.get_tensor()]) + return fut + + model.register_comm_hook(None, dummy_hook) + + with self.assertRaisesRegex( + RuntimeError, + "register_comm_hook or register_builtin_comm_hook can only be called once.", + ): + model.register_comm_hook(None, dummy_hook) + + @requires_gloo() + def test_ddp_comm_hook_sparse_gradients(self): + """ + Runs "test_sparse_gradients" unit test with DDP communication hook. We define a + simple hook that does allreduce and works with gloo backend for this test. + """ + store = c10d.FileStore(self.file_name, self.world_size) + process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size) + + # Ensure initialized weights and inputs are identical across processes + torch.manual_seed(1337) + + vanilla_model = SparseGradientModule() + ddp_model = DistributedDataParallel( + copy.deepcopy(vanilla_model), + process_group=process_group, + ) + + # "get_future" API does not support gloo backend, see GH Issue #42048. + # Instead, we wait for an allreduce work, and write its result to a Future. + def allreduce_hook_gloo( + state: object, bucket: dist.GradBucket + ) -> torch.futures.Future: + # Prepare allreduced grad bucket tensors by running an async work. 
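+            # The wait() below makes the hook effectively synchronous: the
+            # allreduced bucket tensor is copied into a fresh Future and divided
+            # by world_size, matching DDP's gradient-averaging semantics.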
+ work = process_group.allreduce([bucket.get_tensor()]) + work.wait() + + fut = torch.futures.Future() + fut.set_result([bucket.get_tensor() / self.world_size]) + return fut + + ddp_model.register_comm_hook(None, allreduce_hook_gloo) + + self._run_and_verify_sparse_gradients(vanilla_model, ddp_model) + + +class ReducerModule(nn.Module): + def __init__(self): + super(ReducerModule, self).__init__() + self.fc1 = nn.Linear(2, 10, bias=False) + self.fc2 = nn.Linear(10, 4, bias=False) + self.fc3 = nn.Linear(4, 4, bias=False) + self.relu = nn.ReLU() + + def forward(self, x, use_fc3=True): + x = self.relu(self.fc1(x)).float() + x = self.relu(self.fc2(x)).float() + if use_fc3: + x = self.fc3(x).float() + return F.softmax(x, dim=1) + + +@requires_gloo() +class ReducerTest(TestCase): + def setUp(self): + self.file = tempfile.NamedTemporaryFile(delete=False) + self.store = c10d.FileStore(self.file.name, 1) + self.process_group = c10d.ProcessGroupGloo(self.store, 0, 1) + + def test_single_dtype_single_bucket(self): + model = ReducerModule() + parameters = list(model.parameters()) + buckets = [list(range(len(parameters)))] + dist.Reducer([parameters], buckets, self.process_group) + + def _create_mixed_precision_model(self): + model = ReducerModule() + model.float() + model.fc1.double() + return model + + def test_multi_dtype_single_bucket(self): + model = self._create_mixed_precision_model() + + # Raise if there are multiple types per bucket. + # In this case we create one bucket for all parameters. + with self.assertRaises(RuntimeError): + parameters = [list(model.parameters())] + buckets = [list(range(len(parameters[0])))] + dist.Reducer(parameters, buckets, self.process_group) + + def test_multi_dtype_multi_bucket(self): + model = self._create_mixed_precision_model() + parameters = [list(model.parameters())] + group_by_dtype = groupby( + range(len(parameters[0])), key=lambda i: parameters[0][i].dtype + ) + buckets = [list(indices) for _, indices in group_by_dtype] + dist.Reducer(parameters, buckets, self.process_group) + + def _create_reducer_for_models(self, models, find_unused_parameters=False): + parameters = [list(model.parameters()) for model in models] + group_by_dtype = groupby( + range(len(parameters[0])), key=lambda i: parameters[0][i].dtype + ) + buckets = [list(indices) for _, indices in group_by_dtype] + return dist.Reducer( + parameters, + buckets, + self.process_group, + find_unused_parameters=find_unused_parameters, + ) + + def test_reducer_no_multi_replicas(self): + num_replicas = 2 + models = [self._create_mixed_precision_model() for _ in range(num_replicas)] + with self.assertRaisesRegex( + RuntimeError, + "Expected exactly one model replica.", + ): + reducer = self._create_reducer_for_models(models) + + def test_forward_backward(self): + batch_size = 10 + model = self._create_mixed_precision_model() + reducer = self._create_reducer_for_models([model]) + reducer.prepare_for_forward() + loss = nn.CrossEntropyLoss() + input = torch.rand([batch_size, 2], dtype=torch.double) + target = torch.LongTensor([random.randrange(4) for _ in range(batch_size)]) + output = loss(model(input), target) + reducer.prepare_for_backward(output) + output.backward() + + def test_forward_backward_unused_parameters(self): + batch_size = 10 + model = self._create_mixed_precision_model() + reducer = self._create_reducer_for_models([model], find_unused_parameters=True) + reducer.prepare_for_forward() + loss = nn.CrossEntropyLoss() + input = torch.rand([batch_size, 2], dtype=torch.double) + target = 
torch.LongTensor([random.randrange(4) for _ in range(batch_size)])
+        output = loss(model(input, use_fc3=False), target)
+
+        # Check that the grad of fc3 is not set.
+        self.assertEqual(None, model.fc3.weight.grad)
+
+        # Compute and accumulate gradients.
+        reducer.prepare_for_backward(output)
+        output.backward()
+
+        # The reducer will have marked the grad of fc3 as ready, because
+        # it doesn't show up in the autograd graph of `output`. Since fc3.weight
+        # is considered globally unused, its grad will be kept untouched as None.
+        self.assertEqual(None, model.fc3.weight.grad)
+
+    def test_forward_backward_optimizer(self):
+        batch_size = 10
+        model = self._create_mixed_precision_model()
+        reducer = self._create_reducer_for_models([model], find_unused_parameters=True)
+        reducer.prepare_for_forward()
+        loss = nn.CrossEntropyLoss()
+        optimizer = torch.optim.Adam(model.parameters())
+        for i in range(3):
+            input = torch.rand([batch_size, 2], dtype=torch.double)
+            target = torch.LongTensor([random.randrange(4) for _ in range(batch_size)])
+
+            # The `zero_grad` function calls `detach_` and `zero_` on the grad
+            # tensors of model parameters. If we tried to set the grad tensors
+            # to a view of the reducer's bucket tensors, this would blow up.
+            optimizer.zero_grad()
+
+            # Unused parameter only in the first iteration.
+            output = loss(model(input, use_fc3=(i > 0)), target)
+            reducer.prepare_for_backward(output)
+            output.backward()
+            optimizer.step()
+
+
+@unittest.skipIf(
+    TEST_WITH_TSAN,
+    "TSAN is not fork-safe since we're forking in a multi-threaded environment",
+)
+class CommTest(test_c10d_common.AbstractCommTest, MultiProcessTestCase):
+
+    def setUp(self):
+        super(CommTest, self).setUp()
+        if sys.platform == "win32":
+            self._spawn_processes()
+        else:
+            self._fork_processes()
+
+    def tearDown(self):
+        super(CommTest, self).tearDown()
+        try:
+            os.remove(self.file_name)
+        except OSError:
+            pass
+
+    def _test_broadcast_coalesced(self, process_group, device, root_rank):
+        half = torch.float16
+
+        # No support for float16 for CPU tensors
+        if device == torch.device("cpu"):
+            half = torch.float32
+
+        target = torch.arange(60, dtype=half, device=device).chunk(5)
+        target += torch.arange(60, dtype=torch.float32, device=device).chunk(5)
+        target += torch.arange(60, dtype=half, device=device).chunk(5)
+        target += torch.arange(60, dtype=torch.float64, device=device).chunk(5)
+        target += torch.arange(60, dtype=half, device=device).chunk(5)
+        target += torch.arange(60, dtype=torch.float32, device=device).chunk(5)
+
+        # The tensors to pass to broadcast are identical to the target
+        # only on the process that is the root of the broadcast.
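+        # Non-root ranks start from zeroed copies and should only match the
+        # target after the coalesced broadcast below completes.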
+ if self.rank == root_rank: + tensors = list(tensor.clone() for tensor in target) + else: + tensors = list(torch.zeros_like(tensor) for tensor in target) + + if self.rank != root_rank: + self.assertNotEqual(tensors, target) + + c10d._broadcast_coalesced( + process_group, tensors, buffer_size=256, src=root_rank + ) + + if self.rank != root_rank: + self.assertEqual(tensors, target) + + @requires_gloo() + @skip_if_lt_x_gpu(2) + def test_broadcast_coalesced_gloo_cuda(self): + store = c10d.FileStore(self.file_name, self.world_size) + options = c10d.ProcessGroupGloo._Options() + options._devices = [create_device(interface=LOOPBACK)] + process_group = c10d.ProcessGroupGloo( + store, self.rank, self.world_size, options + ) + device = torch.device("cuda:%d" % self.rank) + ranks = list(range(self.world_size)) + for root_rank in ranks: + self._test_broadcast_coalesced(process_group, device, root_rank) + + @requires_gloo() + def test_broadcast_coalesced_gloo_cpu(self): + store = c10d.FileStore(self.file_name, self.world_size) + options = c10d.ProcessGroupGloo._Options() + options._devices = [create_device(interface=LOOPBACK)] + process_group = c10d.ProcessGroupGloo( + store, self.rank, self.world_size, options + ) + device = torch.device("cpu") + ranks = list(range(self.world_size)) + for root_rank in ranks: + self._test_broadcast_coalesced(process_group, device, root_rank) + + @requires_gloo() + @skip_if_lt_x_gpu(2) + def test_sequence_num_set_default_pg_gloo(self): + self._test_sequence_num_set_default_pg(backend="gloo") + + @requires_gloo() + @skip_if_lt_x_gpu(2) + def test_sequence_num_set_gloo_new_group(self): + self._test_sequence_num_set_new_group(backend="gloo") + + @requires_gloo() + def test_gloo_barrier_device_ids(self): + store = c10d.FileStore(self.file_name, self.world_size) + c10d.init_process_group( + backend="gloo", rank=self.rank, world_size=self.world_size, store=store + ) + + with self.assertRaisesRegex(RuntimeError, "device_ids not supported"): + c10d.barrier(device_ids=[self.rank]) + + +if __name__ == "__main__": + assert ( + not torch.cuda._initialized + ), "test_distributed must not have initialized CUDA context on main process" + + run_tests() diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py new file mode 100644 index 00000000000..24010ebee02 --- /dev/null +++ b/test/distributed/test_c10d_nccl.py @@ -0,0 +1,2318 @@ +import copy +import math +import os +import random +import signal +import sys +import tempfile +import threading +import time +import unittest +from contextlib import contextmanager +from datetime import timedelta +from itertools import product +from unittest import mock + +import numpy + +import torch +import torch.distributed as c10d + +if not c10d.is_available(): + print("c10d not available, skipping tests", file=sys.stderr) + sys.exit(0) + +import torch.distributed as dist +import torch.distributed.algorithms.ddp_comm_hooks.default_hooks as default +import torch.distributed.algorithms.ddp_comm_hooks.powerSGD_hook as powerSGD +import torch.nn.functional as F +import torch.testing._internal.common_utils as common +from torch import nn +from torch.nn.parallel import DistributedDataParallel +from torch.utils.checkpoint import checkpoint +from torch.testing._internal.common_distributed import ( + MultiProcessTestCase, + requires_nccl, + requires_nccl_version, + skip_if_lt_x_gpu, + get_timeout, + skip_if_rocm, + with_dist_debug_levels, + with_nccl_blocking_wait, +) +from torch.testing._internal.common_utils import ( + TestCase, + 
run_tests, + retry_on_connect_failures, + TEST_WITH_TSAN, +) +import test_c10d_common +from test_c10d_common import gpus_for_rank, DoubleGpuNet, ConvNet, ModuleForDdpCommHook + + +class RendezvousEnvTest(TestCase): + @retry_on_connect_failures + @requires_nccl() + def test_common_errors(self): + if torch.cuda.device_count() == 0: + raise unittest.SkipTest("No GPUs available, skipping test") + + vars = { + "WORLD_SIZE": "1", + "RANK": "0", + "MASTER_ADDR": "127.0.0.1", + "MASTER_PORT": str(common.find_free_port()), + } + + class Env(object): + def __init__(self, vars): + self.env_patcher = mock.patch.dict(os.environ, vars, clear=True) + + def __enter__(self): + self.env_patcher.start() + + def __exit__(self, type, value, traceback): + self.env_patcher.stop() + + def without(d, key): + d = d.copy() + d.pop(key) + return d + + def withouts(d, keys): + d = d.copy() + for key in keys: + d.pop(key) + return d + + with Env(without(vars, "WORLD_SIZE")): + self.assertEqual(None, os.environ.get("WORLD_SIZE")) + with self.assertRaisesRegex(ValueError, "WORLD_SIZE expected"): + gen = c10d.rendezvous("env://") + next(gen) + c10d.init_process_group(backend="nccl", world_size=1) + self.assertEqual(c10d.get_rank(), 0) + self.assertEqual(c10d.get_world_size(), 1) + c10d.destroy_process_group() + + with Env(without(vars, "RANK")): + self.assertEqual(None, os.environ.get("RANK")) + with self.assertRaisesRegex(ValueError, "RANK expected"): + gen = c10d.rendezvous("env://") + next(gen) + c10d.init_process_group(backend="nccl", rank=0) + self.assertEqual(c10d.get_rank(), 0) + self.assertEqual(c10d.get_world_size(), 1) + c10d.destroy_process_group() + + with Env(withouts(vars, ["RANK", "WORLD_SIZE"])): + self.assertEqual(None, os.environ.get("RANK")) + self.assertEqual(None, os.environ.get("WORLD_SIZE")) + c10d.init_process_group(backend="nccl", rank=0, world_size=1) + self.assertEqual(c10d.get_rank(), 0) + self.assertEqual(c10d.get_world_size(), 1) + c10d.destroy_process_group() + + with Env(vars): + c10d.init_process_group(backend="nccl") + self.assertEqual(c10d.get_rank(), 0) + self.assertEqual(c10d.get_world_size(), 1) + c10d.destroy_process_group() + + with Env(without(vars, "MASTER_ADDR")): + self.assertEqual(None, os.environ.get("MASTER_ADDR")) + with self.assertRaisesRegex(ValueError, "MASTER_ADDR expected"): + gen = c10d.rendezvous("env://") + next(gen) + + with Env(without(vars, "MASTER_PORT")): + self.assertEqual(None, os.environ.get("MASTER_PORT")) + with self.assertRaisesRegex(ValueError, "MASTER_PORT expected"): + gen = c10d.rendezvous("env://") + next(gen) + + with Env(without(vars, "WORLD_SIZE")): + self.assertEqual(None, os.environ.get("WORLD_SIZE")) + gen = c10d.rendezvous("env://?world_size={}".format(1)) + _, _, size = next(gen) + self.assertEqual(size, 1) + + with Env(without(vars, "RANK")): + self.assertEqual(None, os.environ.get("RANK")) + gen = c10d.rendezvous("env://?rank={}".format(0)) + _, rank, _ = next(gen) + self.assertEqual(rank, 0) + + with Env(withouts(vars, ["RANK", "WORLD_SIZE"])): + self.assertEqual(None, os.environ.get("RANK")) + self.assertEqual(None, os.environ.get("WORLD_SIZE")) + gen = c10d.rendezvous("env://?rank={}&world_size={}".format(0, 1)) + _, rank, size = next(gen) + self.assertEqual(rank, 0) + self.assertEqual(size, 1) + + +class TimeoutTest(test_c10d_common.AbstractTimeoutTest, TestCase): + @requires_nccl() + @retry_on_connect_failures + def test_default_store_timeout_nccl(self): + if torch.cuda.device_count() == 0: + raise unittest.SkipTest("No GPUs available, 
skipping test") + self._test_default_store_timeout("nccl") + + +class ProcessGroupNCCLNoGPUTest(TestCase): + MAIN_PROCESS_RANK = 0 + + def setUp(self): + self.rank = self.MAIN_PROCESS_RANK + self.world_size = 1 + self.file = tempfile.NamedTemporaryFile(delete=False) + self.num_gpus = torch.cuda.device_count() + if self.num_gpus > 0: + raise unittest.SkipTest("GPUs are available, skipping test") + + def tearDown(self): + pass + + @requires_nccl() + def test_init_no_gpus(self): + store = c10d.FileStore(self.file.name, self.world_size) + with self.assertRaisesRegex( + RuntimeError, "ProcessGroupNCCL is only supported with GPUs, no GPUs found!" + ): + c10d.ProcessGroupNCCL(store, self.rank, self.world_size) + + +@unittest.skipIf( + TEST_WITH_TSAN, + "TSAN is not fork-safe since we're forking in a multi-threaded environment", +) +class ProcessGroupNCCLTest(TestCase): + MAIN_PROCESS_RANK = 0 + + def setUp(self): + self.rank = self.MAIN_PROCESS_RANK + self.world_size = 1 + self.file = tempfile.NamedTemporaryFile(delete=False) + self.num_gpus = torch.cuda.device_count() + if self.num_gpus < 2: + raise unittest.SkipTest("NCCL test requires 2+ GPUs") + + # NCCL_BLOCKING_WAIT overrides NCCL_ASYNC_ERROR_HANDLING hence tests + # that use NCCL_BLOCKING_WAIT will test it as expected. + os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "1" + + def tearDown(self): + pass + + @requires_nccl() + def test_empty_tensors(self): + store = c10d.FileStore(self.file.name, self.world_size) + pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) + + xs = [torch.cuda.FloatTensor([])] + pg.broadcast(xs).wait() + self.assertEqual(0, xs[0].numel()) + + pg.allreduce(xs).wait() + self.assertEqual(0, xs[0].numel()) + + pg.reduce(xs).wait() + self.assertEqual(0, xs[0].numel()) + + ys = [[torch.cuda.FloatTensor([]) for _ in range(self.world_size)]] + pg.allgather(ys, xs).wait() + for y in ys[0]: + self.assertEqual(0, y.numel()) + + ys = [torch.cuda.FloatTensor([])] + xs = [[torch.cuda.FloatTensor([]) for _ in range(self.world_size)]] + pg.reduce_scatter(ys, xs).wait() + self.assertEqual(0, ys[0].numel()) + + @requires_nccl() + def test_broadcast_ops(self): + store = c10d.FileStore(self.file.name, self.world_size) + pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) + + def broadcast(xs, rootRank, rootTensor): + opts = c10d.BroadcastOptions() + opts.rootRank = rootRank + opts.rootTensor = rootTensor + work = pg.broadcast(xs, opts) + work.wait() + + # for every root tensor + for rt in range(self.num_gpus): + tensors = [] + for i in range(self.num_gpus): + tensors.append(torch.tensor([i]).cuda(i)) + + broadcast(tensors, self.rank, rt) + + for i in range(self.num_gpus): + self.assertEqual(tensors[i], tensors[rt]) + + @requires_nccl() + def test_allreduce_ops(self): + store = c10d.FileStore(self.file.name, self.world_size) + pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) + + def allreduce(tensors, op): + opts = c10d.AllreduceOptions() + opts.reduceOp = op + work = pg.allreduce(tensors, opts) + work.wait() + + # Sum + tensors = [] + for i in range(self.num_gpus): + tensors.append(torch.tensor([i + 1]).cuda(i)) + + allreduce(tensors, c10d.ReduceOp.SUM) + + for i in range(self.num_gpus): + # TODO(#38095): Replace assertEqualIgnoreType. 
See issue #38095 + self.assertEqualIgnoreType( + torch.tensor([float(self.num_gpus * (self.num_gpus + 1) / 2)]), + tensors[i], + ) + + # Product + tensors = [] + for i in range(self.num_gpus): + tensors.append(torch.tensor([i + 1]).cuda(i)) + + allreduce(tensors, c10d.ReduceOp.PRODUCT) + + for i in range(self.num_gpus): + # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095 + self.assertEqualIgnoreType( + torch.tensor([float(math.factorial(self.num_gpus))]), tensors[i] + ) + + # Min + tensors = [] + for i in range(self.num_gpus): + tensors.append(torch.tensor([i + 1]).cuda(i)) + + allreduce(tensors, c10d.ReduceOp.MIN) + + for i in range(self.num_gpus): + # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095 + self.assertEqualIgnoreType(torch.tensor([1.0]), tensors[i]) + + # Max + tensors = [] + for i in range(self.num_gpus): + tensors.append(torch.tensor([i + 1]).cuda(i)) + + allreduce(tensors, c10d.ReduceOp.MAX) + + for i in range(self.num_gpus): + self.assertEqual(torch.tensor([self.num_gpus]), tensors[i]) + + for op in (c10d.ReduceOp.BAND, c10d.ReduceOp.BOR, c10d.ReduceOp.BXOR): + with self.assertRaisesRegex( + RuntimeError, "Cannot use " + str(op) + " with NCCL" + ): + allreduce(tensors, op) + + @requires_nccl() + def test_reduce_ops(self): + store = c10d.FileStore(self.file.name, self.world_size) + pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) + + def reduce(xs, rootRank, rootTensor, op=None): + opts = c10d.ReduceOptions() + opts.rootRank = rootRank + opts.rootTensor = rootTensor + if op: + opts.reduceOp = op + work = pg.reduce(xs, opts) + work.wait() + + # for every root tensor + for rt in range(self.num_gpus): + tensors = [] + for i in range(self.num_gpus): + tensors.append(torch.tensor([i + 1]).cuda(i)) + + reduce(tensors, self.rank, rt) + + # TODO(#38095): Replace assertEqualIgnoreType. 
See issue #38095 + self.assertEqualIgnoreType( + torch.tensor([float(self.num_gpus * (self.num_gpus + 1) / 2)]), + tensors[rt], + ) + + for op in (c10d.ReduceOp.BAND, c10d.ReduceOp.BOR, c10d.ReduceOp.BXOR): + with self.assertRaisesRegex( + RuntimeError, "Cannot use " + str(op) + " with NCCL" + ): + reduce(tensors, self.rank, rt, op) + + @requires_nccl() + def test_allgather_ops(self): + store = c10d.FileStore(self.file.name, self.world_size) + pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) + + def allgather(output_ts, input_ts): + work = pg.allgather(output_ts, input_ts) + work.wait() + + tensors = [] + output_ts = [[] for _ in range(self.num_gpus)] + + for idx, ls in enumerate(output_ts): + for _ in range(self.world_size * self.num_gpus): + ls.append(torch.tensor([0]).cuda(idx)) + + for i in range(self.num_gpus): + tensors.append(torch.tensor([i]).cuda(i)) + + allgather(output_ts, tensors) + + # Verification + for device_ts in output_ts: + for s_idx, t in enumerate(device_ts): + self.assertEqual(torch.tensor([s_idx]), t) + + @requires_nccl() + def test_reduce_scatter_ops(self): + store = c10d.FileStore(self.file.name, self.world_size) + pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) + + def reduce_scatter(outputs, input_lists, op): + opts = c10d.ReduceScatterOptions() + opts.reduceOp = op + work = pg.reduce_scatter(outputs, input_lists, opts) + work.wait() + + virtual_rank = self.rank * self.world_size + virtual_world_size = self.num_gpus * self.world_size + + output = [torch.tensor([0]).cuda(i) for i in range(self.num_gpus)] + + # 0 1 2 + # 0 [0..11] [1..12] + # 1 [3..14] + # 2 + # 3 + + # Sum + tensor_lists = [ + [ + torch.tensor([self.rank * self.num_gpus + i + j]).cuda(i) + for j in range(virtual_world_size) + ] + for i in range(self.num_gpus) + ] + + reduce_scatter(output, tensor_lists, c10d.ReduceOp.SUM) + + for i in range(self.num_gpus): + expected = torch.tensor( + [ + float(self.num_gpus * (self.num_gpus - 1) / 2) + + (virtual_rank + i) * virtual_world_size + ] + ) + # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095 + self.assertEqualIgnoreType(expected, output[i]) + + # Min + reduce_scatter(output, tensor_lists, c10d.ReduceOp.MIN) + + for i in range(self.num_gpus): + expected = torch.tensor([self.rank * self.world_size + i]) + self.assertEqual(expected, output[i]) + + # Max + reduce_scatter(output, tensor_lists, c10d.ReduceOp.MAX) + + for i in range(self.num_gpus): + expected = torch.tensor( + [self.rank * self.world_size + i + virtual_world_size - 1] + ) + self.assertEqual(expected, output[i]) + + # Product + tensor_lists = [ + [ + torch.tensor( + [(self.rank * self.num_gpus + i + j) % virtual_world_size + 1] + ).cuda(i) + for j in range(virtual_world_size) + ] + for i in range(self.num_gpus) + ] + + reduce_scatter(output, tensor_lists, c10d.ReduceOp.PRODUCT) + + for i in range(self.num_gpus): + expected = torch.tensor([float(math.factorial(virtual_world_size))]) + # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095 + self.assertEqualIgnoreType(expected, output[i]) + + @requires_nccl() + def test_barrier(self): + store = c10d.FileStore(self.file.name, self.world_size) + pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) + + def allreduce(tensors): + opts = c10d.AllreduceOptions() + work = pg.allreduce(tensors, opts) + return work + + # Making the collective to operate on + # 1, 2, 3, 4, .... 
self.num_gpus GPUs + tensors_list = [[] for _ in range(2, self.num_gpus + 1)] + for i in range(2, self.num_gpus + 1): + for j in range(i): + tensors_list[i - 2].append(torch.tensor([j + 1]).cuda(j)) + + works = [] + for tensors in tensors_list: + work = allreduce(tensors) + works.append(work) + + # Barrier will ensure that all previous work is completed + pg.barrier().wait() + + for i in range(2, self.num_gpus + 1): + for j in range(i): + # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095 + self.assertEqualIgnoreType( + torch.tensor([float(i * (i + 1) / 2)]), tensors_list[i - 2][j] + ) + + +@unittest.skipIf( + TEST_WITH_TSAN, + "TSAN is not fork-safe since we're forking in a multi-threaded environment", +) +class DistributedDataParallelTest(test_c10d_common.AbstractDistributedDataParallelTest, MultiProcessTestCase): + + def setUp(self): + super(DistributedDataParallelTest, self).setUp() + # NCCL_BLOCKING_WAIT overrides NCCL_ASYNC_ERROR_HANDLING hence tests + # that use NCCL_BLOCKING_WAIT will test it as expected. + os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "1" + if sys.platform == "win32": + self._spawn_processes() + else: + self._fork_processes() + + def _test_nccl_backend( + self, devices, device_ids, multi_device=False, gradient_as_bucket_view=False + ): + store = c10d.FileStore(self.file_name, self.world_size) + process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) + self._test_ddp_with_process_group( + process_group, devices, device_ids, multi_device, gradient_as_bucket_view + ) + + @requires_nccl() + @skip_if_lt_x_gpu(2) + def test_nccl_backend_multi_device_ids_not_allowed(self): + int_devices = list(range(torch.cuda.device_count())) + devices = [torch.device("cuda:" + str(i)) for i in int_devices] + with self.assertRaisesRegex(ValueError, "device_ids can only be None or contain a single element."): + self._test_nccl_backend(devices, int_devices) + + @requires_nccl() + @skip_if_lt_x_gpu(2) + def test_nccl_backend_single_device_module_device_ids_None(self): + self._test_nccl_backend(None, None) + + @requires_nccl() + @skip_if_lt_x_gpu(2) + def test_nccl_backend_single_device_module_empty_device_ids(self): + # This tests the backward compatibility of accepting an empty list as `device_ids`, + # although we no longer document this in favor of the default value of `None`, + # which is consistent with multi-device modules and CPU modules. 
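+        # An empty device_ids list is expected to behave like the default
+        # device_ids=None for a single-device module.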
+ self._test_nccl_backend(None, []) + + @requires_nccl() + @skip_if_lt_x_gpu(4) + def test_nccl_backend_multi_device_module_device_ids_None(self): + int_devices = gpus_for_rank(self.world_size)[self.rank][:2] + devices = [torch.device("cuda:" + str(i)) for i in int_devices] + self._test_nccl_backend(devices, None, multi_device=True) + + @requires_nccl() + @skip_if_lt_x_gpu(2) + def test_nccl_backend_1gpu_module_device_ids_integer_list(self): + int_devices = gpus_for_rank(self.world_size)[self.rank][:1] + devices = [torch.device("cuda:" + str(i)) for i in int_devices] + self._test_nccl_backend(devices, int_devices) + + @requires_nccl() + @skip_if_lt_x_gpu(2) + def test_nccl_backend_1gpu_module_device_ids_torch_device_list(self): + int_devices = gpus_for_rank(self.world_size)[self.rank][:1] + devices = [torch.device("cuda:" + str(i)) for i in int_devices] + self._test_nccl_backend(devices, devices) + + @requires_nccl() + @skip_if_lt_x_gpu(4) + def test_nccl_backend_2gpu_module(self): + int_devices = gpus_for_rank(self.world_size)[self.rank][:2] + devices = [torch.device("cuda:" + str(i)) for i in int_devices] + self._test_nccl_backend(devices, None, multi_device=True) + + @requires_nccl() + @skip_if_lt_x_gpu(8) + def test_nccl_backend_4gpu_module(self): + int_devices = gpus_for_rank(self.world_size)[self.rank][:4] + devices = [torch.device("cuda:" + str(i)) for i in int_devices] + self._test_nccl_backend(devices, None, multi_device=True) + + @requires_nccl() + @skip_if_lt_x_gpu(4) + def test_ddp_multi_device_module_config(self): + gpus = gpus_for_rank(self.world_size)[self.rank] + + self.assertTrue(len(gpus) >= 2, "expecting at least 2 gpus per process") + + store = c10d.FileStore(self.file_name, self.world_size) + process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) + + gpus = gpus[:2] + model = DoubleGpuNet(gpus) + + with self.assertRaisesRegex( + ValueError, + "DistributedDataParallel device_ids and output_device arguments only work with " + "single-device/multiple-device GPU modules or CPU modules", + ): + ddp_model = DistributedDataParallel( + model, output_device=gpus[1], process_group=process_group + ) + + with self.assertRaisesRegex(ValueError, "device_ids can only be None or contain a single element."): + ddp_model = DistributedDataParallel( + model, device_ids=gpus, process_group=process_group + ) + + with self.assertRaisesRegex( + ValueError, "input module must be on the same type of devices" + ): + model.fc1 = model.fc1.cpu() + ddp_model = DistributedDataParallel(model, process_group=process_group) + + model = model.cpu() + with self.assertRaisesRegex(ValueError, "device_ids can only be None or contain a single element."): + ddp_model = DistributedDataParallel( + model, device_ids=gpus, process_group=process_group + ) + + def _test_fp16(self, gradient_as_bucket_view=False): + store = c10d.FileStore(self.file_name, self.world_size) + process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) + + gpus = gpus_for_rank(self.world_size)[self.rank] + model = nn.Linear(1, 1, bias=False).cuda(gpus[0]).half() + nn.init.constant_(model.weight, 1) + ddp_model = DistributedDataParallel( + model, + device_ids=[gpus[0]], + process_group=process_group, + bucket_cap_mb=0.001, + gradient_as_bucket_view=gradient_as_bucket_view, + ) + + # Input 2**15, so that the gradients will overflow with a + # world_size of 2, unless we normalize the gradient by the + # world_size before the reduction + input = torch.tensor([[2 ** 15]]).cuda(gpus[0]).half() + + # Step model + 
ddp_model.train() + output = ddp_model(input) + loss = output.sum() + loss.backward() + + self.assertFalse(any(torch.isinf(p.grad).any() for p in ddp_model.parameters())) + + @requires_nccl() + @skip_if_lt_x_gpu(2) + def test_fp16(self): + self._test_fp16() + + @requires_nccl() + @skip_if_lt_x_gpu(2) + def test_fp16_grad_is_view(self): + self._test_fp16(gradient_as_bucket_view=True) + + def _test_arbitrary_forward_return_value(self, gradient_as_bucket_view=False): + """ + Note: this test can be sped up by only running it on a CPU module + once DistributedDataParallel supports them. + """ + store = c10d.FileStore(self.file_name, self.world_size) + process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) + + class ForwardReturnValueModule(nn.Module): + def __init__(self): + super(ForwardReturnValueModule, self).__init__() + self.fc1 = nn.Linear(2, 10, bias=False) + self.fc2 = nn.Linear(10, 4, bias=False) + self.fc3 = nn.Linear(4, 4, bias=False) + self.relu = nn.ReLU() + + def forward(self, x, fn): + x = self.relu(self.fc1(x)) + x = self.relu(self.fc2(x)) + # The first softmax does NOT include fc3 in its autograd graph + # whereas the second softmax DOES. If we pass only the first + # tensor we see in the output to the reducer, it marks the + # gradient for fc3 as ready (because it doesn't show up). If + # downstream uses of this return value choose to differentiate + # against the second output tensor, it would still receive a + # gradient and a callback for this tensor, resulting in a crash. + return fn( + F.softmax(x, dim=1), + F.softmax(self.fc3(x), dim=1), + ) + + device_id = gpus_for_rank(self.world_size)[self.rank][0] + model = DistributedDataParallel( + ForwardReturnValueModule().float().to(device_id), + device_ids=[device_id], + process_group=process_group, + gradient_as_bucket_view=gradient_as_bucket_view, + ) + + batch_size = 4 + criterion = nn.CrossEntropyLoss() + input = torch.rand([batch_size, 2], dtype=torch.float) + target = torch.LongTensor([random.randrange(4) for _ in range(batch_size)]).to( + device_id + ) + + # Always run "backward" to ensure the reducer is called by autograd. + # If we don't correctly capture the output tensors from the return value, + # the reducer won't see a hook for the unused parameter, and throw an error. + # The correct capture is what we're testing in this function. 
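+        # Each (box, unbox) pair below wraps both outputs in a different
+        # container type and then extracts the second tensor, whose autograd
+        # graph includes fc3, for the loss computation. For example,
+        # box=lambda x, y: {"a": x, "b": y} pairs with unbox=lambda obj: obj["b"].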
+ def test(box, unbox): + output = model(input, fn=box) + loss = criterion(unbox(output), target) + loss.backward() + + # Test with identity return value + test( + box=lambda x, y: (x, y), + unbox=lambda obj: obj[1], + ) + + # Test with list return value + test( + box=lambda x, y: ["foo", x, "bar", y], + unbox=lambda obj: obj[3], + ) + + # Test with tuple return value + test( + box=lambda x, y: ("foo", x, "bar", y), + unbox=lambda obj: obj[3], + ) + + # Test with dict return value + test( + box=lambda x, y: {"foo": "bar", "a": x, "b": y}, + unbox=lambda obj: obj["b"], + ) + + # Test with list with dict return value + test( + box=lambda x, y: ["foo", "bar", {"a": x, "b": y}], + unbox=lambda obj: obj[2]["b"], + ) + + # Test with dict with list return value + test( + box=lambda x, y: {"foo": "bar", "list": [0, x, 1, y]}, + unbox=lambda obj: obj["list"][3], + ) + + @requires_nccl() + @skip_if_lt_x_gpu(2) + def test_arbitrary_forward_return_value(self): + self._test_arbitrary_forward_return_value() + + @requires_nccl() + @skip_if_lt_x_gpu(2) + def test_arbitrary_forward_return_value_grad_is_view(self): + self._test_arbitrary_forward_return_value(gradient_as_bucket_view=True) + + @requires_nccl() + @skip_if_lt_x_gpu(2) + def test_ddp_with_lazy_parameters(self): + store = c10d.FileStore(self.file_name, self.world_size) + process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) + with self.assertRaisesRegex( + RuntimeError, "Modules with uninitialized parameters" + ): + DistributedDataParallel( + torch.nn.LazyLinear(10), process_group=process_group + ) + + def _test_find_unused_parameters_kwarg(self, gradient_as_bucket_view=False): + """ + Note: this test can be sped up by only running it on a CPU module + once DistributedDataParallel supports them. + """ + torch.cuda.set_device(self.rank) + dist.init_process_group( + backend="nccl", + world_size=self.world_size, + rank=self.rank, + init_method=f"file://{self.file_name}" + ) + process_group = c10d.distributed_c10d._get_default_group() + + class FindUnusedParametersModule(nn.Module): + def __init__(self): + super(FindUnusedParametersModule, self).__init__() + self.fc1 = nn.Linear(2, 10, bias=False) + self.fc2 = nn.Linear(10, 4, bias=False) + self.fc3 = nn.Linear(4, 4, bias=False) + self.relu = nn.ReLU() + + def forward(self, x): + x = self.relu(self.fc1(x)) + x = self.relu(self.fc2(x)) + # Return the fc3 module so that the caller can invoke it + # outside of the forward function. While this is bad practice, + # we can use it to trigger a reducer error. 
+ return (F.softmax(x, dim=1), self.fc3) + + device_id = gpus_for_rank(self.world_size)[self.rank][0] + batch_size = 4 + criterion = nn.CrossEntropyLoss() + input = torch.rand([batch_size, 2], dtype=torch.float) + target = torch.LongTensor([random.randrange(4) for _ in range(batch_size)]).to( + device_id + ) + + ddp_model = None + + def test_find_unused_parameters( + find_unused_parameters, test_default=False, gradient_as_bucket_view=False + ): + if test_default: + model = DistributedDataParallel( + FindUnusedParametersModule().float().to(device_id), + device_ids=[device_id], + process_group=process_group, + gradient_as_bucket_view=gradient_as_bucket_view, + ) + else: + model = DistributedDataParallel( + FindUnusedParametersModule().float().to(device_id), + device_ids=[device_id], + process_group=process_group, + find_unused_parameters=find_unused_parameters, + gradient_as_bucket_view=gradient_as_bucket_view, + ) + nonlocal ddp_model + ddp_model = model + + output, fc3 = model(input) + output = fc3(output) + loss = criterion(output, target) + loss.backward() + + # First test that finding unused params under these conditions is to + # trigger an error when `backward` is called (because fc3 is an unused + # parameter and will therefore be marked ready twice). + try: + test_find_unused_parameters( + True, gradient_as_bucket_view=gradient_as_bucket_view + ) + except Exception as ex: + self.assertTrue( + str(ex).startswith( + "Expected to mark a variable ready only once.", + ) + ) + unused_index = 2 + unused_index_str = f"Parameter at index {unused_index}" + model = ddp_model.module + for module_name, module in model.named_modules(): + if module == model.fc3: + for parameter_name, _ in module.named_parameters( + recurse=False + ): + unused_fqn = f"{module_name}.{parameter_name}" + # Only one such parameter in model.fc3, since bias=False + break + + if dist._get_debug_mode() != dist._DistributedDebugLevel.OFF: + unused_index_str += f" with name {unused_fqn}" + + self.assertTrue(unused_index_str in str(ex)) + else: + self.fail("Expected exception") + + dist.barrier(process_group) + + # Then test that the default behavior can be overridden by setting + # `find_unused_parameters=False`. + try: + test_find_unused_parameters( + False, gradient_as_bucket_view=gradient_as_bucket_view + ) + except Exception as ex: + self.fail("Unexpected exception: %s" % ex) + + # Test find_unused_parameters defaults to False + try: + test_find_unused_parameters( + True, test_default=True, gradient_as_bucket_view=gradient_as_bucket_view + ) + except Exception as ex: + self.fail("Unexpected exception: %s" % ex) + + # TODO: Combine the following tests once https://github.com/pytorch/pytorch/issues/55967 + # is resolved. 
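The find_unused_parameters tests above exercise the error path (a parameter marked ready twice). For orientation, here is a minimal, standalone sketch of the flag's usual use case: a module whose forward never touches one of its parameters. It is illustrative only and not part of this test suite; it assumes nothing beyond a CPU and uses a throwaway single-process gloo group so it can run anywhere.

# Illustrative sketch (not part of the test file): why find_unused_parameters exists.
import os
import tempfile

import torch
import torch.distributed as dist
import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel


class PartiallyUsedModule(nn.Module):
    def __init__(self):
        super().__init__()
        self.used = nn.Linear(4, 4)
        self.unused = nn.Linear(4, 4)  # never touched in forward

    def forward(self, x):
        return self.used(x)


if __name__ == "__main__":
    init_file = tempfile.NamedTemporaryFile(delete=False).name
    dist.init_process_group(
        "gloo", init_method=f"file://{init_file}", rank=0, world_size=1
    )
    # Without find_unused_parameters=True the reducer never sees a gradient for
    # self.unused and DDP errors out on the next iteration; with the flag, DDP
    # walks the autograd graph after forward and marks the unused parameter ready.
    model = DistributedDataParallel(PartiallyUsedModule(), find_unused_parameters=True)
    model(torch.randn(8, 4)).sum().backward()
    dist.destroy_process_group()
    os.remove(init_file)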
+ @requires_nccl() + @skip_if_lt_x_gpu(2) + @with_dist_debug_levels(levels=["DETAIL"]) + def test_find_unused_parameters_kwarg_debug_detail(self): + self._test_find_unused_parameters_kwarg() + + @requires_nccl() + @skip_if_lt_x_gpu(2) + @with_dist_debug_levels(levels=["INFO"]) + def test_find_unused_parameters_kwarg_debug_info(self): + self._test_find_unused_parameters_kwarg() + + @requires_nccl() + @skip_if_lt_x_gpu(2) + @with_dist_debug_levels(levels=["OFF"]) + def test_find_unused_parameters_kwarg_debug_off(self): + self._test_find_unused_parameters_kwarg() + + @requires_nccl() + @skip_if_lt_x_gpu(2) + @with_dist_debug_levels(levels=["DETAIL"]) + def test_find_unused_parameters_kwarg_grad_is_view_debug_detail(self): + self._test_find_unused_parameters_kwarg(gradient_as_bucket_view=True) + + @requires_nccl() + @skip_if_lt_x_gpu(2) + @with_dist_debug_levels(levels=["INFO"]) + def test_find_unused_parameters_kwarg_grad_is_view_debug_info(self): + self._test_find_unused_parameters_kwarg(gradient_as_bucket_view=True) + + @requires_nccl() + @skip_if_lt_x_gpu(2) + @with_dist_debug_levels(levels=["OFF"]) + def test_find_unused_parameters_kwarg_grad_is_view_debug_off(self): + self._test_find_unused_parameters_kwarg(gradient_as_bucket_view=True) + + def _test_multiple_outputs_multiple_backward(self, gradient_as_bucket_view=False): + """ + Note: this test can be sped up by only running it on a CPU module + once DistributedDataParallel supports them. + """ + store = c10d.FileStore(self.file_name, self.world_size) + process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) + + class MultipleOutputModule(nn.Module): + def __init__(self): + super(MultipleOutputModule, self).__init__() + + def define_module(): + return nn.Sequential( + nn.Linear(2, 10, bias=False), + nn.ReLU(), + nn.Linear(10, 4, bias=False), + nn.ReLU(), + ) + + self.module0 = define_module() + self.module1 = define_module() + + def forward(self, x): + return ( + F.softmax(self.module0(x), dim=1), + F.softmax(self.module1(x), dim=1), + ) + + device_id = gpus_for_rank(self.world_size)[self.rank][0] + model = DistributedDataParallel( + MultipleOutputModule().float().to(device_id), + device_ids=[device_id], + process_group=process_group, + gradient_as_bucket_view=gradient_as_bucket_view, + ) + + batch_size = 4 + criterion = nn.CrossEntropyLoss() + input = torch.rand([batch_size, 2], dtype=torch.float) + target = torch.LongTensor([random.randrange(4) for _ in range(batch_size)]).to( + device_id + ) + + # Compute loss and gradients for both outputs + output1, output2 = model(input) + loss1 = criterion(output1, target) + loss1.backward() + loss2 = criterion(output2, target) + loss2.backward() + + @requires_nccl() + @skip_if_lt_x_gpu(2) + def test_multiple_outputs_multiple_backward(self): + self._test_multiple_outputs_multiple_backward() + + @requires_nccl() + @skip_if_lt_x_gpu(2) + def test_multiple_outputs_multiple_backward_grad_is_view(self): + self._test_multiple_outputs_multiple_backward(gradient_as_bucket_view=True) + + @requires_nccl() + @skip_if_lt_x_gpu(2) + def test_no_grad(self): + """ + Note: this test can be sped up by only running it on a CPU module + once DistributedDataParallel supports them. 
+ """ + store = c10d.FileStore(self.file_name, self.world_size) + process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) + + class NoGradModule(nn.Module): + def __init__(self): + super(NoGradModule, self).__init__() + self.fc1 = nn.Linear(2, 10, bias=False) + self.fc2 = nn.Linear(10, 4, bias=False) + self.relu = nn.ReLU() + + def forward(self, x): + x = self.relu(self.fc1(x)) + x = self.relu(self.fc2(x)) + return F.softmax(x, dim=1) + + device_id = gpus_for_rank(self.world_size)[self.rank][0] + model = DistributedDataParallel( + NoGradModule().float().to(device_id), + device_ids=[device_id], + process_group=process_group, + ) + + batch_size = 4 + input = torch.rand([batch_size, 2], dtype=torch.float) + + def check_no_grads(): + for p in model.parameters(): + self.assertTrue(p.requires_grad) + self.assertIsNone(p.grad) + + # After initialization, no parameter has their gradient set. + check_no_grads() + + # Run `forward` function with torch.no_grad() + with torch.no_grad(): + output = model(input) + self.assertTrue(isinstance(output, torch.Tensor)) + + # No parameter should have their gradient set. + check_no_grads() + + def _test_accumulate_gradients_no_sync( + self, num_iters=2, ddp_comm_hook=None, gradient_as_bucket_view=False + ): + """ + This is the recommended way to implement accumulate grads. + If ``ddp_comm_hook`` input was specified, it will also register that hook + to the ``ddp_model``. The hook fed into this function should not change + the resulting gradients. + """ + int_devices = gpus_for_rank(self.world_size)[self.rank][:1] + devices = [torch.device("cuda:" + str(i)) for i in int_devices] + store = c10d.FileStore(self.file_name, self.world_size) + process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) + global_batch_size = self.world_size + local_batch_size = len(devices) + + model, ddp_model, input, target = self._prepare_single_device_module( + process_group, devices, devices, global_batch_size, gradient_as_bucket_view + ) + + if ddp_comm_hook is not None: + ddp_model.register_comm_hook(process_group, ddp_comm_hook) + + def step_model(model, input, target): + model.train() + output = model(input) + loss = F.mse_loss(output, target.to(output.device)) + loss.backward() + + # ensure accumulate grads works with no_grad + with torch.no_grad(): + with ddp_model.no_sync(): + ddp_model.train() + ddp_model(input) + + # check two model parameters over num_iters iterations + for iteration in range(num_iters): + # single cpu/gpu training + step_model(model, input, target) + + ddp_input = input[ + self.rank * local_batch_size: (self.rank + 1) * local_batch_size + ] + ddp_target = target[ + self.rank * local_batch_size: (self.rank + 1) * local_batch_size + ] + + if iteration % num_iters == 0: + # accumulate grads locally + with ddp_model.no_sync(): + step_model(ddp_model, ddp_input, ddp_target) + else: + # sync grads + step_model(ddp_model, ddp_input, ddp_target) + + for i, j in zip(model.parameters(), ddp_model.parameters()): + if iteration % num_iters == 0: + self.assertNotEqual(i.grad, j.grad) + else: + self.assertEqual(i.grad, j.grad) + + # Shuffle the input so that DDP input is different + torch.manual_seed(1337 + iteration) + input = input[torch.randperm(global_batch_size)] + + @requires_nccl() + @skip_if_lt_x_gpu(2) + def test_accumulate_gradients_no_sync(self): + """ + Runs _test_accumulate_gradients_no_sync using default inputs + """ + self._test_accumulate_gradients_no_sync() + + @requires_nccl() + @skip_if_lt_x_gpu(2) + def 
test_accumulate_gradients_no_sync_grad_is_view(self): + """ + Runs _test_accumulate_gradients_no_sync using default inputs + """ + self._test_accumulate_gradients_no_sync(gradient_as_bucket_view=True) + + @requires_nccl() + @skip_if_lt_x_gpu(2) + def test_accumulate_gradients_no_sync_allreduce_hook(self): + """ + Runs multiple iterations on _test_accumulate_gradients_no_sync + using allreduce hook and validates whether future result was properly + passed as gradients in reducer. + """ + + def allreduce_hook( + process_group: object, bucket: dist.GradBucket + ) -> torch._C.Future: + tensors = [bucket.get_tensor() / self.world_size] + return process_group.allreduce(tensors).get_future() + + self._test_accumulate_gradients_no_sync( + num_iters=4, ddp_comm_hook=allreduce_hook + ) + + @requires_nccl() + @skip_if_lt_x_gpu(2) + def test_accumulate_gradients_no_sync_allreduce_with_then_hook(self): + """ + Runs multiple iterations on _test_accumulate_gradients_no_sync using allreduce + hook that also uses then callbacks. In first then callback result is multiplied + by 2, and the second callback divides the result by 2 * world_size. It validates + whether final result was properly passed as gradients in reducer. + """ + + def allreduce_with_then_hook( + process_group: object, bucket: dist.GradBucket + ) -> torch.futures.Future: + fut = process_group.allreduce([bucket.get_tensor()]).get_future() + + def mult(fut): + # Multiply the result by 2. + return [2 * t for t in fut.wait()] + + def div(fut): + # Divide the result by 2 * world_size. + return [t / (2 * self.world_size) for t in fut.wait()] + + return fut.then(mult).then(div) + + self._test_accumulate_gradients_no_sync( + num_iters=4, ddp_comm_hook=allreduce_with_then_hook + ) + + def _test_accumulate_gradients_module(self, gradient_as_bucket_view=False): + # This is NOT the recommended way to implement accumulating grads, but + # we would like to make sure DDP does not mess up with the underlying + # module. + int_devices = gpus_for_rank(self.world_size)[self.rank][:1] + devices = [torch.device("cuda:" + str(i)) for i in int_devices] + store = c10d.FileStore(self.file_name, self.world_size) + process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) + global_batch_size = self.world_size + + model, ddp_model, input, target = self._prepare_single_device_module( + process_group, devices, devices, global_batch_size, gradient_as_bucket_view + ) + + def step_model(model, input, target): + model.train() + output = model(input) + loss = F.mse_loss(output, target.to(output.device)) + loss.backward() + + # ensure accumulate grads works with no_grad + with torch.no_grad(): + ddp_model.train() + ddp_model.module(input) + + # Check two model parameters over 4 iterations. + # Use 4 iterations because we alternate between reducing and + # not reducing and want to make sure we switch both ways. + for iteration in range(4): + step_model(model, input, target) + + if iteration % 2 == 0: + # Skip gradients sync without calling prepare_for_backward + step_model( + ddp_model.module, + input[self.rank: (self.rank + 1)], + target[self.rank: (self.rank + 1)], + ) + for i, j in zip(model.parameters(), ddp_model.parameters()): + self.assertNotEqual(i.grad, j.grad) + else: + step_model( + ddp_model, + input[self.rank: (self.rank + 1)], + target[self.rank: (self.rank + 1)], + ) + for i, j in zip(model.parameters(), ddp_model.parameters()): + # TODO(#38095): Replace assertEqualIgnoreType. 
See issue #38095 + self.assertEqualIgnoreType(i.grad, j.grad) + + # Shuffle the input so that DDP input is different + torch.manual_seed(1337 + iteration) + input = input[torch.randperm(global_batch_size)] + + @requires_nccl() + @skip_if_lt_x_gpu(2) + def test_accumulate_gradients_module(self): + self._test_accumulate_gradients_module() + + @requires_nccl() + @skip_if_lt_x_gpu(2) + def test_accumulate_gradients_module_with_grad_is_view(self): + self._test_accumulate_gradients_module(gradient_as_bucket_view=True) + + @requires_nccl() + @skip_if_lt_x_gpu(2) + def test_failure_recovery(self): + store = c10d.FileStore(self.file_name, self.world_size) + process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) + + # need to create a separate file for the recovered FileStore, because + # the original one will be deleted when destructing the first FileStore. + recovery_filename = self.file_name + "_recovery" + + if self.rank == 0: + # the file will be deleted by the recovered FileStore + open(recovery_filename, "w").close() + + # not necessary to run barrier here, as DDP will synchronize + + class TestModel(nn.Module): + def __init__(self): + super(TestModel, self).__init__() + self.fc1 = nn.Linear(2, 10, bias=False) + self.fc2 = nn.Linear(10, 4, bias=False) + self.relu = nn.ReLU() + + def forward(self, x): + x = self.relu(self.fc1(x)) + x = self.relu(self.fc2(x)) + return F.softmax(x, dim=1) + + device_id = gpus_for_rank(self.world_size)[self.rank][0] + model = TestModel().float().to(device_id) + ddp = DistributedDataParallel( + model, + device_ids=[device_id], + process_group=process_group, + ) + + batch_size = 4 + criterion = nn.CrossEntropyLoss() + input = torch.rand([batch_size, 2], dtype=torch.float) + target = torch.LongTensor([random.randrange(4) for _ in range(batch_size)]).to( + device_id + ) + + for _ in range(6): + output = ddp(input) + loss = criterion(output, target) + loss.backward() + + del ddp + del process_group + del store # this will delete self.file_name + + store = c10d.FileStore(recovery_filename, self.world_size) + process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) + ddp = DistributedDataParallel( + model, + device_ids=[device_id], + process_group=process_group, + ) + + input = torch.rand([batch_size, 2], dtype=torch.float) + target = torch.LongTensor([random.randrange(4) for _ in range(batch_size)]).to( + device_id + ) + for _ in range(6): + output = ddp(input) + loss = criterion(output, target) + loss.backward() + + @requires_nccl() + @skip_if_lt_x_gpu(2) + def test_pass_default_pg(self): + dist.init_process_group( + "nccl", + init_method=f"file://{self.file_name}", + world_size=self.world_size, + rank=self.rank, + ) + + default_pg = c10d.distributed_c10d._get_default_group() + dist.destroy_process_group(default_pg) + self.assertFalse(dist.is_initialized()) + + def _test_grad_layout(self, replica_devices, layer_devs, local_batch_size): + store = c10d.FileStore(self.file_name, self.world_size) + process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) + + global_batch_size = local_batch_size * self.world_size + + # Carry out some trials with small buckets and some with big buckets. + bucketsizes = (0.000001, 25) + # Tuples of lists. Each list describes per-layer characteristics for one trial. 
+ layer_formats = ( + [torch.contiguous_format] * 4, + [torch.channels_last] * 2 + [torch.contiguous_format] * 2, + [torch.channels_last] * 4, + ) + layer_dtypes = ( + [torch.float] * 4, + [torch.float] * 2 + [torch.half] * 2, + [torch.half] * 4, + ) + + input_dev = layer_devs[0] if isinstance(layer_devs, list) else layer_devs + target_dev = layer_devs[-1] if isinstance(layer_devs, list) else layer_devs + input = torch.randn( + (global_batch_size, 8, 8, 8), device=input_dev, dtype=torch.float + ) + target = torch.randn( + (global_batch_size, 8, 4, 4), device=target_dev, dtype=torch.float + ) + local_batch_start = self.rank * local_batch_size + local_batch_end = (self.rank + 1) * local_batch_size + + # Reducer.cpp sneakily creates one "initial bucket" that ignores the "bucket_cap_mb" + # argument. The following makes sure the initial bucket also complies. + @contextmanager + def first_bucket_size(ddp_bucket_mb): + old_DEFAULT_FIRST_BUCKET_BYTES = dist._DEFAULT_FIRST_BUCKET_BYTES + dist._DEFAULT_FIRST_BUCKET_BYTES = int(ddp_bucket_mb * 1.0e6) + try: + yield + finally: + dist._DEFAULT_FIRST_BUCKET_BYTES = old_DEFAULT_FIRST_BUCKET_BYTES + + with torch.backends.cudnn.flags( + enabled=True, deterministic=True, benchmark=False + ): + for formats, dtypes, bucketsize in product( + layer_formats, layer_dtypes, bucketsizes + ): + with first_bucket_size(bucketsize): + model_msg = ( + "rank = {} formats = {} dtypes = {} bucketsize = {} ".format( + self.rank, formats, dtypes, bucketsize + ) + ) + try: + m = ConvNet(layer_devs, formats, dtypes) + m_ddp = DistributedDataParallel( + copy.deepcopy(m), + device_ids=replica_devices, + process_group=process_group, + bucket_cap_mb=bucketsize, + ) + opt = torch.optim.SGD(m.parameters(), lr=0.1) + opt_ddp = torch.optim.SGD(m_ddp.parameters(), lr=0.1) + has_half = any(p.dtype is torch.half for p in m.parameters()) + tol = 1.0e-3 if has_half else 1.0e-5 + except BaseException: + # Prints case-specific debugging info to narrow down failing case. + print( + "Caught exception during model creation for " + model_msg, + flush=True, + ) + raise + # 3 iters: First iter creates grads, second iter retests after rebucketing, + # third iter tries zeroed grads. + for it in range(3): + iter_msg = "iter = {} ".format(it) + model_msg + named_msg = iter_msg + try: + F.mse_loss(m(input).float(), target).backward() + F.mse_loss( + m_ddp(input[local_batch_start:local_batch_end]).float(), + target[local_batch_start:local_batch_end], + ).backward() + for i, ((layer_name, m_child), m_ddp_child) in enumerate( + zip(m.named_children(), m_ddp.module.children()) + ): + named_msg = layer_name + ".weight" + " " + iter_msg + self.assertTrue( + m_child.weight.grad.is_contiguous( + memory_format=formats[i] + ), + named_msg, + ) + self.assertTrue( + m_ddp_child.weight.grad.is_contiguous( + memory_format=formats[i] + ), + named_msg, + ) + for j, ((param_name, p), p_ddp) in enumerate( + zip( + m_child.named_parameters(), + m_ddp_child.parameters(), + ) + ): + named_msg = ( + layer_name + "." + param_name + " " + iter_msg + ) + self.assertEqual( + p.grad, p_ddp.grad, rtol=tol, atol=tol + ) + opt.step() + opt_ddp.step() + if it == 0: + for p, p_ddp in zip(m.parameters(), m_ddp.parameters()): + p.grad = None + p_ddp.grad = None + else: + m.zero_grad() + m_ddp.zero_grad() + except BaseException: + # Makes sure we still get info if an error occurred somewhere other than the asserts. 
+ print( + "Caught exception during iterations at " + named_msg, + flush=True, + ) + raise + + @requires_nccl() + @skip_if_lt_x_gpu(2) + @skip_if_rocm + def test_grad_layout_1devicemodule_1replicaperprocess(self): + dev0 = torch.device("cuda:" + str(gpus_for_rank(self.world_size)[self.rank][0])) + # Tells DDP to use just one device. + replica_devices = [dev0] + # Tells _test_grad_layout to construct ConvNet with all layers on this process's first assigned device. + layer_devs = dev0 + local_batch_size = 8 + self._test_grad_layout(replica_devices, layer_devs, local_batch_size) + + @requires_nccl() + @skip_if_lt_x_gpu(4) + @skip_if_rocm + def test_grad_layout_2devicemodule(self): + int_devices = gpus_for_rank(self.world_size)[self.rank][:2] + dev0 = torch.device("cuda:" + str(int_devices[0])) + dev1 = torch.device("cuda:" + str(int_devices[1])) + # DDP's default behavior for a multi-device module is "don't replicate." + replica_devices = None + # Tells _test_grad_layout to constructs this process's ConvNet on 2 devices, with 2 layers on each device. + layer_devs = [dev0] * 2 + [dev1] * 2 + local_batch_size = 8 + self._test_grad_layout(replica_devices, layer_devs, local_batch_size) + + @requires_nccl() + @skip_if_lt_x_gpu(2) + def test_param_layout_mismatch_error(self): + store = c10d.FileStore(self.file_name, self.world_size) + process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) + + dev0 = torch.device("cuda:" + str(gpus_for_rank(self.world_size)[self.rank][0])) + layer_devs = dev0 + layer_formats = ( + [torch.contiguous_format] * 4 + if self.rank == 0 + else [torch.channels_last] * 4 + ) + layer_dtypes = [torch.float] * 4 + + m = ConvNet(layer_devs, layer_formats, layer_dtypes) + if self.rank == 0: + m_ddp = DistributedDataParallel( + m, device_ids=[dev0], process_group=process_group + ) + else: + with self.assertRaisesRegex( + RuntimeError, + ".* appears not to match strides of the same param in process 0", + ): + m_ddp = DistributedDataParallel( + m, device_ids=[dev0], process_group=process_group + ) + + def _gpu_model_with_ddp_comm_hook( + self, process_group, hook=None, gradient_as_bucket_view=False, state=None + ): + device_id = gpus_for_rank(self.world_size)[self.rank][0] + gpu_model = DistributedDataParallel( + ModuleForDdpCommHook().to(device_id), + device_ids=[device_id], + process_group=process_group, + gradient_as_bucket_view=gradient_as_bucket_view, + ) + + # Register a DDP communication hook if any. + if hook is not None: + gpu_model.register_comm_hook(state, hook) + + return gpu_model + + @requires_nccl() + @skip_if_lt_x_gpu(2) + def test_ddp_comm_hook_future_passing_gpu_nccl(self): + """ + This unit test verifies whether the Future object is passed properly using nccl backend. + The hook callback function creates a Future object and sets a value to it. + """ + store = c10d.FileStore(self.file_name, self.world_size) + process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) + + # Get GPU model with simple_hook registered. + gpu_model = self._gpu_model_with_ddp_comm_hook(process_group, self._simple_hook) + + # check whether the grads are equal to what simple_hook's then callback returns. + # without the comm_hook, result would be 0.25 * torch.ones(2, 2). 
+ self._run_and_verify_hook(gpu_model, 8, 2 * torch.ones(2, 2)) + + def _test_ddp_comm_hook_allreduce_hook_nccl(self, gradient_as_bucket_view=False): + """ + This unit test verifies whether a DDP communication hook that just calls + allreduce gives the same result with the case of no hook registered. + Without the then callback, the future_value in reducer is no longer + a PyObject, and this unit test verifies future_value is properly checked. + """ + store = c10d.FileStore(self.file_name, self.world_size) + process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) + + def allreduce_hook(state: object, bucket: dist.GradBucket) -> torch._C.Future: + tensors = [bucket.get_tensor() / self.world_size] + return process_group.allreduce(tensors).get_future() + + # Get GPU model with allreduce_hook registered. + gpu_model = self._gpu_model_with_ddp_comm_hook( + process_group, allreduce_hook, gradient_as_bucket_view + ) + + # check whether the grads are equal to what DDP without hook would return. + self._run_and_verify_hook(gpu_model, 8, 0.25 * torch.ones(2, 2)) + + def _test_default_ddp_comm_hooks_nccl(self, gradient_as_bucket_view=False): + """ + This unit test verifies whether default Python DDP communication hooks ALLREDUCE and FP16_COMPRESS + can give the same result with the case of no hook registered. + """ + store = c10d.FileStore(self.file_name, self.world_size) + process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) + + # For these default DDP comm hooks, the only state is process group. + state = process_group + for hook in [default.allreduce_hook, default.fp16_compress_hook]: + # Get GPU model with the hook registered. + # The first arg 'process_group' is used for initializing the test environment, + # so it cannot be replaced by 'state', although they have the same value. + gpu_model = self._gpu_model_with_ddp_comm_hook( + process_group, hook, gradient_as_bucket_view, state + ) + + # check whether the grads are equal to what DDP without hook would return. + self._run_and_verify_hook(gpu_model, 8, 0.25 * torch.ones(2, 2)) + + def _test_fp16_compress_wrapper(self, gradient_as_bucket_view=False): + """ + This unit test verifies whether wrapping the ALLREDUCE and POWER_SGD hooks with + the FP16_WRAPPER can give the same result as when there is no hook registered. + """ + store = c10d.FileStore(self.file_name, self.world_size) + process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) + powerSGD_state = powerSGD.PowerSGDState(process_group=process_group) + + hook_args = [(powerSGD.powerSGD_hook, powerSGD_state), (default.allreduce_hook, process_group)] + + for hook, state in hook_args: + gpu_model = self._gpu_model_with_ddp_comm_hook( + process_group, + default.fp16_compress_wrapper(hook), + gradient_as_bucket_view, + state + ) + + # check whether the grads are equal to what DDP without hook would return. + self._run_and_verify_hook(gpu_model, 8, 0.25 * torch.ones(2, 2)) + + def _test_powerSGD_ddp_comm_hook_nccl(self, gradient_as_bucket_view=False): + """ + This unit test verifies whether Python DDP communication hook POWER_SGD + can give the same result with the case of no hook registered. + """ + store = c10d.FileStore(self.file_name, self.world_size) + process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) + + # Get GPU model with the hook registered. + # Test the hook with different algorithmic configs. 
+ for use_error_feedback, warm_start in product([True, False], [True, False]): + state = powerSGD.PowerSGDState( + process_group=process_group, + matrix_approximation_rank=1, + use_error_feedback=use_error_feedback, + warm_start=warm_start, + ) + for hook in [powerSGD.powerSGD_hook, powerSGD.batched_powerSGD_hook]: + gpu_model = self._gpu_model_with_ddp_comm_hook( + process_group, hook, gradient_as_bucket_view, state + ) + + # check whether the grads are equal to what DDP without hook would return. + self._run_and_verify_hook(gpu_model, 8, 0.25 * torch.ones(2, 2)) + + def _test_builtin_ddp_comm_hooks_nccl(self, gradient_as_bucket_view=False): + """ + This unit test verifies whether built-in C++ DDP communication hooks ALLREDUCE and FP16_COMPRESS + can give the same result with the case of no hook registered. + """ + store = c10d.FileStore(self.file_name, self.world_size) + process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) + + for comm_hook_type in [ + dist.BuiltinCommHookType.ALLREDUCE, + dist.BuiltinCommHookType.FP16_COMPRESS, + ]: + # Get GPU model with the built-in communication hook. + gpu_model = self._gpu_model_with_builtin_ddp_comm_hook( + process_group, comm_hook_type, gradient_as_bucket_view + ) + + # check whether the grads are equal to what DDP without hook would return. + self._run_and_verify_hook(gpu_model, 8, 0.25 * torch.ones(2, 2)) + + @requires_nccl() + @skip_if_lt_x_gpu(2) + def test_ddp_comm_hook_allreduce_hook_nccl(self): + self._test_ddp_comm_hook_allreduce_hook_nccl() + + @requires_nccl() + @skip_if_lt_x_gpu(2) + def test_default_ddp_comm_hooks_nccl(self): + self._test_default_ddp_comm_hooks_nccl() + + @requires_nccl() + @skip_if_lt_x_gpu(2) + def test_fp16_compress_wrapper_nccl(self): + self._test_fp16_compress_wrapper() + + @requires_nccl() + @skip_if_lt_x_gpu(2) + def test_builtin_ddp_comm_hooks_nccl(self): + self._test_builtin_ddp_comm_hooks_nccl() + + @requires_nccl() + @skip_if_lt_x_gpu(2) + def test_powerSGD_ddp_comm_hook_nccl(self): + self._test_powerSGD_ddp_comm_hook_nccl() + + @requires_nccl() + @skip_if_lt_x_gpu(2) + def test_ddp_comm_hook_allreduce_hook_nccl_grad_is_view(self): + self._test_ddp_comm_hook_allreduce_hook_nccl(gradient_as_bucket_view=True) + + @requires_nccl() + @skip_if_lt_x_gpu(2) + def test_default_ddp_comm_hooks_nccl_is_view(self): + self._test_default_ddp_comm_hooks_nccl(gradient_as_bucket_view=True) + + @requires_nccl() + @skip_if_lt_x_gpu(2) + def test_fp16_compress_wrapper_is_view(self): + self._test_fp16_compress_wrapper(gradient_as_bucket_view=True) + + @requires_nccl() + @skip_if_lt_x_gpu(2) + def test_builtin_ddp_comm_hooks_nccl_grad_is_view(self): + self._test_builtin_ddp_comm_hooks_nccl(gradient_as_bucket_view=True) + + @requires_nccl() + @skip_if_lt_x_gpu(2) + def test_powerSGD_ddp_comm_hook_nccl_grad_is_view(self): + self._test_powerSGD_ddp_comm_hook_nccl(gradient_as_bucket_view=True) + + @requires_nccl() + @skip_if_lt_x_gpu(2) + def test_ddp_comm_hook_allreduce_with_then_hook_nccl(self): + """ + This unit test verifies whether a DDP communication hook that calls allreduce and then + multiplies the result by ten and divides by two gives the expected result. 
+        """
+        store = c10d.FileStore(self.file_name, self.world_size)
+        process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)
+
+        def allreduce_with_then_hook(
+            state: object, bucket: dist.GradBucket
+        ) -> torch.futures.Future:
+            tensors = [bucket.get_tensor() / self.world_size]
+            fut = process_group.allreduce(tensors).get_future()
+
+            def mult(fut):
+                # Multiply the result by 10.
+                return [10 * t for t in fut.value()]
+
+            def div(fut):
+                # Divide the result by 2.
+                return [0.5 * t for t in fut.value()]
+
+            return fut.then(mult).then(div)
+
+        # Get GPU model with allreduce_with_then_hook registered.
+        gpu_model = self._gpu_model_with_ddp_comm_hook(
+            process_group, allreduce_with_then_hook
+        )
+
+        # check whether the grads are equal to what allreduce returns multiplied by 5.
+        # without the comm_hook, the result would still be 0.25 * torch.ones(2, 2).
+        self._run_and_verify_hook(gpu_model, 8, 1.25 * torch.ones(2, 2))
+
+    class AcceptsParam(torch.nn.Module):
+        def __init__(self, p, factor):
+            super().__init__()
+            self.a = p
+            self.f = factor
+
+        def forward(self, input):
+            return input + self.a * self.f
+
+    @requires_nccl()
+    @skip_if_lt_x_gpu(2)
+    def test_ddp_weight_sharing(self):
+        store = c10d.FileStore(self.file_name, self.world_size)
+        process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)
+
+        size = 2048 * 2048
+        dev = self.rank
+        world = self.world_size
+
+        p = torch.nn.Parameter(torch.randn(size, requires_grad=True))
+
+        for try_set_to_none, use_bucket_view in product((False, True), (False, True)):
+            m = torch.nn.Sequential(self.AcceptsParam(p, dev + 1),
+                                    self.AcceptsParam(p, dev + 1)).cuda(dev)
+
+            m = torch.nn.parallel.DistributedDataParallel(m,
+                                                          bucket_cap_mb=1,
+                                                          gradient_as_bucket_view=use_bucket_view,
+                                                          device_ids=[dev],
+                                                          process_group=process_group)
+
+            for i in range(3):
+                m.zero_grad(set_to_none=try_set_to_none)
+                m(1).sum().backward()
+
+                # Each param value is multiplied by "rank + 1" twice in forward, so the grad
+                # values produced by a particular rank should be 2. * (rank + 1).
+                # Summing these over ranks and dividing by world size gives the expected result:
+                analytic = torch.full_like(p, 2. * (world * (world + 1.) / 2.) / world, device=dev)
+                for name, p in m.named_parameters():
+                    self.assertEqual(p.grad, analytic, "mismatch at " + name + ".grad for " +
+                                     "set_to_none = {}, use_bucket_view = {}".format(try_set_to_none,
+                                                                                     use_bucket_view))
+
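The then-style hooks in these tests rely on torch.futures.Future chaining: each .then callback receives the completed upstream future, and whatever it returns becomes the value of the next future in the chain. The snippet below is a minimal, standalone sketch of that mechanism, independent of any process group; the tensors and scale factors are placeholders, not taken from the tests.

# Illustrative sketch of the Future chaining used by the then-style comm hooks.
import torch

fut = torch.futures.Future()
# Each callback sees the completed upstream future and returns the next value.
chained = fut.then(lambda f: [2 * t for t in f.value()]).then(
    lambda f: [t / 4 for t in f.value()]
)

fut.set_result([torch.ones(2)])
print(chained.wait())  # [tensor([0.5000, 0.5000])]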
+    # A list of tests for ddp with activation checkpointing
+    # when gradient_as_bucket_view=True, False.
+    # Most of the tests are adapted from
+    # https://github.com/facebookresearch/fairscale/blob/master/tests/nn/pipe/test_checkpoint_ddp.py
+    class CheckpointOnceModule(nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.l1 = nn.Linear(2000, 2000)
+            self.l2 = nn.Linear(2000, 2000)
+
+        def forward(self, inp):
+            x = self.l1(inp)
+            x = checkpoint(self.l2, x)
+            return x
+
+    class CheckpointTwiceModule(CheckpointOnceModule):
+        def __init__(self):
+            super().__init__()
+
+        def forward(self, inp):
+            x = self.l1(inp)
+            x = checkpoint(self.l2, x)
+            x = checkpoint(self.l2, x)
+            return x
+
+    def _test_ddp_checkpointing(self, checkpoint_once, process_group, use_bucket_view, find_unused_parameters=False):
+        # to reproduce the same training results
+        torch.cuda.set_device(self.rank)
+        torch.manual_seed(31415)
+        if checkpoint_once:
+            model = self.CheckpointOnceModule().cuda()
+        else:
+            model = self.CheckpointTwiceModule().cuda()
+        model = nn.parallel.DistributedDataParallel(model,
+                                                    bucket_cap_mb=1,
+                                                    gradient_as_bucket_view=use_bucket_view,
+                                                    device_ids=[self.rank],
+                                                    process_group=process_group,
+                                                    find_unused_parameters=find_unused_parameters)
+        input_tensor = torch.rand((64, 2000), device="cuda", requires_grad=True)
+        output_tensor = model(input_tensor)
+        output_tensor.sum().backward()
+        return model
+
+    # DDP works as expected when a layer is checkpointed only once
+    @requires_nccl()
+    @unittest.skip("TODO: Test is always failing - https://github.com/pytorch/pytorch/issues/55071")
+    def test_ddp_checkpointing_once(self):
+        store = c10d.FileStore(self.file_name, self.world_size)
+        process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)
+        for use_bucket_view in (True, False):
+            model = self._test_ddp_checkpointing(checkpoint_once=True,
+                                                 process_group=process_group,
+                                                 use_bucket_view=use_bucket_view)
+            norm = 0.0
+            for p in model.parameters():
+                self.assertTrue(p.grad is not None)
+                norm += p.grad.norm().item()
+            assert numpy.allclose(norm, 78053), norm
+
+    # DDP will fail when there are unused parameters in the model
+    @requires_nccl()
+    @skip_if_lt_x_gpu(2)
+    def test_ddp_checkpointing_unused_params(self):
+        store = c10d.FileStore(self.file_name, self.world_size)
+        process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)
+        for use_bucket_view in (True, False):
+            with self.assertRaisesRegex(
+                RuntimeError,
+                "Expected to mark a variable ready only once.",
+            ):
+                model = self._test_ddp_checkpointing(checkpoint_once=True,
+                                                     process_group=process_group,
+                                                     use_bucket_view=use_bucket_view,
+                                                     find_unused_parameters=True)
+
+    # DDP will fail when the same layer is checkpointed twice
+    @requires_nccl()
+    @skip_if_lt_x_gpu(2)
+    def test_ddp_checkpointing_twice(self):
+        store = c10d.FileStore(self.file_name, self.world_size)
+        process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)
+        for use_bucket_view in (True, False):
+            with self.assertRaisesRegex(
+                RuntimeError,
+                "Expected to mark a variable ready only once.",
+            ):
+                model = self._test_ddp_checkpointing(checkpoint_once=False,
+                                                     process_group=process_group,
+                                                     use_bucket_view=use_bucket_view,
+                                                     find_unused_parameters=True)
+
+    # DDP works as expected if there is weight sharing among layers
+    @requires_nccl()
+    @unittest.skip("TODO: Test is always failing - https://github.com/pytorch/pytorch/issues/55071")
+    def test_ddp_checkpointing_weight_sharing(self):
+        store = c10d.FileStore(self.file_name, self.world_size)
+        process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)
+
torch.cuda.set_device(self.rank) + for use_bucket_view in (True, False): + torch.manual_seed(31415) + l1 = nn.Linear(2000, 2000) + l2 = nn.Linear(2000, 2000) + l1.weight = l2.weight + model = nn.Sequential(l1, l2).cuda() + model = nn.parallel.DistributedDataParallel(model, + bucket_cap_mb=1, + gradient_as_bucket_view=use_bucket_view, + device_ids=[self.rank], + process_group=process_group) + input_tensor = torch.rand((64, 2000), device="cuda", requires_grad=True) + output_tensor = checkpoint(model, input_tensor) + output_tensor.sum().backward() + norm = 0.0 + for p in model.parameters(): + self.assertTrue(p.grad is not None) + norm += p.grad.norm().item() + assert numpy.allclose(norm, 57004), norm + + +@unittest.skipIf( + TEST_WITH_TSAN, + "TSAN is not fork-safe since we're forking in a multi-threaded environment", +) +class NcclErrorHandlingTest(MultiProcessTestCase): + def setUp(self): + super(NcclErrorHandlingTest, self).setUp() + # Need to skip return code checking for these tests since the child + # processes don't exit cleanly. + self.skip_return_code_checks = [ + self.test_nccl_errors_blocking_abort.__wrapped__, + self.test_nccl_errors_blocking_sigkill.__wrapped__, + self.test_nccl_errors_blocking_sigterm.__wrapped__, + self.test_nccl_errors_blocking_nonzero_exit.__wrapped__, + ] + # NCCL_BLOCKING_WAIT overrides NCCL_ASYNC_ERROR_HANDLING hence tests + # that use NCCL_BLOCKING_WAIT will test it as expected. + os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "1" + self._fork_processes() + + def tearDown(self): + super(NcclErrorHandlingTest, self).tearDown() + try: + os.remove(self.file_name) + except OSError: + pass + + @property + def op_timeout_sec(self): + return 1 + + @property + def world_size(self): + return 3 + + @property + def blocking_wait_error_msg(self): + return "Caught collective operation timeout" + + def _run_all_reduce(self, pg): + pg.allreduce(torch.rand(10).cuda(self.rank)) + + @requires_nccl() + @requires_nccl_version(2400, "Need NCCL 2.4+ for error checking") + @skip_if_lt_x_gpu(3) + @skip_if_rocm + def test_nccl_errors_nonblocking(self): + # Note: we unset and restore NCCL_ASYNC_ERROR_HANDLING for this test + # since test_c10d_common runs with async error handling by default, but this + # tests behavior when it is not enabled. + prev_nccl_async_error_handling = os.environ.get("NCCL_ASYNC_ERROR_HANDLING", None) + os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "0" + store = c10d.FileStore(self.file_name, self.world_size) + process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) + process_group.allreduce(torch.rand(10).cuda(self.rank)) + if self.rank == 0: + # This allreduce does not block Python thread as allreduce enqueues + # the cuda operation, and then wait only blocks the current cuda + # stream. + work = process_group.allreduce(torch.rand(10).cuda(self.rank)) + work.wait() + + # Now the work scheduled next should hang forever since the previous + # allreduce will never complete. 
+ t = threading.Thread(target=self._run_all_reduce, args=(process_group,)) + t.daemon = True + t.start() + t.join(int(get_timeout(self.id()) / 5)) + self.assertTrue(t.is_alive()) + + if prev_nccl_async_error_handling is not None: + os.environ["NCCL_ASYNC_ERROR_HANDLING"] = prev_nccl_async_error_handling + + def _test_nccl_errors_blocking(self, func): + store = c10d.FileStore(self.file_name, self.world_size) + process_group = c10d.ProcessGroupNCCL( + store, + self.rank, + self.world_size, + timeout=timedelta(seconds=self.op_timeout_sec), + ) + process_group.allreduce(torch.rand(10).cuda(self.rank)) + if self.rank == 0: + work = process_group.allreduce(torch.rand(10).cuda(self.rank)) + with self.assertRaisesRegex(RuntimeError, self.blocking_wait_error_msg): + # Operation would time out in blocking mode. + work.wait() + # Run some GPU operations to make sure cuda has not gotten stuck. + # It was observed cuda could get stuck if NCCL communicators were + # not properly aborted before throwing RuntimeError. + a = torch.rand(10).cuda(self.rank) + elif self.rank == 1: + # Clean up structures (ex: files for FileStore before going down) + del process_group + func() + else: + # Wait for timeout + time.sleep(2 * self.op_timeout_sec) + + # Now verify communicators on this rank have been aborted by the watchdog thread. + self._wait_for_comm_abort(process_group) + + @with_nccl_blocking_wait + @requires_nccl() + @requires_nccl_version(2400, "Need NCCL 2.4+ for error checking") + @skip_if_lt_x_gpu(3) + @skip_if_rocm + def test_nccl_errors_blocking_clean_exit(self): + self._test_nccl_errors_blocking(lambda: sys.exit(0)) + + @with_nccl_blocking_wait + @requires_nccl() + @requires_nccl_version(2400, "Need NCCL 2.4+ for error checking") + @skip_if_lt_x_gpu(3) + @skip_if_rocm + def test_nccl_errors_blocking_nonzero_exit(self): + self._test_nccl_errors_blocking(lambda: sys.exit(1)) + + @with_nccl_blocking_wait + @requires_nccl() + @requires_nccl_version(2400, "Need NCCL 2.4+ for error checking") + @skip_if_lt_x_gpu(3) + @skip_if_rocm + def test_nccl_errors_blocking_abort(self): + self._test_nccl_errors_blocking(lambda: os.abort()) + + @with_nccl_blocking_wait + @requires_nccl() + @requires_nccl_version(2400, "Need NCCL 2.4+ for error checking") + @skip_if_lt_x_gpu(3) + @skip_if_rocm + def test_nccl_errors_blocking_sigkill(self): + self._test_nccl_errors_blocking(lambda: os.kill(os.getpid(), signal.SIGKILL)) + + @with_nccl_blocking_wait + @requires_nccl() + @requires_nccl_version(2400, "Need NCCL 2.4+ for error checking") + @skip_if_lt_x_gpu(3) + @skip_if_rocm + def test_nccl_errors_blocking_sigterm(self): + self._test_nccl_errors_blocking(lambda: os.kill(os.getpid(), signal.SIGTERM)) + + @with_nccl_blocking_wait + @requires_nccl() + @requires_nccl_version(2400, "Need NCCL 2.4+ for error checking") + @skip_if_lt_x_gpu(3) + def test_nccl_blocking_wait_with_barrier(self): + store = c10d.FileStore(self.file_name, self.world_size) + process_group = c10d.ProcessGroupNCCL( + store, + self.rank, + self.world_size, + timeout=timedelta(seconds=self.op_timeout_sec), + ) + process_group.barrier().wait() + if self.rank == 0: + with self.assertRaisesRegex(RuntimeError, self.blocking_wait_error_msg): + # This should timeout + process_group.barrier().wait() + + def _run_invalid_nccl_blocking_wait_env(self, val): + os.environ["NCCL_BLOCKING_WAIT"] = val + store = c10d.FileStore(self.file_name, self.world_size) + with self.assertRaises(RuntimeError): + process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) + + 
@requires_nccl()
+    @skip_if_lt_x_gpu(3)
+    def test_invalid_nccl_blocking_wait_env(self):
+        self._run_invalid_nccl_blocking_wait_env("abc")
+        self._run_invalid_nccl_blocking_wait_env("-1")
+        self._run_invalid_nccl_blocking_wait_env("2147483647")
+        self._run_invalid_nccl_blocking_wait_env("4294967295")
+
+    def _wait_for_comm_abort(self, process_group):
+        """
+        Waits for the watchdog thread to abort communicators for the process group.
+        """
+        while True:
+            try:
+                process_group.allreduce(torch.rand(10).cuda(self.rank))
+            except Exception as e:
+                if "NCCL communicator was aborted" in str(e):
+                    return
+                else:
+                    raise e
+            time.sleep(1)
+
+    @with_nccl_blocking_wait
+    @requires_nccl()
+    @skip_if_lt_x_gpu(3)
+    def test_nccl_timeout(self):
+        store = c10d.FileStore(self.file_name, self.world_size)
+
+        # Initialize process_group.
+        timeout = 1
+        process_group = c10d.ProcessGroupNCCL(
+            store, self.rank, self.world_size, timeout=timedelta(seconds=timeout)
+        )
+        process_group.allreduce(torch.rand(10).cuda(self.rank)).wait()
+
+        if self.rank == 0:
+            # This should time out in about 1 second.
+            start = time.time()
+            # The watchdog may abort timed-out work, resulting in a NCCL error instead of an operation timeout.
+            with self.assertRaisesRegex(RuntimeError, self.blocking_wait_error_msg):
+                process_group.allreduce(torch.rand(10).cuda(self.rank)).wait()
+        else:
+            # Sleep to ensure timeout.
+            time.sleep(2 * timeout)
+
+        self._wait_for_comm_abort(process_group)
+
+
+@unittest.skipIf(
+    TEST_WITH_TSAN,
+    "TSAN is not fork-safe since we're forking in a multi-threaded environment",
+)
+class CommTest(test_c10d_common.AbstractCommTest, MultiProcessTestCase):
+
+    def setUp(self):
+        super(CommTest, self).setUp()
+        # NCCL_BLOCKING_WAIT overrides NCCL_ASYNC_ERROR_HANDLING hence tests
+        # that use NCCL_BLOCKING_WAIT will test it as expected.
+        os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "1"
+        if sys.platform == "win32":
+            self._spawn_processes()
+        else:
+            self._fork_processes()
+
+    def tearDown(self):
+        super(CommTest, self).tearDown()
+        try:
+            os.remove(self.file_name)
+        except OSError:
+            pass
+
+    def _test_broadcast_coalesced(self, process_group, device, root_rank):
+        half = torch.float16
+
+        # No support for float16 for CPU tensors
+        if device == torch.device("cpu"):
+            half = torch.float32
+
+        target = torch.arange(60, dtype=half, device=device).chunk(5)
+        target += torch.arange(60, dtype=torch.float32, device=device).chunk(5)
+        target += torch.arange(60, dtype=half, device=device).chunk(5)
+        target += torch.arange(60, dtype=torch.float64, device=device).chunk(5)
+        target += torch.arange(60, dtype=half, device=device).chunk(5)
+        target += torch.arange(60, dtype=torch.float32, device=device).chunk(5)
+
+        # The tensors to pass to broadcast are identical to the target
+        # only on the process that is the root of the broadcast.
+ if self.rank == root_rank: + tensors = list(tensor.clone() for tensor in target) + else: + tensors = list(torch.zeros_like(tensor) for tensor in target) + + if self.rank != root_rank: + self.assertNotEqual(tensors, target) + + c10d._broadcast_coalesced( + process_group, tensors, buffer_size=256, src=root_rank + ) + + if self.rank != root_rank: + self.assertEqual(tensors, target) + + @requires_nccl() + @skip_if_lt_x_gpu(2) + def test_broadcast_coalesced_nccl(self): + store = c10d.FileStore(self.file_name, self.world_size) + process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) + device = torch.device("cuda:%d" % self.rank) + ranks = [0, 1] + for root_rank in ranks: + self._test_broadcast_coalesced(process_group, device, root_rank) + + @requires_nccl() + @skip_if_lt_x_gpu(2) + def test_sequence_num_set_default_pg_nccl(self): + torch.cuda.set_device(self.rank) + self._test_sequence_num_set_default_pg(backend="nccl") + + @requires_nccl() + @skip_if_lt_x_gpu(2) + def test_sequence_num_set_nccl_new_group(self): + torch.cuda.set_device(self.rank) + self._test_sequence_num_set_new_group(backend="nccl") + + @requires_nccl() + @skip_if_lt_x_gpu(2) + def test_pass_nccl_options_high_priority_stream(self): + pg_opts = c10d.ProcessGroupNCCL.Options() + pg_opts.is_high_priority_stream = True + + store = c10d.FileStore(self.file_name, self.world_size) + # Test init_process_group accepts options + dist.init_process_group( + "nccl", + world_size=self.world_size, + rank=self.rank, + store=store, + pg_options=pg_opts + ) + + # Test with new_group + pg = c10d.new_group([0, 1], pg_options=pg_opts) + # test if the process group constructed with high priority stream + self.assertTrue(pg.options.is_high_priority_stream) + # test the process group works as expected + t = torch.tensor([self.rank + 1] * 10).cuda(self.rank) + pg.allreduce(t).wait() + expected_tensor = torch.tensor([3] * 10).cuda(self.rank) + self.assertEqual(expected_tensor, t) + + @requires_nccl() + @skip_if_lt_x_gpu(4) + def test_nccl_barrier(self): + store = c10d.FileStore(self.file_name, self.world_size) + c10d.init_process_group( + backend="nccl", rank=self.rank, world_size=self.world_size, store=store + ) + + t = torch.tensor([self.rank + 1] * 10).cuda(2 * self.rank) + c10d.all_reduce(t) + expected_tensor = torch.tensor([3] * 10).cuda(2 * self.rank) + self.assertEqual(expected_tensor, t) + + # Test with new_group + pg = c10d.new_group([0, 1]) + t = torch.tensor([self.rank + 1] * 10).cuda(2 * self.rank) + pg.allreduce(t).wait() + self.assertEqual(expected_tensor, t) + + pg = c10d.new_group([0]) + if self.rank == 0: + t = torch.tensor([self.rank + 1] * 10).cuda(2 * self.rank) + expected_tensor = torch.tensor([self.rank + 1] * 10).cuda(2 * self.rank) + pg.allreduce(t).wait() + self.assertEqual(expected_tensor, t) + + pg = c10d.new_group([1]) + if self.rank == 1: + t = torch.tensor([self.rank + 1] * 10).cuda(2 * self.rank) + expected_tensor = torch.tensor([self.rank + 1] * 10).cuda(2 * self.rank) + pg.allreduce(t).wait() + self.assertEqual(expected_tensor, t) + + @requires_nccl() + @skip_if_lt_x_gpu(4) + def test_nccl_barrier_timeout(self): + store = c10d.FileStore(self.file_name, self.world_size) + if self.rank == 0: + with self.assertRaisesRegex( + RuntimeError, "Timed out initializing process group" + ): + c10d.init_process_group( + backend="nccl", + rank=self.rank, + world_size=self.world_size, + store=store, + timeout=timedelta(seconds=1), + ) + + @requires_nccl() + @skip_if_lt_x_gpu(4) + def 
test_nccl_barrier_timeout_new_group(self): + store = c10d.FileStore(self.file_name, self.world_size) + c10d.init_process_group( + backend="nccl", + rank=self.rank, + world_size=self.world_size, + store=store, + timeout=timedelta(seconds=1), + ) + + if self.rank == 0: + with self.assertRaisesRegex( + RuntimeError, "Timed out initializing process group" + ): + c10d.new_group([0, 1], timeout=timedelta(seconds=1)) + + with self.assertRaisesRegex( + RuntimeError, "Timed out initializing process group" + ): + c10d.new_group([0], timeout=timedelta(seconds=1)) + + @requires_nccl() + @skip_if_lt_x_gpu(4) + def test_nccl_barrier_timeout_new_group_non_member(self): + store = c10d.FileStore(self.file_name, self.world_size) + c10d.init_process_group( + backend="nccl", + rank=self.rank, + world_size=self.world_size, + store=store, + timeout=timedelta(seconds=1), + ) + + if self.rank == 1: + with self.assertRaisesRegex( + RuntimeError, "Timed out initializing process group" + ): + c10d.new_group([0, 1], timeout=timedelta(seconds=1)) + + with self.assertRaisesRegex( + RuntimeError, "Timed out initializing process group" + ): + c10d.new_group([0], timeout=timedelta(seconds=1)) + + @requires_nccl() + @skip_if_lt_x_gpu(2) + def test_nccl_barrier_device_ids(self): + store = c10d.FileStore(self.file_name, self.world_size) + c10d.init_process_group( + backend="nccl", rank=self.rank, world_size=self.world_size, store=store + ) + + c10d.barrier(device_ids=[self.rank]) + + @requires_nccl() + @skip_if_lt_x_gpu(2) + def test_nccl_barrier_device_ids_function_argument(self): + store = c10d.FileStore(self.file_name, self.world_size) + c10d.init_process_group( + backend="nccl", rank=self.rank, world_size=self.world_size, store=store + ) + + with self.assertRaisesRegex(RuntimeError, "Invalid function argument"): + c10d.barrier(device_ids=self.rank) + + +if __name__ == "__main__": + assert ( + not torch.cuda._initialized + ), "test_distributed must not have initialized CUDA context on main process" + + run_tests() diff --git a/test/distributed/test_c10d_spawn.py b/test/distributed/test_c10d_spawn.py index 3bf1f6a3d22..5cb37fc4d35 100644 --- a/test/distributed/test_c10d_spawn.py +++ b/test/distributed/test_c10d_spawn.py @@ -1,21 +1,11 @@ -import copy -import os import sys import tempfile -import unittest import torch import torch.distributed as c10d import torch.multiprocessing as mp -import torch.nn as nn - -from torch.testing._internal.common_cuda import TEST_CUDA, TEST_MULTIGPU -from torch.testing._internal.common_distributed import requires_gloo, \ - create_device, MultiProcessTestCase, skip_if_lt_x_gpu -from torch.testing._internal.common_utils import TestCase, load_tests, \ - run_tests -from torch.testing._internal.common_utils import NO_MULTIPROCESSING_SPAWN, TEST_WITH_TSAN - +from torch.testing._internal.common_utils import NO_MULTIPROCESSING_SPAWN +from torch.testing._internal.common_utils import load_tests # Torch distributed.nn is not available in windows # check #42095, it errors on import. @@ -25,7 +15,6 @@ try: except ImportError: _torch_dist_nn_available = False - # load_tests from common_utils is used to automatically filter tests for # sharding on sandcastle. 
This line silences flake warnings load_tests = load_tests @@ -34,38 +23,14 @@ if not c10d.is_available(): print('c10d not available, skipping tests', file=sys.stderr) sys.exit(0) - if NO_MULTIPROCESSING_SPAWN: print('spawn not available, skipping tests', file=sys.stderr) sys.exit(0) -NO_NCCL = not hasattr(c10d, "ProcessGroupNCCL") - - -class ProcessGroupShareTensorTest(TestCase): - +class AbstractProcessGroupShareTensorTest(object): world_size = 2 - @classmethod - def opts(cls, threads=2): - opts = c10d.ProcessGroupGloo._Options() - opts._timeout = 5.0 - opts._devices = [create_device(interface='lo')] - opts._threads = threads - return opts - - @classmethod - def _init_pg_gloo(cls, rank, filename, world_size): - store = c10d.FileStore(filename, world_size) - return c10d.ProcessGroupGloo( - store, rank, world_size, ProcessGroupShareTensorTest.opts()) - - @classmethod - def _init_pg_nccl(cls, rank, filename, world_size): - store = c10d.FileStore(filename, world_size) - return c10d.ProcessGroupNCCL(store, rank, world_size) - def _test_multiprocess(self, f, shared_tensors, init_pg, n_output): ws = self.world_size # file store will delete the test file on destruction @@ -109,24 +74,6 @@ class ProcessGroupShareTensorTest(TestCase): c2p.put((rank, torch.zeros(2, 2), xs[0].to("cpu"))) p2c.get() - @unittest.skipIf(not TEST_MULTIGPU, "At least 2 CUDA GPUS needed") - def test_shared_broadcast_gloo(self): - self._test_multiprocess( - ProcessGroupShareTensorTest._test_broadcast_process, - [torch.ones(2, 2).to(i) * i for i in range(self.world_size)], - ProcessGroupShareTensorTest._init_pg_gloo, - 1) - - - @unittest.skipIf(not TEST_MULTIGPU, "At least 2 CUDA GPUS needed") - @unittest.skipIf(NO_NCCL, "NCCL needed") - def test_shared_broadcast_nccl(self): - self._test_multiprocess( - ProcessGroupShareTensorTest._test_broadcast_process, - [torch.ones(2, 2).to(i) * i for i in range(self.world_size)], - ProcessGroupShareTensorTest._init_pg_nccl, - 1) - @classmethod def _test_allreduce_process( cls, rank, filename, shared_tensors, world_size, init_pg, c2p, p2c): @@ -136,44 +83,6 @@ class ProcessGroupShareTensorTest(TestCase): c2p.put((rank, torch.ones(2, 2) * 2, xs[0].to("cpu"))) p2c.get() - @unittest.skipIf(not TEST_MULTIGPU, "At least 2 CUDA GPUS needed") - def test_shared_allreduce_gloo(self): - self._test_multiprocess( - ProcessGroupShareTensorTest._test_allreduce_process, - [torch.ones(2, 2).to(i) for i in range(self.world_size)], - ProcessGroupShareTensorTest._init_pg_gloo, - 1) - - @unittest.skipIf(not TEST_MULTIGPU, "At least 2 CUDA GPUS needed") - @unittest.skipIf(NO_NCCL, "NCCL needed") - def test_shared_allreduce_nccl(self): - self._test_multiprocess( - ProcessGroupShareTensorTest._test_allreduce_process, - [torch.ones(2, 2).to(i) for i in range(self.world_size)], - ProcessGroupShareTensorTest._init_pg_nccl, - 1) - - @classmethod - def _test_reduce_process( - cls, rank, filename, shared_tensors, world_size, init_pg, c2p, p2c): - pg = init_pg(rank, filename, world_size) - x = shared_tensors[rank] - pg.reduce(x, root=0, op=c10d.ReduceOp.SUM).wait() - if rank == 0: - c2p.put((rank, torch.ones(2, 2) * 2, x.to("cpu"))) - else: - c2p.put((rank, torch.ones(2, 2), x.to("cpu"))) - p2c.get() - - @unittest.skipIf(not TEST_MULTIGPU, "At least 2 CUDA GPUS needed") - @unittest.skipIf(NO_NCCL, "NCCL needed") - def test_shared_reduce_nccl(self): - self._test_multiprocess( - ProcessGroupShareTensorTest._test_reduce_process, - [torch.ones(2, 2).to(i) for i in range(self.world_size)], - 
ProcessGroupShareTensorTest._init_pg_nccl,
-            1)
-
     @classmethod
     def _test_allgather_process(
             cls, rank, filename, shared_tensors, world_size, init_pg, c2p, p2c):
@@ -185,322 +94,3 @@ class ProcessGroupShareTensorTest(TestCase):
             c2p.put((rank, torch.ones(2, 2) * i, ys[0][i].to("cpu")))
         p2c.get()
-
-    @unittest.skipIf(not TEST_MULTIGPU, "At least 2 CUDA GPUS needed")
-    def test_shared_allgather_gloo(self):
-        self._test_multiprocess(
-            ProcessGroupShareTensorTest._test_allgather_process,
-            [torch.ones(2, 2).to(i) * i for i in range(self.world_size)],
-            ProcessGroupShareTensorTest._init_pg_gloo,
-            self.world_size)
-
-    @unittest.skipIf(not TEST_MULTIGPU, "At least 2 CUDA GPUS needed")
-    @unittest.skipIf(NO_NCCL, "NCCL needed")
-    def test_shared_allgather_nccl(self):
-        self._test_multiprocess(
-            ProcessGroupShareTensorTest._test_allgather_process,
-            [torch.ones(2, 2).to(i) * i for i in range(self.world_size)],
-            ProcessGroupShareTensorTest._init_pg_nccl,
-            self.world_size)
-
-    @classmethod
-    def _test_allgather_chunk_process(
-            cls, rank, filename, shared_tensor, world_size, init_pg, c2p, p2c):
-        pg = init_pg(rank, filename, world_size)
-        chunks = torch.chunk(shared_tensor, world_size, dim=0)
-        x = chunks[rank]
-        ys = [torch.zeros_like(x) for _ in range(world_size)]
-        pg.allgather(ys, x).wait()
-        c2p.put((rank, chunks[0].to("cpu"), ys[0].to("cpu")))
-        c2p.put((rank, chunks[1].to("cpu"), ys[1].to("cpu")))
-        p2c.get()
-
-    @unittest.skipIf(not TEST_MULTIGPU, "At least 2 CUDA GPUS needed")
-    def test_shared_allgather_chunk_gloo(self):
-        self._test_multiprocess(
-            ProcessGroupShareTensorTest._test_allgather_chunk_process,
-            torch.tensor(range(4)).reshape(2, 2),
-            ProcessGroupShareTensorTest._init_pg_gloo,
-            self.world_size)
-
-
-@unittest.skipIf(TEST_WITH_TSAN, "TSAN is not fork-safe since we're forking in a multi-threaded environment")
-class DistributedDataParallelSingleProcessTest(TestCase):
-    def setUp(self):
-        self.rank = 0
-        self.world_size = 1
-        self.file = tempfile.NamedTemporaryFile(delete=False)  # noqa: P201
-
-    def tearDown(self):
-        try:
-            os.remove(self.file.name)
-        except OSError:
-            pass
-
-    def _test_base(self, net, inp, check_allclose=True):
-        store = c10d.FileStore(self.file.name, self.world_size)
-        process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size)
-        if inp[0].is_cuda:
-            device_ids = [torch.cuda.current_device()]
-        else:
-            device_ids = None
-
-        ddp = nn.parallel.DistributedDataParallel(
-            copy.deepcopy(net),
-            device_ids=device_ids,
-            process_group=process_group
-        )
-
-        net_opt = torch.optim.Adam(net.parameters(), lr=0.001)
-        ddp_opt = torch.optim.Adam(ddp.parameters(), lr=0.001)
-
-        for i, j in zip(ddp.parameters(), net.parameters()):
-            self.assertTrue(i.allclose(j))
-
-        for _ in range(10):
-            net_out = net(*inp)
-            ddp_out = ddp(*inp)
-
-            net_out.sum().backward()
-            ddp_out.sum().backward()
-
-            net_opt.step()
-            ddp_opt.step()
-
-        if check_allclose:
-            for i, j in zip(ddp.parameters(), net.parameters()):
-                self.assertTrue(i.allclose(j))
-
-    @requires_gloo()
-    def test_cpu(self):
-        self._test_base(nn.Linear(2, 2), [torch.randn(30, 2)])
-
-    @requires_gloo()
-    @unittest.skipIf(not TEST_CUDA, "At least 1 CUDA GPUS needed")
-    def test_cuda(self):
-        self._test_base(nn.Linear(2, 2).to(0), [torch.randn(30, 2).to(0)])
-
-    @requires_gloo()
-    @unittest.skipIf(not TEST_CUDA, "At least 1 CUDA GPUS needed")
-    def test_rnn(self):
-        # This test is inspired by the bug reported in
-        # https://github.com/pytorch/pytorch/issues/36268
-        BATCH_SIZE = 12  # Divisible by 2, 3, 4
-        INPUT_DIM = 256
-        OUTPUT_DIM = 256
-        HIDDEN_DIM = 256
-        N_LAYERS = 3
-        SEQ_LEN = 100
-
-        class Net(nn.Module):
-            def __init__(self, input_dim, hidden_dim, output_dim, hidden_layers):
-                super(Net, self).__init__()
-                self.input_dim = input_dim
-                self.hidden_dim = hidden_dim
-                self.output_dim = output_dim
-                self.hidden_layers = hidden_layers
-
-                self.lstm = nn.LSTM(input_dim, hidden_dim, hidden_layers, batch_first=True)
-                self.h2o = nn.Linear(hidden_dim, output_dim)
-
-            def forward(self, x, y):
-                self.lstm.flatten_parameters()
-                h_t, _ = self.lstm(x)
-                output = self.h2o(h_t)
-                loss = nn.functional.mse_loss(output, y)
-                return loss
-
-        net = Net(INPUT_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS).to(0)
-        inp = [
-            torch.randn((BATCH_SIZE, SEQ_LEN, INPUT_DIM)).to(0),
-            torch.rand((BATCH_SIZE, SEQ_LEN, OUTPUT_DIM)).to(0)
-        ]
-
-        # Not checking result allclose as the parameter inconsistency exist
-        # prior to this change. See #37079
-        self._test_base(net, inp, check_allclose=False)
-
-
-class TestDistributedNNFunctions(MultiProcessTestCase):
-    def setUp(self):
-        if not _torch_dist_nn_available:
-            raise unittest.SkipTest("torch.distributed.nn is not available")
-        super(TestDistributedNNFunctions, self).setUp()
-        self._spawn_processes()
-
-    def tearDown(self):
-        super(TestDistributedNNFunctions, self).tearDown()
-        try:
-            os.remove(self.file_name)
-        except OSError:
-            pass
-
-    @property
-    def op_timeout_sec(self):
-        return 1
-
-    @property
-    def world_size(self):
-        return 2
-
-    @requires_gloo()
-    @skip_if_lt_x_gpu(2)
-    def test_broadcast(self):
-        store = c10d.FileStore(self.file_name, self.world_size)
-        # This is required because these functions calls directly to the .dist and needs
-        # the world to be initialized
-        c10d.init_process_group(store=store, rank=self.rank, world_size=self.world_size, backend='gloo')
-        device = torch.device(f"cuda:{self.rank}")
-        x = torch.ones(5, 5, device=device) + self.rank
-        x.requires_grad = True
-        y = torch.distributed.nn.broadcast(x, 1)
-        self.assertEqual(y, 1 + torch.ones(5, 5))
-        z = y.sin().sum()
-        z.backward()
-        # We can't check the gradient of communications numerically so we have to do some calculations
-        if self.rank == 1:
-            self.assertEqual(x.grad, 2 * torch.cos(x))
-        elif self.rank == 0:
-            self.assertEqual(x.grad, torch.zeros(5, 5, device=device))
-
-    @requires_gloo()
-    @skip_if_lt_x_gpu(2)
-    def test_gather(self):
-        store = c10d.FileStore(self.file_name, self.world_size)
-        # This is required because these functions calls directly to the .dist and needs
-        # the world to be initialized
-        c10d.init_process_group(store=store, rank=self.rank, world_size=self.world_size, backend='gloo')
-        device = torch.device(f"cuda:{self.rank}")
-        x = torch.ones(5, 5, device=device) + self.rank
-        x.requires_grad = True
-        tensors = torch.distributed.nn.gather(x, 1)
-        if self.rank == 1:
-            for i, t in enumerate(tensors):
-                self.assertEqual(t, torch.ones(5, 5, device=device) + i)
-        elif self.rank == 0:
-            for i, t in enumerate(tensors):
-                zeros = torch.zeros(5, 5, device=device)
-                self.assertEqual(t, zeros)
-        y = torch.sum(torch.stack(tensors), axis=0)
-        z = y.sin().sum()
-        z.backward()
-
-        # Test gradient
-        x_s = 3 * torch.ones(5, 5, device=device)
-        self.assertEqual(x.grad, x_s.cos())
-
-    @requires_gloo()
-    @skip_if_lt_x_gpu(2)
-    def test_scatter(self):
-        store = c10d.FileStore(self.file_name, self.world_size)
-        # This is required because these functions calls directly to the .dist and needs
-        # the world to be initialized
-        c10d.init_process_group(store=store, rank=self.rank, world_size=self.world_size, backend='gloo')
-        device = torch.device(f"cuda:{self.rank}")
-        x0 = torch.ones(5, 5, device=device)
-        x1 = torch.ones(5, 5, device=device) + 1
-        x0.requires_grad = True
-        x1.requires_grad = True
-
-        y = torch.distributed.nn.scatter([x0, x1], 1)
-        if self.rank == 1:
-            self.assertEqual(y, 1 + torch.ones(5, 5, device=device))
-        elif self.rank == 0:
-            self.assertEqual(y, torch.ones(5, 5, device=device))
-        z = y.sin().sum()
-        z.backward()
-
-        # Test gradient
-        if self.rank == 1:
-            x0_s = torch.ones(5, 5, device=device).cos()
-            x1_s = (2 * torch.ones(5, 5, device=device)).cos()
-            self.assertEqual(x0.grad, x0_s)
-            self.assertEqual(x1.grad, x1_s)
-        if self.rank == 0:
-            self.assertEqual(x0.grad, torch.zeros(5, 5, device=device))
-
-    @requires_gloo()
-    @skip_if_lt_x_gpu(2)
-    def test_reduce(self):
-        store = c10d.FileStore(self.file_name, self.world_size)
-        # This is required because these functions calls directly to the .dist and needs
-        # the world to be initialized
-        c10d.init_process_group(store=store, rank=self.rank, world_size=self.world_size, backend='gloo')
-        device = torch.device(f"cuda:{self.rank}")
-        x = torch.ones(5, 5, device=device) + self.rank
-        x.requires_grad = True
-        y = torch.distributed.nn.reduce(x, 1, op=c10d.ReduceOp.SUM)
-
-        if self.rank == 1:
-            self.assertEqual(y, 3 * torch.ones(5, 5, device=device))
-
-        z = y.sin().sum()
-        z.backward()
-        # Gradients are broadcasted to both ranks
-        x_g = (3 * torch.ones(5, 5, device=device)).cos()
-        self.assertEqual(x.grad, x_g)
-
-    @requires_gloo()
-    @skip_if_lt_x_gpu(2)
-    def test_allreduce(self):
-        store = c10d.FileStore(self.file_name, self.world_size)
-        # This is required because these functions calls directly to the .dist and needs
-        # the world to be initialized
-        c10d.init_process_group(store=store, rank=self.rank, world_size=self.world_size, backend='gloo')
-        device = torch.device(f"cuda:{self.rank}")
-        x = torch.ones(5, 5, device=device) + self.rank
-        x.requires_grad = True
-        y = torch.distributed.nn.all_reduce(x, op=c10d.ReduceOp.SUM)
-
-        self.assertEqual(y, 3 * torch.ones(5, 5, device=device))
-
-        z = y.sin().sum()
-        z.backward()
-        x_g = 2 * (3 * torch.ones(5, 5, device=device)).cos()
-        self.assertEqual(x.grad, x_g)
-
-    @requires_gloo()
-    @skip_if_lt_x_gpu(2)
-    def test_all_gather(self):
-        store = c10d.FileStore(self.file_name, self.world_size)
-        # This is required because these functions calls directly to the .dist and needs
-        # the world to be initialized
-        c10d.init_process_group(store=store, rank=self.rank, world_size=self.world_size, backend='gloo')
-        device = torch.device(f"cuda:{self.rank}")
-        x = torch.ones(5, 5, device=device) + self.rank
-        x.requires_grad = True
-        tensors = torch.distributed.nn.all_gather(x)
-        for i, t in enumerate(tensors):
-            self.assertEqual(t, torch.ones(5, 5, device=device) + i)
-        y = torch.sum(torch.stack(tensors), axis=0)
-        z = y.sin().sum()
-        z.backward()
-
-        x_s = 2 * (3 * torch.ones(5, 5, device=device)).cos()
-        self.assertEqual(x.grad, x_s)
-
-    @requires_gloo()
-    @skip_if_lt_x_gpu(2)
-    def test_all_to_all(self):
-        store = c10d.FileStore(self.file_name, self.world_size)
-        # This is required because these functions calls directly to the .dist and needs
-        # the world to be initialized
-        c10d.init_process_group(store=store, rank=self.rank, world_size=self.world_size, backend='gloo')
-        device = torch.device(f"cuda:{self.rank}")
-        x0 = torch.ones(5, 5, device=device) + 2 * self.rank
-        x1 = torch.ones(5, 5, device=device) + 2 * self.rank
-        x0.requires_grad = True
-        x1.requires_grad = True
-        tensors = torch.distributed.nn.all_to_all([x0, x1])
-        for i, t in enumerate(tensors):
-            self.assertEqual(t, torch.ones(5, 5, device=device) + 2 * i)
-        y = torch.sum(torch.stack(tensors), axis=0)
-        z = y.sin().sum()
-        z.backward()
-        x_s = (4 * torch.ones(5, 5, device=device)).cos()
-        self.assertEqual(x0.grad, x_s)
-        self.assertEqual(x1.grad, x_s)
-
-
-if __name__ == '__main__':
-    run_tests()
diff --git a/test/run_test.py b/test/run_test.py
index fab61397130..909b51bc47b 100755
--- a/test/run_test.py
+++ b/test/run_test.py
@@ -41,7 +41,9 @@ TESTS = [
     'test_cpp_extensions_aot_no_ninja',
     'test_cpp_extensions_aot_ninja',
     'test_cpp_extensions_jit',
-    'distributed/test_c10d',
+    'distributed/test_c10d_common',
+    'distributed/test_c10d_gloo',
+    'distributed/test_c10d_nccl',
     'distributed/test_jit_c10d',
     'distributed/test_c10d_spawn',
     'test_cuda',
@@ -284,7 +286,9 @@ TARGET_DET_LIST = [
     'test_utils',
     'test_multiprocessing',
     'test_tensorboard',
-    'distributed/test_c10d',
+    'distributed/test_c10d_common',
+    'distributed/test_c10d_gloo',
+    'distributed/test_c10d_nccl',
     'distributed/test_jit_c10d',
     'distributed/test_c10d_spawn',
     'test_quantization',
diff --git a/torch/distributed/CONTRIBUTING.md b/torch/distributed/CONTRIBUTING.md
index 367a8f4f006..e76244e63d9 100644
--- a/torch/distributed/CONTRIBUTING.md
+++ b/torch/distributed/CONTRIBUTING.md
@@ -74,7 +74,9 @@ All the unit tests can be found under the [test/distributed](../../test/distribu
 
 ```
 # Run the c10d unit test.
-python test/distributed/test_c10d.py
+python test/distributed/test_c10d_common.py
+python test/distributed/test_c10d_gloo.py
+python test/distributed/test_c10d_nccl.py
 
 # Run distributed tests, including tests for Distributed Data Parallel
 python test/run_test.py --verbose -i distributed/test_distributed_fork
diff --git a/torch/lib/c10d/ProcessGroupGloo.cpp b/torch/lib/c10d/ProcessGroupGloo.cpp
index 854a9e81c58..e8142c97561 100644
--- a/torch/lib/c10d/ProcessGroupGloo.cpp
+++ b/torch/lib/c10d/ProcessGroupGloo.cpp
@@ -1053,7 +1053,7 @@ class AsyncSparseAllreduceWork : public ProcessGroupGloo::AsyncWork {
     // TODO: This is a massive hack! There is some confusion about
     // Variable/Tensor inside the body of this function. Turning off
     // grad smooths over the confusion for now. This fixes
-    // test/test_c10d.py ProcessGroupGlooTest.test_sparse_allreduce_basics
+    // test/test_c10d_gloo.py ProcessGroupGlooTest.test_sparse_allreduce_basics
     //
     // The correct fix is to stop allocating tensors that are not variables,
     // but to conveniently do this c10d must depend on torch not ATen