# Owner(s): ["oncall: distributed"]

import copy
import math
import os
import random
import signal
import sys
import tempfile
import threading
import time
from contextlib import contextmanager
from datetime import timedelta
from itertools import product
from unittest import mock

import torch
import torch.distributed as c10d

if not c10d.is_available():
    print("c10d not available, skipping tests", file=sys.stderr)
    sys.exit(0)

import test_c10d_common
import torch.distributed as dist
import torch.distributed.algorithms.ddp_comm_hooks.default_hooks as default
import torch.distributed.algorithms.ddp_comm_hooks.powerSGD_hook as powerSGD
import torch.nn.functional as F
import torch.testing._internal.common_utils as common
from test_c10d_common import gpus_for_rank, DoubleGpuNet, ConvNet, ModuleForDdpCommHook
from torch import nn
from torch.nn.parallel import DistributedDataParallel
from torch.testing._internal.common_distributed import (
    MultiProcessTestCase,
    init_multigpu_helper,
    requires_nccl,
    requires_gloo,
    requires_nccl_version,
    skip_if_lt_x_gpu,
    get_timeout,
    skip_if_rocm,
    with_dist_debug_levels,
    with_nccl_blocking_wait,
)
from torch.testing._internal.common_utils import (
    TestCase,
    run_tests,
    retry_on_connect_failures,
    TEST_WITH_DEV_DBG_ASAN,
    TEST_WITH_ROCM,
    sandcastle_skip,
    sandcastle_skip_if,
)

if TEST_WITH_DEV_DBG_ASAN:
    print(
        "Skip ASAN as torch + multiprocessing spawn have known issues", file=sys.stderr
    )
    sys.exit(0)

# bfloat16 is only supported by CUDA 11+
BFLOAT16_AVAILABLE = (
    torch.cuda.is_available()
    and torch.version.cuda is not None
    and int(torch.version.cuda.split('.')[0]) >= 11)
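# e.g. torch.version.cuda == "11.3" parses to major version 11, so the check passes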


class RendezvousEnvTest(TestCase):
    @retry_on_connect_failures
    @requires_nccl()
    @sandcastle_skip_if(
        torch.cuda.device_count() == 0, "No GPUs available, skipping test"
    )
    def test_common_errors(self):
        vars = {
            "WORLD_SIZE": "1",
            "RANK": "0",
            "MASTER_ADDR": "127.0.0.1",
            "MASTER_PORT": str(common.find_free_port()),
        }
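        # A complete, valid env:// configuration; the cases below drop one or
        # more of these variables to exercise each rendezvous error path.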

        class Env(object):
            def __init__(self, vars):
                self.env_patcher = mock.patch.dict(os.environ, vars, clear=True)

            def __enter__(self):
                self.env_patcher.start()

            def __exit__(self, type, value, traceback):
                self.env_patcher.stop()

        def without(d, key):
            d = d.copy()
            d.pop(key)
            return d

        def withouts(d, keys):
            d = d.copy()
            for key in keys:
                d.pop(key)
            return d
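        # `without`/`withouts` return copies of the env dict with the given
        # key(s) removed, simulating a launcher that forgot to set them.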

        with Env(without(vars, "WORLD_SIZE")):
            self.assertEqual(None, os.environ.get("WORLD_SIZE"))
            with self.assertRaisesRegex(ValueError, "WORLD_SIZE expected"):
                gen = c10d.rendezvous("env://")
                next(gen)
            c10d.init_process_group(backend="nccl", world_size=1)
            self.assertEqual(c10d.get_rank(), 0)
            self.assertEqual(c10d.get_world_size(), 1)
            c10d.destroy_process_group()

        with Env(without(vars, "RANK")):
            self.assertEqual(None, os.environ.get("RANK"))
            with self.assertRaisesRegex(ValueError, "RANK expected"):
                gen = c10d.rendezvous("env://")
                next(gen)
            c10d.init_process_group(backend="nccl", rank=0)
            self.assertEqual(c10d.get_rank(), 0)
            self.assertEqual(c10d.get_world_size(), 1)
            c10d.destroy_process_group()

        with Env(withouts(vars, ["RANK", "WORLD_SIZE"])):
            self.assertEqual(None, os.environ.get("RANK"))
            self.assertEqual(None, os.environ.get("WORLD_SIZE"))
            c10d.init_process_group(backend="nccl", rank=0, world_size=1)
            self.assertEqual(c10d.get_rank(), 0)
            self.assertEqual(c10d.get_world_size(), 1)
            c10d.destroy_process_group()

        with Env(vars):
            c10d.init_process_group(backend="nccl")
            self.assertEqual(c10d.get_rank(), 0)
            self.assertEqual(c10d.get_world_size(), 1)
            c10d.destroy_process_group()

        with Env(without(vars, "MASTER_ADDR")):
            self.assertEqual(None, os.environ.get("MASTER_ADDR"))
            with self.assertRaisesRegex(ValueError, "MASTER_ADDR expected"):
                gen = c10d.rendezvous("env://")
                next(gen)

        with Env(without(vars, "MASTER_PORT")):
            self.assertEqual(None, os.environ.get("MASTER_PORT"))
            with self.assertRaisesRegex(ValueError, "MASTER_PORT expected"):
                gen = c10d.rendezvous("env://")
                next(gen)

        with Env(without(vars, "WORLD_SIZE")):
            self.assertEqual(None, os.environ.get("WORLD_SIZE"))
            gen = c10d.rendezvous("env://?world_size={}".format(1))
            _, _, size = next(gen)
            self.assertEqual(size, 1)

        with Env(without(vars, "RANK")):
            self.assertEqual(None, os.environ.get("RANK"))
            gen = c10d.rendezvous("env://?rank={}".format(0))
            _, rank, _ = next(gen)
            self.assertEqual(rank, 0)

        with Env(withouts(vars, ["RANK", "WORLD_SIZE"])):
            self.assertEqual(None, os.environ.get("RANK"))
            self.assertEqual(None, os.environ.get("WORLD_SIZE"))
            gen = c10d.rendezvous("env://?rank={}&world_size={}".format(0, 1))
            _, rank, size = next(gen)
            self.assertEqual(rank, 0)
            self.assertEqual(size, 1)


class TimeoutTest(test_c10d_common.AbstractTimeoutTest, TestCase):
    @requires_nccl()
    @retry_on_connect_failures
    @sandcastle_skip_if(
        torch.cuda.device_count() == 0, "No GPUs available, skipping test"
    )
    def test_default_store_timeout_nccl(self):
        self._test_default_store_timeout("nccl")


class ProcessGroupNCCLNoGPUTest(TestCase):
    MAIN_PROCESS_RANK = 0

    def setUp(self):
        self.rank = self.MAIN_PROCESS_RANK
        self.world_size = 1
        self.file = tempfile.NamedTemporaryFile(delete=False)

    def tearDown(self):
        pass

    @requires_nccl()
    @sandcastle_skip_if(
        torch.cuda.device_count() > 0, "GPUs are available, skipping test"
    )
    def test_init_no_gpus(self):
        store = c10d.FileStore(self.file.name, self.world_size)
        with self.assertRaisesRegex(
            RuntimeError, "ProcessGroupNCCL is only supported with GPUs, no GPUs found!"
        ):
            c10d.ProcessGroupNCCL(store, self.rank, self.world_size)


class ProcessGroupNCCLTest(MultiProcessTestCase):
    def _create_process_group_nccl(self, store, opts):
        # create an NCCL process group with the given options
        c10d.init_process_group(
            "nccl",
            world_size=self.world_size,
            rank=self.rank,
            store=store,
            pg_options=opts)
        pg = c10d.distributed_c10d._get_default_group()
        return pg

    def opts(self, high_priority_stream=False):
        opts = c10d.ProcessGroupNCCL.Options()
        opts.is_high_priority_stream = high_priority_stream
        return opts

    def setUp(self):
        super(ProcessGroupNCCLTest, self).setUp()
        # NCCL_BLOCKING_WAIT overrides NCCL_ASYNC_ERROR_HANDLING, so tests
        # that use NCCL_BLOCKING_WAIT will still exercise it as expected.
        os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "1"
        # self.num_gpus = torch.cuda.device_count()
        self._spawn_processes()

    def tearDown(self):
        super(ProcessGroupNCCLTest, self).tearDown()
        try:
            os.remove(self.file_name)
        except OSError:
            pass

    @property
    def world_size(self):
        return 2

    @property
    def rank_to_GPU(self):
        # return rank-to-GPU map
        return init_multigpu_helper(self.world_size, "nccl")

    @requires_nccl()
    @sandcastle_skip_if(torch.cuda.device_count() < 2, "NCCL test requires 2+ GPUs")
    def test_empty_tensors(self):
        store = c10d.FileStore(self.file_name, self.world_size)
        pg = self._create_process_group_nccl(store, self.opts())
        local_device_idx = self.rank_to_GPU[self.rank][0]

        xs = [torch.FloatTensor([]).cuda(local_device_idx)]
        pg.broadcast(xs).wait()
        self.assertEqual(0, xs[0].numel())

        pg.allreduce(xs).wait()
        self.assertEqual(0, xs[0].numel())

        pg.reduce(xs).wait()
        self.assertEqual(0, xs[0].numel())

        ys = [[torch.FloatTensor([]).cuda(local_device_idx) for _ in range(self.world_size)]]
        pg.allgather(ys, xs).wait()
        for y in ys[0]:
            self.assertEqual(0, y.numel())

        ys = [torch.FloatTensor([]).cuda(local_device_idx)]
        xs = [[torch.FloatTensor([]).cuda(local_device_idx) for _ in range(self.world_size)]]
        pg.reduce_scatter(ys, xs).wait()
        self.assertEqual(0, ys[0].numel())

    @requires_nccl()
    @sandcastle_skip_if(torch.cuda.device_count() < 2, "NCCL test requires 2+ GPUs")
    def test_broadcast_ops(self):
        store = c10d.FileStore(self.file_name, self.world_size)
        pg = self._create_process_group_nccl(store, self.opts())

        def broadcast(xs, rootRank, rootTensor):
            opts = c10d.BroadcastOptions()
            opts.rootRank = rootRank
            opts.rootTensor = rootTensor
            work = pg.broadcast(xs, opts)
            work.wait()
            return work.result()

        # Every rank is root once
        for i in range(self.world_size):
            # Run with 1 input tensor
            x = torch.tensor([self.rank]).cuda(self.rank_to_GPU[self.rank][0])
            output = broadcast([x], i, 0)
            # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095
            self.assertEqualIgnoreType(torch.tensor([i]), output[0])

            expected_tensor = torch.empty([i + 1, i + 1]).fill_(i + 1)
            xs = [torch.empty([i + 1, i + 1]).fill_(-1).cuda(device=device_idx) for device_idx in self.rank_to_GPU[self.rank]]

            # test with multiple input tensors (multiple GPUs in one rank)
            for j in range(len(xs)):
                if self.rank == i:
                    xs[j] = expected_tensor.cuda(device=self.rank_to_GPU[self.rank][j])

                broadcast(xs, i, j)

                for tensor in xs:
                    self.assertEqual(tensor, expected_tensor)

    @requires_nccl()
    @sandcastle_skip_if(torch.cuda.device_count() < 2, "NCCL test requires 2+ GPUs")
    def test_allreduce_ops(self):
        store = c10d.FileStore(self.file_name, self.world_size)
        device_count = torch.cuda.device_count()
        pg = self._create_process_group_nccl(store, self.opts())
        local_device_id = self.rank_to_GPU[self.rank][0]

        def allreduce(tensors, op):
            opts = c10d.AllreduceOptions()
            opts.reduceOp = op
            work = pg.allreduce(tensors, opts)
            work.wait()

        # Sum
        tensors = [torch.tensor([self.rank + 1]).cuda(local_device_id)]

        allreduce(tensors, c10d.ReduceOp.SUM)

        ndev = float(self.world_size)
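        # Each rank contributed (rank + 1), so the reduced sum is
        # 1 + 2 + ... + ndev = ndev * (ndev + 1) / 2.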
        # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095
        self.assertEqualIgnoreType(
            torch.tensor([ndev * (ndev + 1) / 2]),
            tensors[0],
        )

        # Avg (only available for NCCL 2.10+)
        if torch.cuda.nccl.version() >= (2, 10, 0):
            tensors = [torch.tensor([self.rank + 1.]).cuda(local_device_id)]

            allreduce(tensors, c10d.ReduceOp.AVG)

            # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095
            self.assertEqualIgnoreType(
                torch.tensor([ndev * (ndev + 1.) / (2. * ndev)]),
                tensors[0],
            )

        # Product
        tensors = [torch.tensor([self.rank + 1]).cuda(local_device_id)]

        allreduce(tensors, c10d.ReduceOp.PRODUCT)
        # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095
        self.assertEqualIgnoreType(
            torch.tensor([float(math.factorial(self.world_size))]), tensors[0]
        )

        # Min
        tensors = [torch.tensor([self.rank + 1]).cuda(local_device_id)]

        allreduce(tensors, c10d.ReduceOp.MIN)
        # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095
        self.assertEqualIgnoreType(torch.tensor([1.0]), tensors[0])

        # Max
        tensors = [torch.tensor([self.rank + 1]).cuda(local_device_id)]

        allreduce(tensors, c10d.ReduceOp.MAX)
        self.assertEqual(torch.tensor([self.world_size]), tensors[0])

        for op in (c10d.ReduceOp.BAND, c10d.ReduceOp.BOR, c10d.ReduceOp.BXOR):
            with self.assertRaisesRegex(
                RuntimeError, "Cannot use " + str(op) + " with NCCL"
            ):
                allreduce(tensors, op)

    @requires_nccl()
    @sandcastle_skip_if(torch.cuda.device_count() < 2, "NCCL test requires 2+ GPUs")
    def test_reduce_ops(self):
        store = c10d.FileStore(self.file_name, self.world_size)
        pg = self._create_process_group_nccl(store, self.opts())
        local_device_id = self.rank_to_GPU[self.rank][0]

        def reduce(xs, rootRank, rootTensor, op=None):
            opts = c10d.ReduceOptions()
            opts.rootRank = rootRank
            opts.rootTensor = rootTensor
            if op:
                opts.reduceOp = op
            work = pg.reduce(xs, opts)
            work.wait()

        # for every root rank
        for rt in range(self.world_size):
            tensors = [torch.tensor([self.rank + 1]).cuda(local_device_id)]

            reduce(tensors, rt, 0)

            # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095
            if self.rank == rt:
                self.assertEqualIgnoreType(
                    torch.tensor([float(self.world_size * (self.world_size + 1) / 2)]),
                    tensors[0],
                )
            else:
                self.assertEqualIgnoreType(
                    torch.tensor([self.rank + 1]),
                    tensors[0],
                )

            for op in (c10d.ReduceOp.BAND, c10d.ReduceOp.BOR, c10d.ReduceOp.BXOR):
                with self.assertRaisesRegex(
                    RuntimeError, "Cannot use " + str(op) + " with NCCL"
                ):
                    reduce(tensors, self.rank, rt, op)

    @requires_nccl()
    @sandcastle_skip_if(torch.cuda.device_count() < 2, "NCCL test requires 2+ GPUs")
    def test_allgather_ops(self):
        store = c10d.FileStore(self.file_name, self.world_size)
        pg = self._create_process_group_nccl(store, self.opts())
        local_device_ids = self.rank_to_GPU[self.rank]

        def allgather(output_ts, input_ts):
            work = pg.allgather(output_ts, input_ts)
            return work.wait()

        tensors = [torch.empty(2, 2).fill_(2).cuda(device=i) for i in local_device_ids]
        output_tensors = []
        expected_output = []

        output_per_gpu = ([torch.empty(2, 2).fill_(-1)] * len(local_device_ids) * self.world_size)
        expected_per_gpu = ([torch.empty(2, 2).fill_(2)] * len(local_device_ids) * self.world_size)

        for gpu in local_device_ids:
            output_tensors.append([t.cuda(device=gpu) for t in output_per_gpu])
            expected_output.append([t.cuda(device=gpu) for t in expected_per_gpu])

        result = allgather(output_tensors, tensors)

        # Verification
        self.assertEqual(output_tensors, expected_output)

    @requires_nccl()
    @sandcastle_skip_if(torch.cuda.device_count() < 2, "NCCL test requires 2+ GPUs")
    def test_allgather_base_ops(self):
        store = c10d.FileStore(self.file_name, self.world_size)
        pg = self._create_process_group_nccl(store, self.opts())
        local_device_id = self.rank_to_GPU[self.rank][0]

        def allgather_base(output_t, input_t):
            work = pg._allgather_base(output_t, input_t)
            work.wait()

        # allgather_base is agnostic to the number of GPUs:
        # each rank contributes one tensor regardless of GPU count
        tensor = torch.tensor([self.rank]).cuda(local_device_id)
        output_t = torch.empty((self.world_size), dtype=tensor.dtype).cuda(local_device_id)
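        # Rank i contributes tensor([i]), so the flattened result equals arange(world_size).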

        allgather_base(output_t, tensor)

        # Verification
        self.assertEqual(torch.arange(self.world_size), output_t)

    @requires_nccl()
    @sandcastle_skip_if(torch.cuda.device_count() < 2, "NCCL test requires 2+ GPUs")
    def test_allgather_base_basics(self):
        store = c10d.FileStore(self.file_name, self.world_size)
        pg = self._create_process_group_nccl(store, self.opts())
        local_device_id = self.rank_to_GPU[self.rank][0]

        def allgather_base(output_t, input_t):
            work = pg._allgather_base(output_t, input_t)
            work.wait()

        # anticipate an error
        with self.assertRaisesRegex(
            RuntimeError,
            "output tensor size must be equal to world_size times input tensor size",
        ):
            tensor = torch.tensor([self.rank]).cuda(local_device_id)
            output_t = torch.empty((self.world_size + 1), dtype=tensor.dtype).cuda(
                local_device_id
            )
            # fails the check because output_t is not correctly sized
            allgather_base(output_t, tensor)

        # anticipate an error
        with self.assertRaisesRegex(
            RuntimeError, "output tensor must have the same type as input tensor"
        ):
            tensor = torch.tensor([self.rank], dtype=torch.float).cuda(local_device_id)
            output_t = torch.empty((self.world_size + 1), dtype=torch.long).cuda(
                local_device_id
            )
            # fails the check because the dtype is different
            allgather_base(output_t, tensor)

    @requires_nccl()
    @sandcastle_skip_if(torch.cuda.device_count() < 2, "NCCL test requires 2+ GPUs")
    def test_gather_ops(self):
        store = c10d.FileStore(self.file_name, self.world_size)
        pg = self._create_process_group_nccl(store, self.opts())
        local_device_ids = self.rank_to_GPU[self.rank]
        num_gpus = len(local_device_ids)

        def gather(output_t, input_t, rootRank):
            opts = c10d.GatherOptions()
            opts.rootRank = rootRank
            if rootRank == self.rank:
                work = pg.gather(output_t, input_t, opts)
            else:
                work = pg.gather([], input_t, opts)
            work.wait()

        # init input
        tensors = []
        for device_id in local_device_ids:
            tensors.append(torch.tensor([self.rank]).cuda(device_id))

        # init output
        output_ts = []
        for idx in range(num_gpus):
            gpu_idx = local_device_ids[idx]
            output_ts.append([])
            for rank in range(self.world_size):
                output_ts[idx].append(torch.tensor([-1]).cuda(gpu_idx))

        expected = [[torch.tensor([rank]) for rank in range(self.world_size)]]
        for rank in range(self.world_size):
            gather(output_ts, tensors, rank)
            if rank == self.rank:
                self.assertEqual(expected, output_ts)

    @requires_nccl()
    @sandcastle_skip_if(torch.cuda.device_count() < 2, "NCCL test requires 2+ GPUs")
    def test_gather_stress(self):
        store = c10d.FileStore(self.file_name, self.world_size)
        pg = self._create_process_group_nccl(store, self.opts())
        local_device_ids = self.rank_to_GPU[self.rank]
        num_gpus = len(local_device_ids)

        def gather(output_t, input_t, rootRank):
            opts = c10d.GatherOptions()
            opts.rootRank = rootRank
            if rootRank == self.rank:
                work = pg.gather(output_t, input_t, opts)
            else:
                work = pg.gather([], input_t, opts)
            work.wait()

        stress_length = 1000

        # init input
        tensors = []
        for i in range(stress_length):
            tensors.append([])
            for device_id in local_device_ids:
                tensors[i].append(torch.tensor([self.rank]).cuda(device_id))

        # init output
        output_ts = []
        for i in range(stress_length):
            output_ts.append([[] for _ in range(num_gpus)])
            for idx, ls in enumerate(output_ts[i]):
                gpu_idx = local_device_ids[idx]
                for _ in range(self.world_size):
                    ls.append(torch.tensor([-1]).cuda(gpu_idx))

        expected = [[torch.tensor([rank]) for rank in range(self.world_size)]]
        for i in range(stress_length):
            for rank in range(self.world_size):
                gather(output_ts[i], tensors[i], rank)
                # Verification
                if rank == self.rank:
                    self.assertEqual(output_ts[i], expected)

    @requires_nccl()
    @sandcastle_skip_if(torch.cuda.device_count() < 2, "NCCL test requires 2+ GPUs")
    def test_gather_checks(self):
        store = c10d.FileStore(self.file_name, self.world_size)
        pg = self._create_process_group_nccl(store, self.opts())
        local_device_ids = self.rank_to_GPU[self.rank]
        num_gpus = len(local_device_ids)

        # init input
        tensors = []
        for device_id in local_device_ids:
            tensors.append(torch.tensor([self.rank]).cuda(device_id))

        # init output
        output_ts = []
        for idx in range(num_gpus):
            gpu_idx = local_device_ids[idx]
            output_ts.append([])
            for rank in range(self.world_size):
                output_ts[idx].append(torch.tensor([-1]).cuda(gpu_idx))

        with self.assertRaisesRegex(RuntimeError, "invalid root rank"):
            opts = c10d.GatherOptions()
            opts.rootRank = -1
            pg.gather(output_ts, tensors, opts)

        with self.assertRaisesRegex(TypeError, "incompatible function arguments"):
            pg.gather(output_ts, tensors, 0)

        with self.assertRaisesRegex(RuntimeError, "invalid root rank"):
            opts = c10d.GatherOptions()
            opts.rootRank = self.world_size
            pg.gather(output_ts, tensors, opts)

        with self.assertRaisesRegex(
            RuntimeError, "Tensor list must be nonempty"
        ):
            opts = c10d.GatherOptions()
            opts.rootRank = 0
            pg.gather(output_ts, [], opts)

        with self.assertRaisesRegex(
            RuntimeError, "Tensors must be on distinct GPU devices"
        ):
            # init input
            tensors2 = []
            for device_id in local_device_ids:
                tensors2.append(torch.tensor([self.rank]).cuda(device_id))
                tensors2.append(torch.tensor([self.rank]).cuda(device_id))

            opts = c10d.GatherOptions()
            opts.rootRank = 0
            pg.gather(output_ts, tensors2, opts)

    @requires_nccl()
    @sandcastle_skip_if(torch.cuda.device_count() < 2, "NCCL test requires 2+ GPUs")
    def test_scatter_ops(self):
        store = c10d.FileStore(self.file_name, self.world_size)
        pg = self._create_process_group_nccl(store, self.opts())
        local_device_ids = self.rank_to_GPU[self.rank]
        num_gpus = len(local_device_ids)

        def scatter(output_t, input_t, rootRank):
            opts = c10d.ScatterOptions()
            opts.rootRank = rootRank
            if rootRank == self.rank:
                work = pg.scatter(output_t, input_t, opts)
            else:
                work = pg.scatter(output_t, [], opts)
            work.wait()

        # init output
        tensors = []
        for device_id in local_device_ids:
            tensors.append(torch.tensor([-1]).cuda(device_id))

        # init input
        scatter_list = []
        for idx in range(num_gpus):
            gpu_idx = local_device_ids[idx]
            scatter_list.append([])
            for rank in range(self.world_size):
                scatter_list[idx].append(torch.tensor([rank]).cuda(gpu_idx))

        # test each rank to scatter
        expected = [torch.tensor([self.rank])]
        for rank in range(self.world_size):
            scatter(tensors, scatter_list, rank)
            self.assertEqual(expected, tensors)

    @requires_nccl()
    @sandcastle_skip_if(torch.cuda.device_count() < 2, "NCCL test requires 2+ GPUs")
    def test_scatter_stress(self):
        store = c10d.FileStore(self.file_name, self.world_size)
        pg = self._create_process_group_nccl(store, self.opts())
        local_device_ids = self.rank_to_GPU[self.rank]
        num_gpus = len(local_device_ids)

        def scatter(output_t, input_t, rootRank):
            opts = c10d.ScatterOptions()
            opts.rootRank = rootRank
            if rootRank == self.rank:
                work = pg.scatter(output_t, input_t, opts)
            else:
                work = pg.scatter(output_t, [], opts)
            work.wait()

        stress_length = 1000

        # init output
        tensors = []
        for i in range(stress_length):
            tensors.append([])
            for device_id in local_device_ids:
                tensors[i].append(torch.tensor([-1]).cuda(device_id))

        # init input
        scatter_list = []
        for i in range(stress_length):
            scatter_list.append([[] for _ in range(num_gpus)])
            for idx, ls in enumerate(scatter_list[i]):
                gpu_idx = local_device_ids[idx]
                for rank in range(self.world_size):
                    ls.append(torch.tensor([rank]).cuda(gpu_idx))

        # test each rank to scatter
        expected = [torch.tensor([self.rank])]
        for i in range(stress_length):
            for rank in range(self.world_size):
                scatter(tensors[i], scatter_list[i], rank)
                # Verification
                self.assertEqual(tensors[i], expected)

    @requires_nccl()
    @sandcastle_skip_if(torch.cuda.device_count() < 2, "NCCL test requires 2+ GPUs")
    def test_scatter_checks(self):
        store = c10d.FileStore(self.file_name, self.world_size)
        pg = self._create_process_group_nccl(store, self.opts())
        local_device_ids = self.rank_to_GPU[self.rank]
        num_gpus = len(local_device_ids)

        # init output
        tensors = []
        for device_id in local_device_ids:
            tensors.append(torch.tensor([-1]).cuda(device_id))

        # init input
        scatter_list = []
        for idx in range(num_gpus):
            gpu_idx = local_device_ids[idx]
            scatter_list.append([])
            for rank in range(self.world_size):
                scatter_list[idx].append(torch.tensor([rank]).cuda(gpu_idx))

        with self.assertRaisesRegex(RuntimeError, "invalid root rank"):
            opts = c10d.ScatterOptions()
            opts.rootRank = -1
            pg.scatter(tensors, scatter_list, opts)

        with self.assertRaisesRegex(TypeError, "incompatible function arguments"):
            pg.scatter(tensors, scatter_list, 0)

        with self.assertRaisesRegex(RuntimeError, "invalid root rank"):
            opts = c10d.ScatterOptions()
            opts.rootRank = self.world_size
            pg.scatter(tensors, scatter_list, opts)

        with self.assertRaisesRegex(
            RuntimeError, "Tensor list must be nonempty"
        ):
            opts = c10d.ScatterOptions()
            opts.rootRank = 0
            pg.scatter([], scatter_list, opts)

    @requires_nccl()
    @sandcastle_skip_if(torch.cuda.device_count() < 2, "NCCL test requires 2+ GPUs")
    def test_reduce_scatter_base_basics(self):
        store = c10d.FileStore(self.file_name, self.world_size)
        pg = self._create_process_group_nccl(store, self.opts())
        local_device_id = self.rank_to_GPU[self.rank][0]

        def reduce_scatter_base(output_t, input_t):
            work = pg._reduce_scatter_base(output_t, input_t)
            work.wait()

        # anticipate an error
        with self.assertRaisesRegex(
            RuntimeError,
            "input tensor must be the same size as output size times world size",
        ):
            input_t = torch.tensor([self.rank]).cuda(local_device_id)
            output_t = torch.empty((self.world_size + 1), dtype=input_t.dtype).cuda(
                local_device_id
            )
            # fails the check because output_t is not correctly sized
            reduce_scatter_base(output_t, input_t)

        # anticipate an error
        # (the regex below deliberately matches the misspelled "outut" in the
        # upstream error message verbatim)
        with self.assertRaisesRegex(
            RuntimeError, "input tensor must be the same type as the outut tensor."
        ):
            tensor = torch.tensor([self.rank], dtype=torch.float).cuda(local_device_id)
            output_t = torch.empty((self.world_size + 1), dtype=torch.long).cuda(
                local_device_id
            )
            # fails the check because the dtype is different
            reduce_scatter_base(output_t, tensor)

    @requires_nccl()
    @sandcastle_skip_if(torch.cuda.device_count() < 2, "NCCL test requires 2+ GPUs")
    def test_reduce_scatter_ops(self):
        store = c10d.FileStore(self.file_name, self.world_size)
        pg = self._create_process_group_nccl(store, self.opts())
        local_device_ids = self.rank_to_GPU[self.rank]
        num_gpus = len(local_device_ids)

        def reduce_scatter(outputs, input_lists, op):
            opts = c10d.ReduceScatterOptions()
            opts.reduceOp = op
            work = pg.reduce_scatter(outputs, input_lists, opts)
            work.wait()

        output = [torch.tensor([0]).cuda(i) for i in local_device_ids]

        # GPU/rank
        #   0         [1], [2], [3], [4]
        #   1         [2], [3], [4], [5]
        #   2         [3], [4], [5], [6]
        #   3         [4], [5], [6], [7]

        # Sum
        tensor_lists = []
        input_per_gpu = []

        for i in range(self.world_size):
            input_per_gpu.append(torch.tensor([self.rank + i + 1]))

        for gpu in local_device_ids:
            tensor_lists.append([t.cuda(device=gpu) for t in input_per_gpu])

        reduce_scatter(output, tensor_lists, c10d.ReduceOp.SUM)
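        # Rank r's output slot receives sum over ranks s of (s + r + 1),
        # i.e. world_size * (world_size + 1) / 2 + world_size * r.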

        for i in range(num_gpus):
            expected = torch.tensor(
                [
                    float((1 + self.world_size) * self.world_size / 2)
                    + self.world_size * self.rank
                ])

            # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095
            self.assertEqualIgnoreType(expected, output[i])

        # Min
        reduce_scatter(output, tensor_lists, c10d.ReduceOp.MIN)

        for i in range(num_gpus):
            expected = torch.tensor([self.rank + 1 + i])
            self.assertEqual(expected, output[i])

        # Max
        reduce_scatter(output, tensor_lists, c10d.ReduceOp.MAX)

        for i in range(num_gpus):
            expected = torch.tensor(
                [self.rank + self.world_size + i]
            )
            self.assertEqual(expected, output[i])

        # Product
        reduce_scatter(output, tensor_lists, c10d.ReduceOp.PRODUCT)

        # The math package doesn't have math.perm until Python 3.8, so
        # we implement a naive version here.
        def perm(n, k):
            prod_val = n
            for val in range(n - k + 1, n):
                prod_val *= val
            return prod_val
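        # e.g. perm(4, 2) == 4 * 3 == 12, matching math.perm(4, 2) on Python 3.8+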

        for i in range(num_gpus):
            prod_val = perm(self.rank + self.world_size, self.world_size)

            expected = torch.tensor([prod_val])
            # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095
            self.assertEqualIgnoreType(expected, output[i])

        # Test the overloaded convenience function, i.e. when the input is
        # a list of tensors and the output is a single tensor.
        # Sum
        output_tensor = torch.empty_like(input_per_gpu[0][0]).cuda(self.rank)
        input_list = [tensor[0].cuda(self.rank) for tensor in input_per_gpu]
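        # input_per_gpu holds 1-element tensors, so indexing with [0] yields the
        # scalar views expected by the single-tensor overload.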
        pg.reduce_scatter(output_tensor, input_list, c10d.ReduceOp.SUM).wait()
        expected = torch.tensor(
            float((1 + self.world_size) * self.world_size / 2) + self.world_size * self.rank
        )
        self.assertEqualIgnoreType(expected, output_tensor)

        # Min
        pg.reduce_scatter(output_tensor, input_list, c10d.ReduceOp.MIN).wait()
        expected = torch.tensor(self.rank + 1)
        self.assertEqualIgnoreType(expected, output_tensor)

        # Max
        pg.reduce_scatter(output_tensor, input_list, c10d.ReduceOp.MAX).wait()
        expected = torch.tensor(self.rank + self.world_size)
        self.assertEqualIgnoreType(expected, output_tensor)

        # Product
        pg.reduce_scatter(output_tensor, input_list, c10d.ReduceOp.PRODUCT).wait()
        prod_val = self.rank + 1
        for k in range(1, self.world_size):
            prod_val = prod_val * (self.rank + 1 + k)
        expected = torch.tensor(prod_val)
        self.assertEqualIgnoreType(expected, output_tensor)

    @requires_nccl()
    @sandcastle_skip_if(torch.cuda.device_count() < 2, "NCCL test requires 2+ GPUs")
    def test_reduce_scatter_base_ops(self):
        store = c10d.FileStore(self.file_name, self.world_size)
        pg = self._create_process_group_nccl(store, self.opts())
        local_device_id = self.rank_to_GPU[self.rank][0]

        def reduce_scatter_base(output_t, input_t):
            work = pg._reduce_scatter_base(output_t, input_t)
            work.wait()

        # reduce_scatter_base is agnostic to the number of GPUs:
        # each rank contributes one tensor regardless of GPU count
        output_t = torch.empty([1]).cuda(local_device_id)
        tensor = torch.arange(self.world_size, dtype=output_t.dtype).cuda(local_device_id)
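        # Every rank feeds arange(world_size), so after the default SUM
        # reduce-scatter, rank r's single output element is world_size * r.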

        reduce_scatter_base(output_t, tensor)

        # Verification
        self.assertEqual(output_t[0], self.rank * self.world_size)

    @requires_nccl()
    @sandcastle_skip_if(torch.cuda.device_count() < 2, "NCCL test requires 2+ GPUs")
    def test_barrier(self):
        store = c10d.FileStore(self.file_name, self.world_size)
        pg = self._create_process_group_nccl(store, self.opts())
        local_device_ids = self.rank_to_GPU[self.rank]

        def allreduce(tensors):
            opts = c10d.AllreduceOptions()
            work = pg.allreduce(tensors, opts)
            return work

        # Make the collectives operate on 1, 2, 3, ..., len(local_device_ids) GPUs
        tensors_list = [[] for _ in range(len(local_device_ids))]

        for i in range(1, len(local_device_ids) + 1):
            for j in range(i):
                tensors_list[i - 1].append(torch.tensor([j + 1]).cuda(local_device_ids[j]))
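        # tensors_list[i - 1] now holds tensors [1], [2], ..., [i] spread across
        # the first i local GPUs.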

        works = []
        for tensors in tensors_list:
            work = allreduce(tensors)
            works.append(work)

        # Barrier will ensure that all previous work is completed
        pg.barrier().wait()

        for i in range(1, len(local_device_ids) + 1):
            for j in range(i):
                # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095
                self.assertEqualIgnoreType(
                    torch.tensor([(j + 1) * self.world_size]), tensors_list[i - 1][j]
                )

    @requires_nccl()
    @sandcastle_skip_if(torch.cuda.device_count() < 2, "NCCL test requires 2+ GPUs")
    def test_send_recv(self):
        store = c10d.FileStore(self.file_name, self.world_size)
        self._create_process_group_nccl(store, self.opts())
        device = self.rank_to_GPU[self.rank][0]

        # Generate the same random tensor
        torch.manual_seed(0)
        send_tensor = torch.rand(10, 10, device=device)
        if self.rank == 0:
            dist.send(send_tensor, 1)
        if self.rank == 1:
            recv_tensor = torch.rand(10, 10, device=device)
            dist.recv(recv_tensor, 0)
            self.assertEqual(send_tensor, recv_tensor)

        # Test with non-contiguous tensors.
        send_tensor_view = send_tensor.t()
        if self.rank == 0:
            with self.assertRaisesRegex(RuntimeError, 'Tensors must be contiguous'):
                dist.send(send_tensor_view, 1)


class DistributedDataParallelTest(
    test_c10d_common.CommonDistributedDataParallelTest, MultiProcessTestCase
):
    def setUp(self):
        super(DistributedDataParallelTest, self).setUp()
        # NCCL_BLOCKING_WAIT overrides NCCL_ASYNC_ERROR_HANDLING, so tests
        # that use NCCL_BLOCKING_WAIT will still exercise it as expected.
        os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "1"
        self._spawn_processes()

    def _get_process_group(self):
        store = self._get_store()
        return c10d.ProcessGroupNCCL(store, self.rank, self.world_size)

    def _test_nccl_backend(
        self, devices, device_ids, multi_device=False, gradient_as_bucket_view=False
    ):
        store = c10d.FileStore(self.file_name, self.world_size)
        process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)
        self._test_ddp_with_process_group(
            process_group, devices, device_ids, multi_device, gradient_as_bucket_view
        )

    @requires_nccl()
    @skip_if_lt_x_gpu(2)
    def test_nccl_propagate_error_reason(self):
        # Need to use NCCL_BLOCKING_WAIT and not ASYNC_ERROR_HANDLING,
        # otherwise the process will be taken down and we can't check for errors.
        os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "0"
        os.environ["NCCL_BLOCKING_WAIT"] = "1"
        # TODO: a smaller timeout can fail since ProcessGroupNCCL does a health
        # check in its constructor. Look into reducing this test's runtime.
        store = c10d.FileStore(self.file_name, self.world_size)
        # provide a sufficient timeout to initialize the NCCL communicator.
        pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size, timeout=timedelta(seconds=15))
        pg_gloo = c10d.ProcessGroupGloo(store, self.rank, self.world_size)
        pg.barrier().wait(timedelta(seconds=5))
        # Simulate a hang in rank 0.
        if self.rank == 0:
            pg_gloo.barrier().wait()
        inp = torch.ones(1).cuda(self.rank)

        if self.rank != 0:
            # Time out due to rank 0 not calling into allreduce.
            with self.assertRaises(RuntimeError):
                pg.allreduce([inp]).wait(timedelta(seconds=5))

            # Now when a nonzero rank attempts to use the communicator,
            # the original failure reason should be logged.
            try:
                pg.allreduce([torch.ones(2).cuda(self.rank)]).wait()
            except RuntimeError as e:
                self.assertTrue("timed out in call to wait()" in str(e))
                self.assertTrue("TensorShape=[1]" in str(e))
            else:
                self.fail("Expected error to be raised!")

            # Unblock rank 0
            pg_gloo.barrier().wait()

        # TODO: We can also test that if rank 0 attempts to use the communicator,
        # then we should error out with the info that it was aborted due to
        # timeout on another rank. Although this would only be the case after
        # the watchdog has run on the rank, and there is no reliable way
        # to confirm it has run.

    @requires_nccl()
    @skip_if_lt_x_gpu(2)
    def test_nccl_backend_multi_device_ids_not_allowed(self):
        int_devices = list(range(torch.cuda.device_count()))
        devices = [torch.device("cuda:" + str(i)) for i in int_devices]
        with self.assertRaisesRegex(
            ValueError, "device_ids can only be None or contain a single element."
        ):
            self._test_nccl_backend(devices, int_devices)

    @requires_nccl()
    @skip_if_lt_x_gpu(2)
    def test_nccl_backend_single_device_module_device_ids_None(self):
        self._test_nccl_backend(None, None)

    @requires_nccl()
    @skip_if_lt_x_gpu(2)
    def test_nccl_backend_single_device_module_empty_device_ids(self):
        # This tests the backward compatibility of accepting an empty list as `device_ids`,
        # although we no longer document this in favor of the default value of `None`,
        # which is consistent with multi-device modules and CPU modules.
        self._test_nccl_backend(None, [])

    @requires_nccl()
    @skip_if_lt_x_gpu(4)
    def test_nccl_backend_multi_device_module_device_ids_None(self):
        int_devices = gpus_for_rank(self.world_size)[self.rank][:2]
        devices = [torch.device("cuda:" + str(i)) for i in int_devices]
        self._test_nccl_backend(devices, None, multi_device=True)

    @requires_nccl()
    @skip_if_lt_x_gpu(2)
    def test_nccl_backend_1gpu_module_device_ids_integer_list(self):
        int_devices = gpus_for_rank(self.world_size)[self.rank][:1]
        devices = [torch.device("cuda:" + str(i)) for i in int_devices]
        self._test_nccl_backend(devices, int_devices)

    @requires_nccl()
    @skip_if_lt_x_gpu(2)
    def test_nccl_backend_1gpu_module_device_ids_torch_device_list(self):
        int_devices = gpus_for_rank(self.world_size)[self.rank][:1]
        devices = [torch.device("cuda:" + str(i)) for i in int_devices]
        self._test_nccl_backend(devices, devices)

    @requires_nccl()
    @skip_if_lt_x_gpu(4)
    def test_nccl_backend_2gpu_module(self):
        int_devices = gpus_for_rank(self.world_size)[self.rank][:2]
        devices = [torch.device("cuda:" + str(i)) for i in int_devices]
        self._test_nccl_backend(devices, None, multi_device=True)

    @requires_nccl()
    @skip_if_lt_x_gpu(8)
    def test_nccl_backend_4gpu_module(self):
        int_devices = gpus_for_rank(self.world_size)[self.rank][:4]
        devices = [torch.device("cuda:" + str(i)) for i in int_devices]
        self._test_nccl_backend(devices, None, multi_device=True)

    @requires_nccl()
    @skip_if_lt_x_gpu(4)
    def test_ddp_multi_device_module_config(self):
        gpus = gpus_for_rank(self.world_size)[self.rank]

        self.assertTrue(len(gpus) >= 2, "expecting at least 2 gpus per process")

        store = c10d.FileStore(self.file_name, self.world_size)
        process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)

        gpus = gpus[:2]
        model = DoubleGpuNet(gpus)

        with self.assertRaisesRegex(
            ValueError,
            "DistributedDataParallel device_ids and output_device arguments only work with "
            "single-device/multiple-device GPU modules or CPU modules",
        ):
            ddp_model = DistributedDataParallel(
                model, output_device=gpus[1], process_group=process_group
            )

        with self.assertRaisesRegex(
            ValueError, "device_ids can only be None or contain a single element."
        ):
            ddp_model = DistributedDataParallel(
                model, device_ids=gpus, process_group=process_group
            )

        with self.assertRaisesRegex(
            ValueError, "input module must be on the same type of devices"
        ):
            model.fc1 = model.fc1.cpu()
            ddp_model = DistributedDataParallel(model, process_group=process_group)

        model = model.cpu()
        with self.assertRaisesRegex(
            ValueError, "device_ids can only be None or contain a single element."
        ):
            ddp_model = DistributedDataParallel(
                model, device_ids=gpus, process_group=process_group
            )

    def _test_fp16(self, gradient_as_bucket_view=False):
        store = c10d.FileStore(self.file_name, self.world_size)
        process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)

        gpus = gpus_for_rank(self.world_size)[self.rank]
        model = nn.Linear(1, 1, bias=False).cuda(gpus[0]).half()
        nn.init.constant_(model.weight, 1)
        ddp_model = DistributedDataParallel(
            model,
            device_ids=[gpus[0]],
            process_group=process_group,
            bucket_cap_mb=0.001,
            gradient_as_bucket_view=gradient_as_bucket_view,
        )

        # Input 2**15, so that the gradients will overflow with a
        # world_size of 2, unless we normalize the gradient by the
        # world_size before the reduction
        input = torch.tensor([[2 ** 15]]).cuda(gpus[0]).half()

        # Step model
        ddp_model.train()
        output = ddp_model(input)
        loss = output.sum()
        loss.backward()

        self.assertFalse(any(torch.isinf(p.grad).any() for p in ddp_model.parameters()))
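        # DDP normalizes gradients by world_size before the all-reduce, which is
        # what keeps these fp16 gradients finite despite the large input.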

    @requires_nccl()
    @skip_if_lt_x_gpu(2)
    def test_fp16(self):
        self._test_fp16()

    @requires_nccl()
    @skip_if_lt_x_gpu(2)
    def test_fp16_grad_is_view(self):
        self._test_fp16(gradient_as_bucket_view=True)

    def _test_arbitrary_forward_return_value(self, gradient_as_bucket_view=False):
        """
        Note: this test can be sped up by only running it on a CPU module
        once DistributedDataParallel supports them.
        """
        store = c10d.FileStore(self.file_name, self.world_size)
        process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)

        class ForwardReturnValueModule(nn.Module):
            def __init__(self):
                super(ForwardReturnValueModule, self).__init__()
                self.fc1 = nn.Linear(2, 10, bias=False)
                self.fc2 = nn.Linear(10, 4, bias=False)
                self.fc3 = nn.Linear(4, 4, bias=False)
                self.relu = nn.ReLU()

            def forward(self, x, fn):
                x = self.relu(self.fc1(x))
                x = self.relu(self.fc2(x))
                # The first softmax does NOT include fc3 in its autograd graph
                # whereas the second softmax DOES. If we pass only the first
                # tensor we see in the output to the reducer, it marks the
                # gradient for fc3 as ready (because it doesn't show up). If
                # downstream uses of this return value choose to differentiate
                # against the second output tensor, it would still receive a
                # gradient and a callback for this tensor, resulting in a crash.
                return fn(
                    F.softmax(x, dim=1),
                    F.softmax(self.fc3(x), dim=1),
                )

        device_id = gpus_for_rank(self.world_size)[self.rank][0]
        model = DistributedDataParallel(
            ForwardReturnValueModule().float().to(device_id),
            device_ids=[device_id],
            process_group=process_group,
            gradient_as_bucket_view=gradient_as_bucket_view,
        )

        batch_size = 4
        criterion = nn.CrossEntropyLoss()
        input = torch.rand([batch_size, 2], dtype=torch.float)
        target = torch.LongTensor([random.randrange(4) for _ in range(batch_size)]).to(
            device_id
        )

        # Always run "backward" to ensure the reducer is called by autograd.
        # If we don't correctly capture the output tensors from the return value,
        # the reducer won't see a hook for the unused parameter, and throw an error.
        # The correct capture is what we're testing in this function.
        def test(box, unbox):
            output = model(input, fn=box)
            loss = criterion(unbox(output), target)
            loss.backward()

        # Test with identity return value
        test(
            box=lambda x, y: (x, y),
            unbox=lambda obj: obj[1],
        )

        # Test with list return value
        test(
            box=lambda x, y: ["foo", x, "bar", y],
            unbox=lambda obj: obj[3],
        )

        # Test with tuple return value
        test(
            box=lambda x, y: ("foo", x, "bar", y),
            unbox=lambda obj: obj[3],
        )

        # Test with dict return value
        test(
            box=lambda x, y: {"foo": "bar", "a": x, "b": y},
            unbox=lambda obj: obj["b"],
        )

        # Test with list with dict return value
        test(
            box=lambda x, y: ["foo", "bar", {"a": x, "b": y}],
            unbox=lambda obj: obj[2]["b"],
        )

        # Test with dict with list return value
        test(
            box=lambda x, y: {"foo": "bar", "list": [0, x, 1, y]},
            unbox=lambda obj: obj["list"][3],
        )

    @requires_nccl()
    @skip_if_lt_x_gpu(2)
    def test_arbitrary_forward_return_value(self):
        self._test_arbitrary_forward_return_value()

    @requires_nccl()
    @skip_if_lt_x_gpu(2)
    def test_arbitrary_forward_return_value_grad_is_view(self):
        self._test_arbitrary_forward_return_value(gradient_as_bucket_view=True)

    @requires_nccl()
    @skip_if_lt_x_gpu(2)
    def test_ddp_with_lazy_parameters(self):
        store = c10d.FileStore(self.file_name, self.world_size)
        process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)
        with self.assertRaisesRegex(
            RuntimeError, "Modules with uninitialized parameters"
        ):
            DistributedDataParallel(
                torch.nn.LazyLinear(10), process_group=process_group
            )

    def _test_find_unused_parameters_kwarg(self, gradient_as_bucket_view=False):
        """
        Note: this test can be sped up by only running it on a CPU module
        once DistributedDataParallel supports them.
        """
        torch.cuda.set_device(self.rank)
        dist.init_process_group(
            backend="nccl",
            world_size=self.world_size,
            rank=self.rank,
            init_method=f"file://{self.file_name}",
        )
        process_group = c10d.distributed_c10d._get_default_group()

        class FindUnusedParametersModule(nn.Module):
            def __init__(self):
                super(FindUnusedParametersModule, self).__init__()
                self.fc1 = nn.Linear(2, 10, bias=False)
                self.fc2 = nn.Linear(10, 4, bias=False)
                self.fc3 = nn.Linear(4, 4, bias=False)
                self.relu = nn.ReLU()

            def forward(self, x):
                x = self.relu(self.fc1(x))
                x = self.relu(self.fc2(x))
                # Return the fc3 module so that the caller can invoke it
                # outside of the forward function. While this is bad practice,
                # we can use it to trigger a reducer error.
                return (F.softmax(x, dim=1), self.fc3)

        device_id = gpus_for_rank(self.world_size)[self.rank][0]
        batch_size = 4
        criterion = nn.CrossEntropyLoss()
        input = torch.rand([batch_size, 2], dtype=torch.float)
        target = torch.LongTensor([random.randrange(4) for _ in range(batch_size)]).to(
            device_id
        )

        ddp_model = None
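        # Populated via `nonlocal` inside test_find_unused_parameters below so
        # this enclosing scope can inspect the most recently built DDP model.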

        def test_find_unused_parameters(
            find_unused_parameters, test_default=False, gradient_as_bucket_view=False
        ):
            if test_default:
                model = DistributedDataParallel(
                    FindUnusedParametersModule().float().to(device_id),
                    device_ids=[device_id],
                    process_group=process_group,
                    gradient_as_bucket_view=gradient_as_bucket_view,
                )
            else:
                model = DistributedDataParallel(
                    FindUnusedParametersModule().float().to(device_id),
                    device_ids=[device_id],
                    process_group=process_group,
                    find_unused_parameters=find_unused_parameters,
                    gradient_as_bucket_view=gradient_as_bucket_view,
                )
            nonlocal ddp_model
            ddp_model = model

            output, fc3 = model(input)
            output = fc3(output)
            loss = criterion(output, target)
            loss.backward()

        # First test that finding unused params under these conditions triggers
        # an error when `backward` is called (because fc3 is an unused
        # parameter and will therefore be marked ready twice).
        try:
            test_find_unused_parameters(
                True, gradient_as_bucket_view=gradient_as_bucket_view
            )
        except Exception as ex:
            self.assertTrue(
                str(ex).startswith(
                    "Expected to mark a variable ready only once.",
                )
            )
            unused_index = 2
            unused_index_str = f"Parameter at index {unused_index}"
            model = ddp_model.module
            for module_name, module in model.named_modules():
                if module == model.fc3:
                    for parameter_name, _ in module.named_parameters(recurse=False):
                        unused_fqn = f"{module_name}.{parameter_name}"
                        # Only one such parameter in model.fc3, since bias=False
                        break

            if dist.get_debug_level() != dist.DebugLevel.OFF:
                unused_index_str += f" with name {unused_fqn}"

            self.assertTrue(unused_index_str in str(ex))
        else:
            self.fail("Expected exception")

        dist.barrier(process_group)

        # Then test that the default behavior can be overridden by setting
        # `find_unused_parameters=False`.
        try:
            test_find_unused_parameters(
                False, gradient_as_bucket_view=gradient_as_bucket_view
            )
        except Exception as ex:
            self.fail("Unexpected exception: %s" % ex)

        # Test that find_unused_parameters defaults to False
        try:
            test_find_unused_parameters(
                True, test_default=True, gradient_as_bucket_view=gradient_as_bucket_view
            )
        except Exception as ex:
            self.fail("Unexpected exception: %s" % ex)

    # TODO: Combine the following tests once https://github.com/pytorch/pytorch/issues/55967
    # is resolved.
    @requires_nccl()
    @skip_if_lt_x_gpu(2)
    @with_dist_debug_levels(levels=["DETAIL"])
    def test_find_unused_parameters_kwarg_debug_detail(self):
        self._test_find_unused_parameters_kwarg()

    @requires_nccl()
    @skip_if_lt_x_gpu(2)
    @with_dist_debug_levels(levels=["INFO"])
    def test_find_unused_parameters_kwarg_debug_info(self):
        self._test_find_unused_parameters_kwarg()

    @requires_nccl()
    @skip_if_lt_x_gpu(2)
    @with_dist_debug_levels(levels=["OFF"])
    def test_find_unused_parameters_kwarg_debug_off(self):
        self._test_find_unused_parameters_kwarg()

    @requires_nccl()
    @skip_if_lt_x_gpu(2)
    @with_dist_debug_levels(levels=["DETAIL"])
    def test_find_unused_parameters_kwarg_grad_is_view_debug_detail(self):
        self._test_find_unused_parameters_kwarg(gradient_as_bucket_view=True)

    @requires_nccl()
    @skip_if_lt_x_gpu(2)
    @with_dist_debug_levels(levels=["INFO"])
    def test_find_unused_parameters_kwarg_grad_is_view_debug_info(self):
        self._test_find_unused_parameters_kwarg(gradient_as_bucket_view=True)

    @requires_nccl()
    @skip_if_lt_x_gpu(2)
    @with_dist_debug_levels(levels=["OFF"])
    def test_find_unused_parameters_kwarg_grad_is_view_debug_off(self):
        self._test_find_unused_parameters_kwarg(gradient_as_bucket_view=True)

    def _test_multiple_outputs_multiple_backward(self, gradient_as_bucket_view=False):
        """
        Note: this test can be sped up by only running it on a CPU module
        once DistributedDataParallel supports them.
        """
        store = c10d.FileStore(self.file_name, self.world_size)
        process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)

        class MultipleOutputModule(nn.Module):
            def __init__(self):
                super(MultipleOutputModule, self).__init__()

                def define_module():
                    return nn.Sequential(
                        nn.Linear(2, 10, bias=False),
                        nn.ReLU(),
                        nn.Linear(10, 4, bias=False),
                        nn.ReLU(),
                    )

                self.module0 = define_module()
                self.module1 = define_module()

            def forward(self, x):
                return (
                    F.softmax(self.module0(x), dim=1),
                    F.softmax(self.module1(x), dim=1),
                )

        device_id = gpus_for_rank(self.world_size)[self.rank][0]
        model = DistributedDataParallel(
            MultipleOutputModule().float().to(device_id),
            device_ids=[device_id],
            process_group=process_group,
            gradient_as_bucket_view=gradient_as_bucket_view,
        )

        batch_size = 4
        criterion = nn.CrossEntropyLoss()
        input = torch.rand([batch_size, 2], dtype=torch.float)
        target = torch.LongTensor([random.randrange(4) for _ in range(batch_size)]).to(
            device_id
        )

        # Compute loss and gradients for both outputs
        output1, output2 = model(input)
        loss1 = criterion(output1, target)
        loss1.backward()
        loss2 = criterion(output2, target)
        loss2.backward()

    @requires_nccl()
    @skip_if_lt_x_gpu(2)
    def test_multiple_outputs_multiple_backward(self):
        self._test_multiple_outputs_multiple_backward()

    @requires_nccl()
    @skip_if_lt_x_gpu(2)
    def test_multiple_outputs_multiple_backward_grad_is_view(self):
        self._test_multiple_outputs_multiple_backward(gradient_as_bucket_view=True)

    @requires_nccl()
    @skip_if_lt_x_gpu(2)
    def test_no_grad(self):
        """
        Note: this test can be sped up by only running it on a CPU module
        once DistributedDataParallel supports them.
        """
        store = c10d.FileStore(self.file_name, self.world_size)
        process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)

        class NoGradModule(nn.Module):
            def __init__(self):
                super(NoGradModule, self).__init__()
                self.fc1 = nn.Linear(2, 10, bias=False)
                self.fc2 = nn.Linear(10, 4, bias=False)
                self.relu = nn.ReLU()

            def forward(self, x):
                x = self.relu(self.fc1(x))
                x = self.relu(self.fc2(x))
                return F.softmax(x, dim=1)

        device_id = gpus_for_rank(self.world_size)[self.rank][0]
        model = DistributedDataParallel(
            NoGradModule().float().to(device_id),
            device_ids=[device_id],
            process_group=process_group,
        )

        batch_size = 4
        input = torch.rand([batch_size, 2], dtype=torch.float)

        def check_no_grads():
            for p in model.parameters():
                self.assertTrue(p.requires_grad)
                self.assertIsNone(p.grad)

        # After initialization, no parameter has its gradient set.
        check_no_grads()

        # Run the `forward` function with torch.no_grad()
        with torch.no_grad():
            output = model(input)
            self.assertTrue(isinstance(output, torch.Tensor))

        # No parameter should have its gradient set.
        check_no_grads()

    def _test_accumulate_gradients_module(self, gradient_as_bucket_view=False):
        # This is NOT the recommended way to implement accumulating grads, but
        # we would like to make sure DDP does not mess with the underlying
        # module.
        int_devices = gpus_for_rank(self.world_size)[self.rank][:1]
        devices = [torch.device("cuda:" + str(i)) for i in int_devices]
        store = c10d.FileStore(self.file_name, self.world_size)
        process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)
        global_batch_size = self.world_size

        model, ddp_model, input, target = self._prepare_single_device_module(
            process_group, devices, devices, global_batch_size, gradient_as_bucket_view
        )

        def step_model(model, input, target):
            model.train()
            output = model(input)
            loss = F.mse_loss(output, target.to(output.device))
            loss.backward()

        # ensure accumulating gradients works with no_grad
        with torch.no_grad():
            ddp_model.train()
            ddp_model.module(input)

        # Check two model parameters over 4 iterations.
        # Use 4 iterations because we alternate between reducing and
        # not reducing and want to make sure we switch both ways.
        for iteration in range(4):
            step_model(model, input, target)

            if iteration % 2 == 0:
                # Skip gradient sync without calling prepare_for_backward
                step_model(
                    ddp_model.module,
                    input[self.rank : (self.rank + 1)],
                    target[self.rank : (self.rank + 1)],
                )
                for i, j in zip(model.parameters(), ddp_model.parameters()):
                    self.assertNotEqual(i.grad, j.grad)
            else:
                step_model(
                    ddp_model,
                    input[self.rank : (self.rank + 1)],
                    target[self.rank : (self.rank + 1)],
                )
                for i, j in zip(model.parameters(), ddp_model.parameters()):
                    # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095
                    self.assertEqualIgnoreType(i.grad, j.grad, rtol=1.3e-06, atol=5e-5)

            # Shuffle the input so that the DDP input is different
            torch.manual_seed(1337 + iteration)
            input = input[torch.randperm(global_batch_size)]

    @requires_nccl()
    @skip_if_lt_x_gpu(2)
    def test_accumulate_gradients_module(self):
        self._test_accumulate_gradients_module()

    @requires_nccl()
    @skip_if_lt_x_gpu(2)
    def test_accumulate_gradients_module_with_grad_is_view(self):
        self._test_accumulate_gradients_module(gradient_as_bucket_view=True)

    @requires_nccl()
    @skip_if_lt_x_gpu(2)
    def test_failure_recovery(self):
        store = c10d.FileStore(self.file_name, self.world_size)
        process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)

        # need to create a separate file for the recovered FileStore, because
        # the original one will be deleted when the first FileStore is destructed.
        recovery_filename = self.file_name + "_recovery"

        if self.rank == 0:
            # the file will be deleted by the recovered FileStore
            open(recovery_filename, "w").close()

        # not necessary to run a barrier here, as DDP will synchronize

        class TestModel(nn.Module):
            def __init__(self):
                super(TestModel, self).__init__()
                self.fc1 = nn.Linear(2, 10, bias=False)
                self.fc2 = nn.Linear(10, 4, bias=False)
                self.relu = nn.ReLU()

            def forward(self, x):
                x = self.relu(self.fc1(x))
                x = self.relu(self.fc2(x))
                return F.softmax(x, dim=1)

        device_id = gpus_for_rank(self.world_size)[self.rank][0]
        model = TestModel().float().to(device_id)
        ddp = DistributedDataParallel(
            model,
            device_ids=[device_id],
            process_group=process_group,
        )

        batch_size = 4
        criterion = nn.CrossEntropyLoss()
        input = torch.rand([batch_size, 2], dtype=torch.float)
        target = torch.LongTensor([random.randrange(4) for _ in range(batch_size)]).to(
            device_id
        )

        for _ in range(6):
            output = ddp(input)
            loss = criterion(output, target)
            loss.backward()

        del ddp
        del process_group
        del store  # this will delete self.file_name

        store = c10d.FileStore(recovery_filename, self.world_size)
        process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)
        ddp = DistributedDataParallel(
            model,
            device_ids=[device_id],
            process_group=process_group,
        )

        input = torch.rand([batch_size, 2], dtype=torch.float)
        target = torch.LongTensor([random.randrange(4) for _ in range(batch_size)]).to(
            device_id
        )
        for _ in range(6):
            output = ddp(input)
            loss = criterion(output, target)
            loss.backward()

    @requires_nccl()
    @skip_if_lt_x_gpu(2)
    def test_pass_default_pg(self):
        dist.init_process_group(
            "nccl",
            init_method=f"file://{self.file_name}",
            world_size=self.world_size,
            rank=self.rank,
        )

        default_pg = c10d.distributed_c10d._get_default_group()
        dist.destroy_process_group(default_pg)
        self.assertFalse(dist.is_initialized())

    def _test_grad_layout(self, replica_devices, layer_devs, local_batch_size):
        store = c10d.FileStore(self.file_name, self.world_size)
        process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)

        global_batch_size = local_batch_size * self.world_size

        # Carry out some trials with small buckets and some with big buckets.
        bucketsizes = (0.000001, 25)
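        # The tiny cap (roughly one byte) should force a bucket per gradient,
        # while 25 MB lets the reducer coalesce many gradients per bucket.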
        # Tuples of lists. Each list describes per-layer characteristics for one trial.
        layer_formats = (
            [torch.contiguous_format] * 4,
            [torch.channels_last] * 2 + [torch.contiguous_format] * 2,
            [torch.channels_last] * 4,
        )
        layer_dtypes = (
            [torch.float] * 4,
            [torch.float] * 2 + [torch.half] * 2,
            [torch.half] * 4,
        )

        input_dev = layer_devs[0] if isinstance(layer_devs, list) else layer_devs
        target_dev = layer_devs[-1] if isinstance(layer_devs, list) else layer_devs
        input = torch.randn(
            (global_batch_size, 8, 8, 8), device=input_dev, dtype=torch.float
        )
        target = torch.randn(
            (global_batch_size, 8, 4, 4), device=target_dev, dtype=torch.float
        )
        local_batch_start = self.rank * local_batch_size
        local_batch_end = (self.rank + 1) * local_batch_size

        # Reducer.cpp sneakily creates one "initial bucket" that ignores the "bucket_cap_mb"
        # argument. The following makes sure the initial bucket also complies.
        @contextmanager
        def first_bucket_size(ddp_bucket_mb):
            old_DEFAULT_FIRST_BUCKET_BYTES = dist._DEFAULT_FIRST_BUCKET_BYTES
            dist._DEFAULT_FIRST_BUCKET_BYTES = int(ddp_bucket_mb * 1.0e6)
            try:
                yield
            finally:
                dist._DEFAULT_FIRST_BUCKET_BYTES = old_DEFAULT_FIRST_BUCKET_BYTES

        with torch.backends.cudnn.flags(
            enabled=True, deterministic=True, benchmark=False
        ):
            for formats, dtypes, bucketsize in product(
                layer_formats, layer_dtypes, bucketsizes
            ):
                with first_bucket_size(bucketsize):
                    model_msg = (
                        "rank = {} formats = {} dtypes = {} bucketsize = {} ".format(
                            self.rank, formats, dtypes, bucketsize
                        )
                    )
                    try:
                        m = ConvNet(layer_devs, formats, dtypes)
                        m_ddp = DistributedDataParallel(
                            copy.deepcopy(m),
                            device_ids=replica_devices,
                            process_group=process_group,
                            bucket_cap_mb=bucketsize,
                        )
                        opt = torch.optim.SGD(m.parameters(), lr=0.1)
                        opt_ddp = torch.optim.SGD(m_ddp.parameters(), lr=0.1)
                        has_half = any(p.dtype is torch.half for p in m.parameters())
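                        # Half-precision grads accumulate more rounding error,
                        # so use a looser tolerance when any layer is fp16.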
                        tol = 1.0e-3 if has_half else 1.0e-5
                    except BaseException:
                        # Prints case-specific debugging info to narrow down the failing case.
                        print(
                            "Caught exception during model creation for " + model_msg,
                            flush=True,
                        )
                        raise
                    # 3 iters: First iter creates grads, second iter retests after rebucketing,
                    # third iter tries zeroed grads.
                    for it in range(3):
                        iter_msg = "iter = {} ".format(it) + model_msg
                        named_msg = iter_msg
                        try:
                            F.mse_loss(m(input).float(), target).backward()
                            F.mse_loss(
                                m_ddp(input[local_batch_start:local_batch_end]).float(),
                                target[local_batch_start:local_batch_end],
                            ).backward()
                            for i, ((layer_name, m_child), m_ddp_child) in enumerate(
                                zip(m.named_children(), m_ddp.module.children())
                            ):
                                named_msg = layer_name + ".weight" + " " + iter_msg
                                self.assertTrue(
                                    m_child.weight.grad.is_contiguous(
                                        memory_format=formats[i]
                                    ),
                                    named_msg,
                                )
                                self.assertTrue(
                                    m_ddp_child.weight.grad.is_contiguous(
                                        memory_format=formats[i]
                                    ),
                                    named_msg,
                                )
                                for j, ((param_name, p), p_ddp) in enumerate(
                                    zip(
                                        m_child.named_parameters(),
                                        m_ddp_child.parameters(),
                                    )
                                ):
                                    named_msg = (
                                        layer_name + "." + param_name + " " + iter_msg
                                    )
                                    self.assertEqual(
                                        p.grad, p_ddp.grad, rtol=tol, atol=tol
                                    )
                            opt.step()
                            opt_ddp.step()
                            if it == 0:
                                for p, p_ddp in zip(m.parameters(), m_ddp.parameters()):
                                    p.grad = None
                                    p_ddp.grad = None
                            else:
                                m.zero_grad()
                                m_ddp.zero_grad()
                        except BaseException:
                            # Makes sure we still get info if an error occurred somewhere other than the asserts.
                            print(
                                "Caught exception during iterations at " + named_msg,
                                flush=True,
                            )
                            raise

    @requires_nccl()
    @skip_if_lt_x_gpu(2)
    @skip_if_rocm
    def test_grad_layout_1devicemodule_1replicaperprocess(self):
        dev0 = torch.device("cuda:" + str(gpus_for_rank(self.world_size)[self.rank][0]))
        # Tells DDP to use just one device.
        replica_devices = [dev0]
        # Tells _test_grad_layout to construct ConvNet with all layers on this process's first assigned device.
        layer_devs = dev0
        local_batch_size = 8
        self._test_grad_layout(replica_devices, layer_devs, local_batch_size)

    @requires_nccl()
    @skip_if_lt_x_gpu(4)
    @skip_if_rocm
    def test_grad_layout_2devicemodule(self):
        int_devices = gpus_for_rank(self.world_size)[self.rank][:2]
        dev0 = torch.device("cuda:" + str(int_devices[0]))
        dev1 = torch.device("cuda:" + str(int_devices[1]))
        # DDP's default behavior for a multi-device module is "don't replicate."
        replica_devices = None
        # Tells _test_grad_layout to construct this process's ConvNet on 2 devices, with 2 layers on each device.
        layer_devs = [dev0] * 2 + [dev1] * 2
        local_batch_size = 8
        self._test_grad_layout(replica_devices, layer_devs, local_batch_size)

    @requires_nccl()
    @skip_if_lt_x_gpu(2)
    def test_param_layout_mismatch_error(self):
        store = c10d.FileStore(self.file_name, self.world_size)
        process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)

        dev0 = torch.device("cuda:" + str(gpus_for_rank(self.world_size)[self.rank][0]))
        layer_devs = dev0
        layer_formats = (
            [torch.contiguous_format] * 4
            if self.rank == 0
            else [torch.channels_last] * 4
        )
        layer_dtypes = [torch.float] * 4

        m = ConvNet(layer_devs, layer_formats, layer_dtypes)
        if self.rank == 0:
            m_ddp = DistributedDataParallel(
                m, device_ids=[dev0], process_group=process_group
            )
        else:
            with self.assertRaisesRegex(
                RuntimeError,
                ".* appears not to match strides of the same param in process 0",
            ):
                m_ddp = DistributedDataParallel(
                    m, device_ids=[dev0], process_group=process_group
                )

    def _gpu_model_with_ddp_comm_hook(
        self,
        process_group,
        hook=None,
        gradient_as_bucket_view=False,
        state=None,
        static_graph=False,
    ):
        device_id = gpus_for_rank(self.world_size)[self.rank][0]
        gpu_model = DistributedDataParallel(
            ModuleForDdpCommHook().to(device_id),
            device_ids=[device_id],
            process_group=process_group,
            gradient_as_bucket_view=gradient_as_bucket_view,
            static_graph=static_graph,
        )

        # Register a DDP communication hook if one is provided.
        if hook is not None:
            gpu_model.register_comm_hook(state, hook)

        return gpu_model

    @requires_nccl()
    @skip_if_lt_x_gpu(2)
    def test_ddp_comm_hook_future_passing_gpu_nccl(self):
        """
        This unit test verifies whether the Future object is passed properly using the NCCL backend.
        The hook callback function creates a Future object and sets a value to it.
        """
        store = c10d.FileStore(self.file_name, self.world_size)
        process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)

        # Get GPU model with simple_hook registered.
        gpu_model = self._gpu_model_with_ddp_comm_hook(process_group, self._simple_hook)

        # check whether the grads are equal to what simple_hook's then callback returns.
        # without the comm_hook, result would be 0.25 * torch.ones(2, 2).
        self._run_and_verify_hook(gpu_model, 8, 2 * torch.ones(2, 2))

    def _test_ddp_comm_hook_allreduce_hook_nccl(
        self, gradient_as_bucket_view=False, static_graph=False
    ):
        """
        This unit test verifies whether a DDP communication hook that just calls
        allreduce gives the same result as the case of no hook registered.
        Without the then callback, the future_value in the reducer is no longer
        a PyObject, and this unit test verifies that future_value is properly checked.
        """
        store = c10d.FileStore(self.file_name, self.world_size)
        process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)

        def allreduce_hook(
            state: object, bucket: dist.GradBucket
        ) -> torch.futures.Future[torch.Tensor]:
            tensors = [bucket.buffer() / self.world_size]
            return (
                process_group.allreduce(tensors)
                .get_future()
                .then(lambda fut: fut.value()[0])
            )
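        # Dividing by world_size before the sum-allreduce produces the mean,
        # which is what DDP computes when no hook is registered.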

        # Get GPU model with allreduce_hook registered.
        gpu_model = self._gpu_model_with_ddp_comm_hook(
            process_group, allreduce_hook, gradient_as_bucket_view, static_graph
        )

        # check whether the grads are equal to what DDP without hook would return.
        self._run_and_verify_hook(gpu_model, 8, 0.25 * torch.ones(2, 2))

    def _test_default_ddp_comm_hooks_nccl(self, gradient_as_bucket_view=False):
        """
        This unit test verifies whether the default Python DDP communication hooks ALLREDUCE,
        FP16_COMPRESS, and BF16_COMPRESS give the same result as the case of no hook registered.
        """
        store = c10d.FileStore(self.file_name, self.world_size)
        process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)

        # For these default DDP comm hooks, the only state is the process group.
        state = process_group
        hook_options = [default.allreduce_hook, default.fp16_compress_hook]
        if (
            not TEST_WITH_ROCM
            and BFLOAT16_AVAILABLE
            and c10d.is_nccl_available()
            and torch.cuda.nccl.version() >= (2, 10)
        ):
            hook_options.append(default.bf16_compress_hook)
        for hook in hook_options:
            # Get GPU model with the hook registered.
            # The first arg 'process_group' is used for initializing the test environment,
            # so it cannot be replaced by 'state', although they have the same value.
            gpu_model = self._gpu_model_with_ddp_comm_hook(
                process_group, hook, gradient_as_bucket_view, state
            )

            # check whether the grads are equal to what DDP without hook would return.
            self._run_and_verify_hook(gpu_model, 8, 0.25 * torch.ones(2, 2))

    def _test_fp16_compress_wrapper(self, gradient_as_bucket_view=False):
        """
        This unit test verifies whether wrapping the ALLREDUCE and POWER_SGD hooks with
        the FP16_WRAPPER can give the same result as when there is no hook registered.
        """
        store = c10d.FileStore(self.file_name, self.world_size)
        process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)
        powerSGD_state = powerSGD.PowerSGDState(process_group=process_group)

        hook_args = [
            (powerSGD.powerSGD_hook, powerSGD_state),
            (default.allreduce_hook, process_group),
        ]

        for hook, state in hook_args:
            gpu_model = self._gpu_model_with_ddp_comm_hook(
                process_group,
                default.fp16_compress_wrapper(hook),
                gradient_as_bucket_view,
                state,
            )

            # check whether the grads are equal to what DDP without hook would return.
            self._run_and_verify_hook(gpu_model, 8, 0.25 * torch.ones(2, 2))

    def _test_bf16_compress_wrapper(self, gradient_as_bucket_view=False):
        """
        This unit test verifies whether wrapping the ALLREDUCE and POWER_SGD hooks with
        the BF16_WRAPPER can give the same result as when there is no hook registered.
        """
        store = c10d.FileStore(self.file_name, self.world_size)
        process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)
        powerSGD_state = powerSGD.PowerSGDState(process_group=process_group)

        hook_args = [
            (powerSGD.powerSGD_hook, powerSGD_state),
            (default.allreduce_hook, process_group),
        ]

        for hook, state in hook_args:
            gpu_model = self._gpu_model_with_ddp_comm_hook(
                process_group,
                default.bf16_compress_wrapper(hook),
                gradient_as_bucket_view,
                state,
            )

            # check whether the grads are equal to what DDP without hook would return.
            self._run_and_verify_hook(gpu_model, 8, 0.25 * torch.ones(2, 2))

    def _test_powerSGD_ddp_comm_hook_nccl(self, gradient_as_bucket_view=False):
        """
        This unit test verifies whether the Python DDP communication hook POWER_SGD
        gives the same result as the case of no hook registered.
        """
        store = c10d.FileStore(self.file_name, self.world_size)
        process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)

        # Get GPU model with the hook registered.
        # Test the hook with different algorithmic configs.
        for use_error_feedback, warm_start, batch_tensors_with_same_shape in product(
            [True, False], [True, False], [True, False],
        ):
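            # 2 * 2 * 2 = 8 configurations, each exercised with both
            # the per-tensor and the batched PowerSGD hook variants.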
            state = powerSGD.PowerSGDState(
                process_group=process_group,
                matrix_approximation_rank=1,
                use_error_feedback=use_error_feedback,
                warm_start=warm_start,
                batch_tensors_with_same_shape=batch_tensors_with_same_shape,
            )
            for hook in [powerSGD.powerSGD_hook, powerSGD.batched_powerSGD_hook]:
                gpu_model = self._gpu_model_with_ddp_comm_hook(
                    process_group, hook, gradient_as_bucket_view, state
                )

                # check whether the grads are equal to what DDP without hook would return.
                self._run_and_verify_hook(gpu_model, 8, 0.25 * torch.ones(2, 2))

    def _test_builtin_ddp_comm_hooks_nccl(self, gradient_as_bucket_view=False):
        """
        This unit test verifies whether the built-in C++ DDP communication hooks ALLREDUCE
        and FP16_COMPRESS give the same result as the case of no hook registered.
        """
        store = c10d.FileStore(self.file_name, self.world_size)
        process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)

        for comm_hook_type in [
            dist.BuiltinCommHookType.ALLREDUCE,
            dist.BuiltinCommHookType.FP16_COMPRESS,
        ]:
            # Get GPU model with the built-in communication hook.
            gpu_model = self._gpu_model_with_builtin_ddp_comm_hook(
                process_group, comm_hook_type, gradient_as_bucket_view
            )

            # check whether the grads are equal to what DDP without hook would return.
            self._run_and_verify_hook(gpu_model, 8, 0.25 * torch.ones(2, 2))

    @requires_nccl()
    @skip_if_lt_x_gpu(2)
    def test_ddp_comm_hook_allreduce_hook_nccl(self):
        self._test_ddp_comm_hook_allreduce_hook_nccl()

    @requires_nccl()
    @skip_if_lt_x_gpu(2)
    def test_default_ddp_comm_hooks_nccl(self):
        self._test_default_ddp_comm_hooks_nccl()

    @requires_nccl()
    @skip_if_lt_x_gpu(2)
    def test_fp16_compress_wrapper_nccl(self):
        self._test_fp16_compress_wrapper()

    @requires_nccl()
    @requires_nccl_version((2, 10), "Need NCCL 2.10+ for BF16_COMPRESS")
    @sandcastle_skip_if(
        not BFLOAT16_AVAILABLE,
        "BFloat16 is only supported by CUDA 11+",
    )
    @skip_if_lt_x_gpu(2)
    @skip_if_rocm
    def test_bf16_compress_wrapper_nccl(self):
        self._test_bf16_compress_wrapper()

    @requires_nccl()
    @skip_if_lt_x_gpu(2)
    def test_builtin_ddp_comm_hooks_nccl(self):
        self._test_builtin_ddp_comm_hooks_nccl()

    @requires_nccl()
    @skip_if_lt_x_gpu(2)
    def test_powerSGD_ddp_comm_hook_nccl(self):
        self._test_powerSGD_ddp_comm_hook_nccl()

    @requires_nccl()
    @skip_if_lt_x_gpu(2)
    def test_ddp_comm_hook_allreduce_hook_nccl_grad_is_view(self):
        self._test_ddp_comm_hook_allreduce_hook_nccl(gradient_as_bucket_view=True)

    @requires_nccl()
    @skip_if_lt_x_gpu(2)
    def test_ddp_comm_hook_allreduce_hook_nccl_static_graph(self):
        self._test_ddp_comm_hook_allreduce_hook_nccl(static_graph=True)

    @requires_nccl()
    @skip_if_lt_x_gpu(2)
    def test_default_ddp_comm_hooks_nccl_is_view(self):
        self._test_default_ddp_comm_hooks_nccl(gradient_as_bucket_view=True)

    @requires_nccl()
    @skip_if_lt_x_gpu(2)
    def test_fp16_compress_wrapper_is_view(self):
        self._test_fp16_compress_wrapper(gradient_as_bucket_view=True)

    @requires_nccl()
    @requires_nccl_version((2, 10), "Need NCCL 2.10+ for BF16_COMPRESS")
    @sandcastle_skip_if(
        not BFLOAT16_AVAILABLE,
        "BFloat16 is only supported by CUDA 11+",
    )
    @skip_if_lt_x_gpu(2)
    @skip_if_rocm
    def test_bf16_compress_wrapper_is_view(self):
        self._test_bf16_compress_wrapper(gradient_as_bucket_view=True)

    @requires_nccl()
    @skip_if_lt_x_gpu(2)
    def test_builtin_ddp_comm_hooks_nccl_grad_is_view(self):
        self._test_builtin_ddp_comm_hooks_nccl(gradient_as_bucket_view=True)

    @requires_nccl()
    @skip_if_lt_x_gpu(2)
    def test_powerSGD_ddp_comm_hook_nccl_grad_is_view(self):
        self._test_powerSGD_ddp_comm_hook_nccl(gradient_as_bucket_view=True)

    @requires_nccl()
    @skip_if_lt_x_gpu(2)
    def test_ddp_comm_hook_allreduce_with_then_hook_nccl(self):
        """
        This unit test verifies whether a DDP communication hook that calls allreduce, then
        multiplies the result by ten and divides by two, gives the expected result.
        """
        store = c10d.FileStore(self.file_name, self.world_size)
        process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)

        def allreduce_with_then_hook(
            state: object, bucket: dist.GradBucket
        ) -> torch.futures.Future[torch.Tensor]:
            tensors = [bucket.buffer() / self.world_size]
            fut = process_group.allreduce(tensors).get_future()

            def mult(fut):
                # Multiply the result by 10.
                return 10 * fut.value()[0]

            def div(fut):
                # Divide the result by 2.
                return 0.5 * fut.value()

            return fut.then(mult).then(div)

        # Get GPU model with allreduce_with_then_hook registered.
        gpu_model = self._gpu_model_with_ddp_comm_hook(
            process_group, allreduce_with_then_hook
        )

        # check whether the grads are equal to what allreduce returns multiplied by 5
        # (x10 then /2). Without the comm_hook, the result would be 0.25 * torch.ones(2, 2),
        # so with the hook it becomes 1.25 * torch.ones(2, 2).
        self._run_and_verify_hook(gpu_model, 8, 1.25 * torch.ones(2, 2))

    class AcceptsParam(torch.nn.Module):
        def __init__(self, p, factor):
            super().__init__()
            self.a = p
            self.f = factor

        def forward(self, input):
            return input + self.a * self.f

    @requires_nccl()
    @skip_if_lt_x_gpu(2)
    def test_ddp_weight_sharing(self):
        store = c10d.FileStore(self.file_name, self.world_size)
        process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)

        size = 2048 * 2048
        dev = self.rank
        world = self.world_size

        p = torch.nn.Parameter(torch.randn(size, requires_grad=True))

        for try_set_to_none, use_bucket_view in product((False, True), (False, True)):
            m = torch.nn.Sequential(
                self.AcceptsParam(p, dev + 1), self.AcceptsParam(p, dev + 1)
            ).cuda(dev)

            m = torch.nn.parallel.DistributedDataParallel(
                m,
                bucket_cap_mb=1,
                gradient_as_bucket_view=use_bucket_view,
                device_ids=[dev],
                process_group=process_group,
            )

            for i in range(3):
                m.zero_grad(set_to_none=try_set_to_none)
                m(1).sum().backward()

                # Each param value is multiplied by "rank + 1" twice in forward, so the grad
                # values produced by a particular rank should be 2. * (rank + 1).
                # Summing these over ranks and dividing by world size gives the expected result:
                analytic = torch.full_like(
                    p, 2.0 * (world * (world + 1.0) / 2.0) / world, device=dev
                )
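                # e.g. with world == 2: (2 * 1 + 2 * 2) / 2 = 3.0 for every element.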
                for name, p in m.named_parameters():
                    self.assertEqual(
                        p.grad,
                        analytic,
                        "mismatch at "
                        + name
                        + ".grad for "
                        + "set_to_none = {}, use_bucket_view = {}".format(
                            try_set_to_none, use_bucket_view
                        ),
                    )

    @requires_nccl()
    @skip_if_lt_x_gpu(2)
    def test_channels_last_contig(self):
        store = c10d.FileStore(self.file_name, self.world_size)
        process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)
        device = torch.device(f"cuda:{self.rank}")
        tensor = torch.ones(
            (2, 16, 768, 1152), dtype=torch.float32, device=device
        ).to(memory_format=torch.channels_last)
        process_group.broadcast([tensor]).wait()


class NcclErrorHandlingTest(MultiProcessTestCase):
    def setUp(self):
        super(NcclErrorHandlingTest, self).setUp()
        # Need to skip return code checking for these tests since the child
        # processes don't exit cleanly.
        self.skip_return_code_checks = [
            self.test_nccl_errors_blocking_abort.__wrapped__,
            self.test_nccl_errors_blocking_sigkill.__wrapped__,
            self.test_nccl_errors_blocking_sigterm.__wrapped__,
            self.test_nccl_errors_blocking_nonzero_exit.__wrapped__,
        ]
        # NCCL_BLOCKING_WAIT overrides NCCL_ASYNC_ERROR_HANDLING, so tests
        # that use NCCL_BLOCKING_WAIT still exercise it as expected.
        os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "1"
        self._spawn_processes()

    def tearDown(self):
        super(NcclErrorHandlingTest, self).tearDown()
        try:
            os.remove(self.file_name)
        except OSError:
            pass

    @property
    def op_timeout_sec(self):
        return 1

    @property
    def world_size(self):
        return 3

    @property
    def blocking_wait_error_msg(self):
        return "Caught collective operation timeout"

    def _run_all_reduce(self, pg):
        pg.allreduce(torch.rand(10).cuda(self.rank))

    @requires_nccl()
    @requires_nccl_version((2, 4, 0), "Need NCCL 2.4+ for error checking")
    @skip_if_lt_x_gpu(3)
    @skip_if_rocm
    @sandcastle_skip("Test does not pass when run locally")
    def test_nccl_errors_nonblocking(self):
        # Note: we unset and restore NCCL_ASYNC_ERROR_HANDLING for this test
        # since test_c10d_common runs with async error handling by default,
        # but this test checks the behavior when it is not enabled.
        prev_nccl_async_error_handling = os.environ.get(
            "NCCL_ASYNC_ERROR_HANDLING", None
        )
        os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "0"
        store = c10d.FileStore(self.file_name, self.world_size)
        process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)
        process_group.allreduce(torch.rand(10).cuda(self.rank))
        if self.rank == 0:
            # This allreduce does not block the Python thread, as allreduce enqueues
            # the cuda operation, and then wait only blocks the current cuda
            # stream.
            work = process_group.allreduce(torch.rand(10).cuda(self.rank))
            work.wait()

            # Now the work scheduled next should hang forever since the previous
            # allreduce will never complete.
            t = threading.Thread(target=self._run_all_reduce, args=(process_group,))
            t.daemon = True
            t.start()
            t.join(int(get_timeout(self.id()) / 5))
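            # With async error handling disabled, no error ever surfaces, so the
            # helper thread is expected to still be stuck in the hanging collective.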
            self.assertTrue(t.is_alive())

        if prev_nccl_async_error_handling is not None:
            os.environ["NCCL_ASYNC_ERROR_HANDLING"] = prev_nccl_async_error_handling

    def _test_nccl_errors_blocking(self, func):
        store = c10d.FileStore(self.file_name, self.world_size)
        process_group = c10d.ProcessGroupNCCL(
            store,
            self.rank,
            self.world_size,
            timeout=timedelta(seconds=10),
        )
        process_group.allreduce(torch.rand(10).cuda(self.rank))
        if self.rank == 0:
            work = process_group.allreduce(torch.rand(10).cuda(self.rank))
            with self.assertRaisesRegex(RuntimeError, self.blocking_wait_error_msg):
                # Operation would time out in blocking mode.
                work.wait(timeout=timedelta(seconds=self.op_timeout_sec))
            # Run some GPU operations to make sure cuda has not gotten stuck.
            # It was observed cuda could get stuck if NCCL communicators were
            # not properly aborted before throwing RuntimeError.
            a = torch.rand(10).cuda(self.rank)
        elif self.rank == 1:
            # Clean up structures (e.g., files backing the FileStore) before going down.
            del process_group
            func()
        else:
            # Wait for timeout
            time.sleep(2 * self.op_timeout_sec)

            # Now verify communicators on this rank have been aborted by the watchdog thread.
            self._wait_for_comm_abort(process_group)

    @with_nccl_blocking_wait
    @requires_nccl()
    @requires_nccl_version((2, 4, 0), "Need NCCL 2.4+ for error checking")
    @skip_if_lt_x_gpu(3)
    @skip_if_rocm
    def test_nccl_errors_blocking_clean_exit(self):
        self._test_nccl_errors_blocking(lambda: sys.exit(0))

    @with_nccl_blocking_wait
    @requires_nccl()
    @requires_nccl_version((2, 4, 0), "Need NCCL 2.4+ for error checking")
    @skip_if_lt_x_gpu(3)
    @skip_if_rocm
    def test_nccl_errors_blocking_nonzero_exit(self):
        self._test_nccl_errors_blocking(lambda: sys.exit(1))

    @with_nccl_blocking_wait
    @requires_nccl()
    @requires_nccl_version((2, 4, 0), "Need NCCL 2.4+ for error checking")
    @skip_if_lt_x_gpu(3)
    @skip_if_rocm
    @sandcastle_skip(
        "Frequently times out, see https://github.com/pytorch/pytorch/issues/58920"
    )
    def test_nccl_errors_blocking_abort(self):
        self._test_nccl_errors_blocking(lambda: os.abort())

    @with_nccl_blocking_wait
    @requires_nccl()
    @requires_nccl_version((2, 4, 0), "Need NCCL 2.4+ for error checking")
    @skip_if_lt_x_gpu(3)
    @skip_if_rocm
    def test_nccl_errors_blocking_sigkill(self):
        self._test_nccl_errors_blocking(lambda: os.kill(os.getpid(), signal.SIGKILL))

    @with_nccl_blocking_wait
    @requires_nccl()
    @requires_nccl_version((2, 4, 0), "Need NCCL 2.4+ for error checking")
    @skip_if_lt_x_gpu(3)
    @skip_if_rocm
    def test_nccl_errors_blocking_sigterm(self):
        self._test_nccl_errors_blocking(lambda: os.kill(os.getpid(), signal.SIGTERM))

    @with_nccl_blocking_wait
    @requires_nccl()
    @requires_nccl_version((2, 4, 0), "Need NCCL 2.4+ for error checking")
    @skip_if_lt_x_gpu(3)
    def test_nccl_blocking_wait_with_barrier(self):
        store = c10d.FileStore(self.file_name, self.world_size)
        process_group = c10d.ProcessGroupNCCL(
            store,
            self.rank,
            self.world_size,
            timeout=timedelta(seconds=10),
        )
        process_group.barrier().wait()
        if self.rank == 0:
            with self.assertRaisesRegex(RuntimeError, self.blocking_wait_error_msg):
                # This should timeout
                process_group.barrier().wait(
                    timeout=timedelta(seconds=self.op_timeout_sec)
                )

    def _run_invalid_nccl_blocking_wait_env(self, val):
        os.environ["NCCL_BLOCKING_WAIT"] = val
        store = c10d.FileStore(self.file_name, self.world_size)
        with self.assertRaises(RuntimeError):
            process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)

    @requires_nccl()
    @skip_if_lt_x_gpu(3)
    def test_invalid_nccl_blocking_wait_env(self):
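        # Non-numeric, negative, and overly large values are all invalid for
        # NCCL_BLOCKING_WAIT and should fail process group construction.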
        self._run_invalid_nccl_blocking_wait_env("abc")
        self._run_invalid_nccl_blocking_wait_env("-1")
        self._run_invalid_nccl_blocking_wait_env("2147483647")
        self._run_invalid_nccl_blocking_wait_env("4294967295")

    def _check_valid_comm_exception(self, e):
        exception_str = str(e)
        valid_exceptions = [
            "NCCL communicator was aborted",
            "NCCL communicator encountered error",
            "Caught collective operation timeout",
        ]
        return any(exc in exception_str for exc in valid_exceptions)

    def _wait_for_comm_abort(self, process_group, timeout=None):
        """
        Waits for the watchdog thread to abort communicators for the process group.
        """
        while True:
            try:
                if not timeout:
                    process_group.allreduce(torch.rand(10).cuda(self.rank)).wait()
                else:
                    assert isinstance(timeout, timedelta)
                    process_group.allreduce(torch.rand(10).cuda(self.rank)).wait(
                        timeout=timeout
                    )
            except Exception as e:
                if self._check_valid_comm_exception(e):
                    return
                else:
                    raise e
            time.sleep(1)

    @with_nccl_blocking_wait
    @requires_nccl()
    @requires_gloo()
    @skip_if_lt_x_gpu(3)
    def test_nccl_timeout(self):
        store = c10d.FileStore(self.file_name, self.world_size)

        # Initialize process_group.
        process_group = c10d.ProcessGroupNCCL(
            store, self.rank, self.world_size, timeout=timedelta(seconds=10)
        )
        # Control gloo pg used as go-ahead signal/barrier
        # to coordinate between ranks.
        pg_gloo = c10d.ProcessGroupGloo(store, self.rank, self.world_size)
        failed_collective_timeout = timedelta(milliseconds=100)
        process_group.allreduce(torch.rand(10).cuda(self.rank)).wait(
            timeout=timedelta(seconds=5)
        )

        if self.rank == 0:
            # This should time out in about 100 milliseconds.
            # The watchdog may abort the timed-out work, resulting in an NCCL error
            # instead of an operation timeout.
            with self.assertRaisesRegex(RuntimeError, self.blocking_wait_error_msg):
                process_group.allreduce(torch.rand(10).cuda(self.rank)).wait(
                    timeout=failed_collective_timeout
                )
            # Now do a barrier to tell other rank to go ahead.
            pg_gloo.barrier().wait()
        else:
            # Wait on rank 0 to fail.
            try:
                pg_gloo.barrier().wait()
            except Exception as e:
                raise ValueError(
                    f"Rank {self.rank} barrier timed out waiting for rank 0 "
                    f"with error: {str(e)}"
                )
            # Now verify communicators on this rank have
            # been aborted by the watchdog.
            self._wait_for_comm_abort(process_group, failed_collective_timeout)


class CommTest(test_c10d_common.AbstractCommTest, MultiProcessTestCase):
    def setUp(self):
        super(CommTest, self).setUp()
        # NCCL_BLOCKING_WAIT overrides NCCL_ASYNC_ERROR_HANDLING, so tests
        # that use NCCL_BLOCKING_WAIT still exercise it as expected.
        os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "1"
        self._spawn_processes()

    def tearDown(self):
        super(CommTest, self).tearDown()
        try:
            os.remove(self.file_name)
        except OSError:
            pass

    def _test_broadcast_coalesced(self, process_group, device, root_rank):
        half = torch.float16

        # No support for float16 for CPU tensors
        if device == torch.device("cpu"):
            half = torch.float32

        target = torch.arange(60, dtype=half, device=device).chunk(5)
        target += torch.arange(60, dtype=torch.float32, device=device).chunk(5)
        target += torch.arange(60, dtype=half, device=device).chunk(5)
        target += torch.arange(60, dtype=torch.float64, device=device).chunk(5)
        target += torch.arange(60, dtype=half, device=device).chunk(5)
        target += torch.arange(60, dtype=torch.float32, device=device).chunk(5)
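        # Interleaving half, float, and double chunks likely exercises the
        # coalescing logic across dtype boundaries.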

        # The tensors to pass to broadcast are identical to the target
        # only on the process that is the root of the broadcast.
        if self.rank == root_rank:
            tensors = list(tensor.clone() for tensor in target)
        else:
            tensors = list(torch.zeros_like(tensor) for tensor in target)

        if self.rank != root_rank:
            self.assertNotEqual(tensors, target)

        c10d._broadcast_coalesced(
            process_group, tensors, buffer_size=256, src=root_rank
        )

        if self.rank != root_rank:
            self.assertEqual(tensors, target)

    @requires_nccl()
    @skip_if_lt_x_gpu(2)
    def test_broadcast_coalesced_nccl(self):
        store = c10d.FileStore(self.file_name, self.world_size)
        process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)
        device = torch.device("cuda:%d" % self.rank)
        ranks = [0, 1]
        for root_rank in ranks:
            self._test_broadcast_coalesced(process_group, device, root_rank)

    @requires_nccl()
    @skip_if_lt_x_gpu(2)
    def test_all_reduce_coalesced_nccl(self):
        store = c10d.FileStore(self.file_name, self.world_size)
        process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)
        device = torch.device("cuda:%d" % self.rank)
        tensors = [
            torch.full((60 + i,), self.rank + 1 + i, device=device, dtype=torch.float)
            for i in range(5)
        ]
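        # After the sum-allreduce, every element of tensor i equals the sum over
        # ranks of (rank + 1 + i), i.e. world_size * (i + (world_size + 1) / 2).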
        torch.distributed.all_reduce_coalesced(tensors, group=process_group)
        for i, t in enumerate(tensors):
            self.assertEqual(
                t,
                torch.full_like(
                    t, self.world_size * (i + (self.world_size + 1.) / 2.)
                ),
            )

    @requires_nccl()
    @skip_if_lt_x_gpu(2)
    def test_sequence_num_set_default_pg_nccl(self):
        torch.cuda.set_device(self.rank)
        self._test_sequence_num_set_default_pg(backend="nccl")

    @skip_if_lt_x_gpu(2)
    @requires_nccl()
    def test_sequence_num_incremented_nccl_default(self):
        self._test_sequence_num_incremented_default_group("nccl")

    @skip_if_lt_x_gpu(4)
    @requires_nccl()
    def test_sequence_num_incremented_nccl_subgroup(self):
        if self.world_size < 4:
            return sandcastle_skip("Test requires world_size of at least 4")
        self._test_sequence_num_incremented_subgroup("nccl")

    @requires_nccl()
    @skip_if_lt_x_gpu(2)
    def test_sequence_num_set_nccl_new_group(self):
        torch.cuda.set_device(self.rank)
        self._test_sequence_num_set_new_group(backend="nccl")

    @requires_nccl()
    @skip_if_lt_x_gpu(2)
    def test_pass_nccl_options_high_priority_stream(self):
        pg_opts = c10d.ProcessGroupNCCL.Options()
        pg_opts.is_high_priority_stream = True

        store = c10d.FileStore(self.file_name, self.world_size)
        # Test that init_process_group accepts options.
        dist.init_process_group(
            "nccl",
            world_size=self.world_size,
            rank=self.rank,
            store=store,
            pg_options=pg_opts,
        )

        # Test with new_group
        pg = c10d.new_group([0, 1], pg_options=pg_opts)
        # Verify that the process group was constructed with a high-priority stream.
        self.assertTrue(pg.options.is_high_priority_stream)
        # Verify that the process group works as expected.
        t = torch.tensor([self.rank + 1] * 10).cuda(self.rank)
        pg.allreduce(t).wait()
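        # With ranks 0 and 1 each contributing rank + 1, the allreduce sum is 1 + 2 = 3.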
        expected_tensor = torch.tensor([3] * 10).cuda(self.rank)
        self.assertEqual(expected_tensor, t)

    @requires_nccl()
    @skip_if_lt_x_gpu(4)
    def test_nccl_barrier(self):
        store = c10d.FileStore(self.file_name, self.world_size)
        c10d.init_process_group(
            backend="nccl", rank=self.rank, world_size=self.world_size, store=store
        )
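
        # Each rank uses device 2 * rank (GPUs 0 and 2 with two ranks), so the
        # ranks land on distinct devices.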
        t = torch.tensor([self.rank + 1] * 10).cuda(2 * self.rank)
        c10d.all_reduce(t)
        expected_tensor = torch.tensor([3] * 10).cuda(2 * self.rank)
        self.assertEqual(expected_tensor, t)

        # Test with new_group
        pg = c10d.new_group([0, 1])
        t = torch.tensor([self.rank + 1] * 10).cuda(2 * self.rank)
        pg.allreduce(t).wait()
        self.assertEqual(expected_tensor, t)

        pg = c10d.new_group([0])
        if self.rank == 0:
            t = torch.tensor([self.rank + 1] * 10).cuda(2 * self.rank)
            expected_tensor = torch.tensor([self.rank + 1] * 10).cuda(2 * self.rank)
            pg.allreduce(t).wait()
            self.assertEqual(expected_tensor, t)

        pg = c10d.new_group([1])
        if self.rank == 1:
            t = torch.tensor([self.rank + 1] * 10).cuda(2 * self.rank)
            expected_tensor = torch.tensor([self.rank + 1] * 10).cuda(2 * self.rank)
            pg.allreduce(t).wait()
            self.assertEqual(expected_tensor, t)

    @requires_nccl()
    @skip_if_lt_x_gpu(4)
    def test_nccl_barrier_timeout(self):
        os.environ["ENABLE_NCCL_HEALTH_CHECK"] = "1"
        store = c10d.FileStore(self.file_name, self.world_size)
        if self.rank == 0:
            with self.assertRaisesRegex(
                RuntimeError, "Health check failure"
            ):
                c10d.init_process_group(
                    backend="nccl",
                    rank=self.rank,
                    world_size=self.world_size,
                    store=store,
                    timeout=timedelta(seconds=10),
                )

    @requires_nccl()
    @skip_if_lt_x_gpu(4)
    def test_nccl_barrier_timeout_new_group(self):
        os.environ["ENABLE_NCCL_HEALTH_CHECK"] = "1"
        store = c10d.FileStore(self.file_name, self.world_size)
        c10d.init_process_group(
            backend="nccl",
            rank=self.rank,
            world_size=self.world_size,
            store=store,
            timeout=timedelta(seconds=10),
        )

        if self.rank == 0:
            with self.assertRaisesRegex(
                RuntimeError, "Health check failure"
            ):
                c10d.new_group([0, 1], timeout=timedelta(seconds=1))

            with self.assertRaisesRegex(
                RuntimeError, "Timed out initializing process group"
            ):
                c10d.new_group([0], timeout=timedelta(seconds=1))

    @requires_nccl()
    @skip_if_lt_x_gpu(4)
    def test_nccl_barrier_timeout_new_group_non_member(self):
        os.environ["ENABLE_NCCL_HEALTH_CHECK"] = "1"
        store = c10d.FileStore(self.file_name, self.world_size)
        c10d.init_process_group(
            backend="nccl",
            rank=self.rank,
            world_size=self.world_size,
            store=store,
            timeout=timedelta(seconds=10),
        )

        if self.rank == 1:
            with self.assertRaisesRegex(
                RuntimeError, "Health check failure"
            ):
                c10d.new_group([0, 1], timeout=timedelta(seconds=1))

            with self.assertRaisesRegex(
                RuntimeError, "Timed out initializing process group"
            ):
                c10d.new_group([0], timeout=timedelta(seconds=1))

    @requires_nccl()
    @skip_if_lt_x_gpu(2)
    def test_nccl_barrier_device_ids(self):
        store = c10d.FileStore(self.file_name, self.world_size)
        c10d.init_process_group(
            backend="nccl", rank=self.rank, world_size=self.world_size, store=store
        )

        c10d.barrier(device_ids=[self.rank])

    @requires_nccl()
    @skip_if_lt_x_gpu(2)
    def test_nccl_barrier_device_ids_function_argument(self):
        store = c10d.FileStore(self.file_name, self.world_size)
        c10d.init_process_group(
            backend="nccl", rank=self.rank, world_size=self.world_size, store=store
        )

        with self.assertRaisesRegex(RuntimeError, "Invalid function argument"):
            c10d.barrier(device_ids=self.rank)

    @requires_nccl()
    @skip_if_lt_x_gpu(2)
    @with_dist_debug_levels(levels=["DETAIL"])
    def test_nccl_warn_not_in_group_debug_detail(self):
        self._test_warn_not_in_group(backend="nccl")

    @requires_nccl()
    @skip_if_lt_x_gpu(2)
    @with_dist_debug_levels(levels=["INFO"])
    def test_nccl_warn_not_in_group_debug_info(self):
        self._test_warn_not_in_group(backend="nccl")

    @requires_nccl()
    @skip_if_lt_x_gpu(2)
    @with_dist_debug_levels(levels=["OFF"])
    def test_nccl_warn_not_in_group_debug_off(self):
        self._test_warn_not_in_group(backend="nccl")


if __name__ == "__main__":
    assert (
        not torch.cuda._initialized
    ), "test_distributed must not have initialized CUDA context on main process"

    run_tests()