Mirror of https://github.com/zebrajr/pytorch.git, synced 2025-12-06 12:20:52 +01:00.
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44090 This is an initial commit pulling in the torchgpipe fork at https://github.com/facebookresearch/fairscale. The purpose of this commit is to just pull in the code and ensure all tests and builds work fine. We will slowly modify this to match our intended API mentioned in https://fb.quip.com/txurAV3zIFox#RPZACAfAKMq. Follow-up PRs will address further changes needed on top of the initial commit. We're pulling the code into the `torch.distributed._pipeline.sync` package. The package is private on purpose since there is a lot of work (ex: docs, API changes etc.) that needs to go in before we can actually officially support this. ghstack-source-id: 114864254 Test Plan: 1) waitforbuildbot 2) Ran all tests on my devgpu Reviewed By: mrshenli Differential Revision: D23493316 fbshipit-source-id: fe3c8b7dadeeb86abdc00e8a8652491b0b16743a
189 lines · 5.8 KiB · Python
# Copyright 2019 Kakao Brain
#
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
import pytest
|
|
import torch
|
|
|
|
from torch.distributed._pipeline.sync.stream import (
|
|
CPUStream,
|
|
current_stream,
|
|
default_stream,
|
|
get_device,
|
|
is_cuda,
|
|
new_stream,
|
|
record_stream,
|
|
use_device,
|
|
use_stream,
|
|
wait_stream,
|
|
)
|
|
|
|
# Decorator for tests that require a CUDA device; the test is skipped
# when torch reports CUDA as unavailable on this machine.
skip_if_no_cuda = pytest.mark.skipif(not torch.cuda.is_available(), reason="cuda required")
|
|
|
|
|
|
class TestNewStream:
    """Tests for ``new_stream`` on both CPU and CUDA devices."""

    def test_new_stream_cpu(self):
        # There is no real stream concept on CPU, so the singleton
        # placeholder must always come back.
        assert new_stream(torch.device("cpu")) is CPUStream

    @skip_if_no_cuda
    def test_new_stream_cuda(self):
        created = new_stream(torch.device("cuda"))
        assert isinstance(created, torch.cuda.Stream)
        # A freshly created stream must not alias the default stream.
        assert created != torch.cuda.default_stream()
class TestCurrentStream:
    """Tests for ``current_stream`` on both CPU and CUDA devices."""

    def test_current_stream_cpu(self):
        # CPU devices always report the singleton placeholder stream.
        assert current_stream(torch.device("cpu")) is CPUStream

    @skip_if_no_cuda
    def test_current_stream_cuda(self):
        active = current_stream(torch.device("cuda"))
        assert isinstance(active, torch.cuda.Stream)
        # Must agree with torch's own notion of the current stream.
        assert active == torch.cuda.current_stream()
class TestDefaultStream:
    """Tests for ``default_stream`` on both CPU and CUDA devices."""

    def test_default_stream_cpu(self):
        # CPU devices always report the singleton placeholder stream.
        assert default_stream(torch.device("cpu")) is CPUStream

    @skip_if_no_cuda
    def test_default_stream_cuda(self):
        dflt = default_stream(torch.device("cuda"))
        assert isinstance(dflt, torch.cuda.Stream)
        # Must agree with torch's own default stream for the device.
        assert dflt == torch.cuda.default_stream()
class TestUseDevice:
    """Smoke tests: ``use_device`` must work as a context manager for
    both device types without raising."""

    def test_use_device_cpu(self):
        # Entering/exiting for a CPU device should be a harmless no-op.
        with use_device(torch.device("cpu")):
            pass

    @skip_if_no_cuda
    def test_use_device_cuda(self):
        with use_device(torch.device("cuda")):
            pass
class TestUseStream:
    """Tests for the ``use_stream`` context manager."""

    def test_use_stream_cpu(self):
        # The CPU placeholder stream is accepted and has no effect.
        with use_stream(CPUStream):
            pass

    @skip_if_no_cuda
    def test_use_stream_cuda(self):
        fresh = new_stream(torch.device("cuda"))
        with use_stream(fresh):
            # Inside the context, the fresh stream must be the current one.
            assert current_stream(torch.device("cuda")) == fresh
class TestGetDevice:
    """Tests that ``get_device`` maps a stream back to its device."""

    def test_get_device_cpu(self):
        assert get_device(CPUStream).type == "cpu"

    @skip_if_no_cuda
    def test_get_device_cuda(self):
        cuda_stream = current_stream(torch.device("cuda"))
        assert get_device(cuda_stream).type == "cuda"
class TestWaitStream:
    """Tests ``wait_stream`` across every CPU/CUDA source-target pairing."""

    def _verify_wait(self, source, target, cuda_sleep=None):
        # Enqueue slow work followed by a tensor fill on the target stream.
        with use_stream(target):
            if is_cuda(target):
                cuda_sleep(0.5)
            x = torch.ones(100, 100, device=get_device(target))

        # Make the source stream wait for everything queued on the target.
        wait_stream(source, target)

        # Work on the source stream must now observe the fully written tensor.
        with use_stream(source):
            assert x.sum().item() == 10000

    def test_wait_stream_cpu_cpu(self):
        self._verify_wait(CPUStream, CPUStream)

    @skip_if_no_cuda
    def test_wait_stream_cpu_cuda(self, cuda_sleep):
        self._verify_wait(CPUStream, new_stream(torch.device("cuda")), cuda_sleep)

    @skip_if_no_cuda
    def test_wait_stream_cuda_cpu(self, cuda_sleep):
        self._verify_wait(new_stream(torch.device("cuda")), CPUStream, cuda_sleep)

    @skip_if_no_cuda
    def test_wait_stream_cuda_cuda(self, cuda_sleep):
        self._verify_wait(
            current_stream(torch.device("cuda")),
            new_stream(torch.device("cuda")),
            cuda_sleep,
        )
class TestRecordStream:
    """Tests for ``record_stream``, which marks a tensor's memory as in use
    by a stream so the CUDA caching allocator does not reuse it too early.

    NOTE(review): these tests rely on the caching allocator's observable
    reuse behavior (same/different ``data_ptr``), so statement order matters.
    """

    def test_record_stream_cpu(self):
        # It should silently ignore CPU tensors.
        x = torch.rand(1, device=torch.device("cpu"))
        record_stream(x, CPUStream)

    @skip_if_no_cuda
    def test_record_stream_cuda(self, cuda_sleep):
        # This test detects unexpected block reallocation. For reliable test,
        # the stream to allocate tensors is isolated. The allocator will not
        # reuse free blocks which were allocated from another stream.
        stream_alloc = new_stream(torch.device("cuda"))
        with torch.cuda.stream(stream_alloc):
            x = torch.rand(1, device=torch.device("cuda"))

        # Record 'x' on a second stream and keep that stream busy so the
        # allocator must consider the block still in use.
        stream = new_stream(torch.device("cuda"))
        record_stream(x, stream)
        with use_stream(stream):
            cuda_sleep(0.5)

        # 'x' is deleted at Python's perspective. But the block of 'x' is still
        # required for 'stream'. 'y' shouldn't be allocated to the block.
        data_ptr = x.data_ptr()
        del x
        stream_alloc.synchronize()
        with torch.cuda.stream(stream_alloc):
            y = torch.rand(1, device=torch.device("cuda"))
        assert y.data_ptr() != data_ptr

        # Pause Python until 'stream' finishes tasks queued. Now the block of
        # 'x' is free to be reallocated.
        wait_stream(CPUStream, stream)
        with torch.cuda.stream(stream_alloc):
            z = torch.rand(1, device=torch.device("cuda"))
        assert z.data_ptr() == data_ptr

    @skip_if_no_cuda
    def test_record_stream_shifted_view(self, cuda_sleep):
        # Issue: https://github.com/pytorch/pytorch/issues/27366
        # A view that does not start at the base of the allocation must still
        # protect the whole underlying block when recorded on a stream.
        stream_alloc = new_stream(torch.device("cuda"))
        with torch.cuda.stream(stream_alloc):
            x = torch.rand(2, device=torch.device("cuda"))

        # 'y' is a shifted view: its data pointer sits past the block base.
        y = x[1:]
        assert y.data_ptr() > x.data_ptr()

        stream = new_stream(torch.device("cuda"))
        with use_stream(stream):
            cuda_sleep(0.5)
        record_stream(y, stream)

        data_ptr = x.data_ptr()
        del x, y

        # While 'stream' is still busy, the freed block must not be reused.
        stream_alloc.synchronize()
        with torch.cuda.stream(stream_alloc):
            z = torch.rand(2, device=torch.device("cuda"))
        assert z.data_ptr() != data_ptr