pytorch/test/test_cuda_primary_ctx.py
SsnL 8482efb203 pin_memory malloc now uses existing context if available. (#22229)
Summary:
This is achieved by using `cuDevicePrimaryCtxGetState` to check whether a primary context exists on a device. The call is not too slow, as shown by this benchmark of a single call on CUDA 10.1, Titan Xp, driver 415.27:
```
---------------------------------------------------------------------
Benchmark                              Time           CPU Iterations
---------------------------------------------------------------------
BM_cuDevicePrimaryCtxGetState        301 ns        301 ns    2319746
```

Commits:

1. Add `CUDAHooks::getDeviceWithPrimaryContext`, which returns the index of a device that has a primary context (if one exists).
    Link `c10/cuda` against `libcuda` for device API calls.
2. Use `getDeviceWithPrimaryContext` to check primary context in `pin_memory`.
    Fix `OptionalDeviceGuard` doc.
3. Refactor `test_cuda_primary_ctx.py` to support multiple tests.
    Add test for this in that file.

Fixes https://github.com/pytorch/pytorch/issues/21081.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/22229

Differential Revision: D16170194

Pulled By: zou3519

fbshipit-source-id: 485a45f211b7844c9e69c63f3b3b75194a796c5d
2019-07-16 10:18:30 -07:00

101 lines
3.8 KiB
Python

import torch
from common_utils import TestCase, run_tests, skipIfRocm
import unittest
# NOTE: this needs to be run in a brand new process.
#
# TEST_CUDA and TEST_MULTIGPU are computed locally instead of being imported
# from common_cuda: if we imported them, the TEST_CUDNN line from common_cuda
# would be executed multiple times during the execution of this test suite,
# and that causes a CUDA OOM error on Windows.
TEST_CUDA = torch.cuda.is_available()
# `and` short-circuits, so device_count() is only queried when CUDA is usable.
TEST_MULTIGPU = TEST_CUDA and torch.cuda.device_count() >= 2

if not TEST_CUDA:
    print('CUDA not available, skipping tests')
    # Rebind the base class so the test class defined below derives from
    # plain `object` instead of the imported TestCase when CUDA is missing.
    TestCase = object  # noqa: F811
class TestCudaPrimaryCtx(TestCase):
    """Check that CUDA primary contexts are only created on the device in use.

    Every test first allocates a tensor on 'cuda:1', then runs an operation
    and asserts that device 0 still has no primary context while device 1
    does. `setUp` requires that no device has a primary context yet, so each
    test has to run in a process that has not created one (see
    `CTX_ALREADY_CREATED_ERR_MSG`).
    """

    # Failure message used by setUp when a context already exists.
    CTX_ALREADY_CREATED_ERR_MSG = (
        "Tests defined in test_cuda_primary_ctx.py must be run in a process "
        "where CUDA contexts are never created. Use either run_test.py or add "
        "--subprocess to run each test in a different subprocess.")

    @skipIfRocm
    def setUp(self):
        """Fail unless no visible CUDA device has a primary context yet."""
        for device in range(torch.cuda.device_count()):
            # Ensure context has not been created beforehand
            self.assertFalse(
                torch._C._cuda_hasPrimaryContext(device),
                TestCudaPrimaryCtx.CTX_ALREADY_CREATED_ERR_MSG)

    def _assert_ctx_only_on_cuda1(self):
        """Assert device 0 has no primary context and device 1 has one."""
        self.assertFalse(torch._C._cuda_hasPrimaryContext(0))
        self.assertTrue(torch._C._cuda_hasPrimaryContext(1))

    @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected")
    def test_str_repr(self):
        """str() / repr() of a 'cuda:1' tensor must not create a context on device 0."""
        x = torch.randn(1, device='cuda:1')

        # We should have only created context on 'cuda:1'
        self._assert_ctx_only_on_cuda1()

        str(x)
        repr(x)

        # We should still have only created context on 'cuda:1'
        self._assert_ctx_only_on_cuda1()

    @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected")
    def test_copy(self):
        """Copying a 'cuda:1' tensor into a CPU tensor must not create a context on device 0."""
        x = torch.randn(1, device='cuda:1')

        # We should have only created context on 'cuda:1'
        self._assert_ctx_only_on_cuda1()

        y = torch.randn(1, device='cpu')
        y.copy_(x)

        # We should still have only created context on 'cuda:1'
        self._assert_ctx_only_on_cuda1()

    @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected")
    def test_pin_memory(self):
        """Allocating pinned CPU memory must not create a context on device 0.

        Covers `Tensor.pin_memory()` on a fresh tensor, the `pin_memory=True`
        factory argument of `randn` / `zeros` / `empty`, and calling
        `pin_memory()` on a tensor that was created with `pin_memory=True`.
        """
        x = torch.randn(1, device='cuda:1')

        # We should have only created context on 'cuda:1'
        self._assert_ctx_only_on_cuda1()

        x = torch.randn(3, device='cpu').pin_memory()

        # We should still have only created context on 'cuda:1'
        self._assert_ctx_only_on_cuda1()

        x = torch.randn(3, device='cpu', pin_memory=True)

        # We should still have only created context on 'cuda:1'
        self._assert_ctx_only_on_cuda1()

        x = torch.zeros(3, device='cpu', pin_memory=True)

        # We should still have only created context on 'cuda:1'
        self._assert_ctx_only_on_cuda1()

        x = torch.empty(3, device='cpu', pin_memory=True)

        # We should still have only created context on 'cuda:1'
        self._assert_ctx_only_on_cuda1()

        # `x` was created with pin_memory=True just above.
        x = x.pin_memory()

        # We should still have only created context on 'cuda:1'
        self._assert_ctx_only_on_cuda1()
if __name__ == '__main__':
    # Run the test suite only when this file is executed directly.
    run_tests()