mirror of
https://github.com/zebrajr/pytorch.git
synced 2025-12-06 12:20:52 +01:00
Summary:
This change supports exporting a CUDA model on a CPU-only machine under fake tensor mode.
Users commonly need to move sample inputs to the CUDA device with a .to("cuda:0") or .to("cuda") call.
This diff supports that.
I expect the following pattern to work:
```
with FakeTensorMode(allow_non_fake_inputs=True):
    cuda_module = module.to("cuda:0")
    cuda_sample_inputs = tuple([x.to("cuda:0") for x in sample_inputs])
    with torch.no_grad():
        ep = torch.export.export(cuda_module, cuda_sample_inputs)
```
Test Plan:
CI
Rollback Plan:
Differential Revision: D80181887
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160532
Approved by: https://github.com/henryoier, https://github.com/ezyang
42 lines
1.3 KiB
C++
42 lines
1.3 KiB
C++
#include <c10/core/impl/DeviceGuardImplInterface.h>
|
|
#include <c10/core/impl/FakeGuardImpl.h>
|
|
#include <array>
|
|
|
|
namespace c10::impl {
|
|
|
|
std::array<
|
|
std::atomic<const DeviceGuardImplInterface*>,
|
|
static_cast<size_t>(DeviceType::COMPILE_TIME_MAX_DEVICE_TYPES)>
|
|
device_guard_impl_registry;
|
|
|
|
DeviceGuardImplRegistrar::DeviceGuardImplRegistrar(
|
|
DeviceType type,
|
|
const DeviceGuardImplInterface* impl) {
|
|
device_guard_impl_registry[static_cast<size_t>(type)].store(impl);
|
|
}
|
|
|
|
namespace {
|
|
thread_local std::unique_ptr<DeviceGuardImplInterface> tls_fake_device_guard =
|
|
nullptr;
|
|
}
|
|
|
|
void ensureCUDADeviceGuardSet() {
|
|
constexpr auto cuda_idx = static_cast<std::size_t>(DeviceType::CUDA);
|
|
|
|
const DeviceGuardImplInterface* p =
|
|
device_guard_impl_registry[cuda_idx].load();
|
|
|
|
// A non-null `ptr` indicates that CUDA is already available.
|
|
if (p == nullptr || (p && p->deviceCount() == 0)) {
|
|
// In following cases, we override CUDA guard interface with a no-op
|
|
// device guard.
|
|
// 1. p == nullptr; Trying to get a cuda device guard on a cpu-only build.
|
|
// 2. p->deviceCount() == 0; cuda build enabled, but no cuda devices
|
|
// available.
|
|
tls_fake_device_guard = std::make_unique<FakeGuardImpl<DeviceType::CUDA>>();
|
|
device_guard_impl_registry[cuda_idx].store(tls_fake_device_guard.get());
|
|
}
|
|
}
|
|
|
|
} // namespace c10::impl
|