Revert D33834916: Set correct device id on efficientzerotensors

Test Plan: revert-hammer

Differential Revision: D33834916 (a18cfb790d)

Original commit changeset: 11cec343e95e

Original Phabricator Diff: D33834916 (a18cfb790d)

fbshipit-source-id: 3d3f60b760b445383768161b1d21ea4dadbe5d7c
(cherry picked from commit eba41aa646)
Author: Anjali Chourdia
Committed by: PyTorch MergeBot
Date: 2022-01-30 19:45:25 -08:00
Parent: 6208c2800e
Commit: 1e4aefaa2f
6 changed files with 49 additions and 45 deletions

aten/src/ATen/native/TensorFactories.cpp

@@ -427,6 +427,27 @@ Tensor& eye_out_cpu(int64_t n, int64_t m, Tensor& result) {
 namespace {
+// The ZeroTensor allocator ignores whatever allocation is requested and always
+// gives you nullptr
+struct ZeroTensorAllocator final : public at::Allocator {
+  ZeroTensorAllocator(at::Device device) : device_(device) {};
+  ~ZeroTensorAllocator() override = default;
+  static void deleter(void* const pointer) {
+    TORCH_INTERNAL_ASSERT(!pointer);
+  }
+  DataPtr allocate(const size_t nbytes) const override {
+    return {nullptr, nullptr, &deleter, device_};
+  }
+  DeleterFnPtr raw_deleter() const override {
+    return deleter;
+  }
+  at::Device device_;
+};
+
+at::Allocator* GetZeroTensorAllocator(ZeroTensorAllocator& zt) {
+  return &zt;
+}
+
 // Performs dtype inference for full
 TensorOptions infer_full_options(
     const Scalar& fill_value,
@@ -1055,11 +1076,11 @@ Tensor _efficientzerotensor(IntArrayRef size,
     c10::optional<Device> device,
     c10::optional<bool> pin_memory) {
   auto device_ = device_or_default(device);
-  auto allocator = at::native::ZeroTensorAllocator(device_);
+  auto allocator = ZeroTensorAllocator(device_);
   auto dtype_ = dtype_or_default(dtype);
-  auto zero_ks = at::DispatchKeySet(c10::DispatchKey::CPU) | at::DispatchKeySet(c10::DispatchKey::ZeroTensor);
-  auto out = at::detail::empty_generic(size, &allocator, zero_ks, dtype_, c10::nullopt);
-  return out;
+  constexpr auto zero_ks = at::DispatchKeySet(at::DispatchKey::ZeroTensor);
+  return at::detail::empty_generic(
+      size, &allocator, zero_ks, dtype_, c10::nullopt);
 }
 
 Tensor& zeros_out(IntArrayRef size, Tensor& result) {
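
For reference, the restored CPU path above allocates nothing (the ZeroTensorAllocator hands back a null DataPtr) and tags the result with the ZeroTensor dispatch key. Below is a minimal sketch of how this surfaces from Python, assuming a CUDA build of this revision; the calls mirror the test_cuda_device_idx test removed further down, and whether the reported devices agree after this revert is exactly the behavior D33834916 was changing.

import torch

# Hedged sketch: compare the device of an ordinary zeros tensor with the
# storage-less zero tensor produced by the private factory touched in this diff.
if torch.cuda.is_available():
    x = torch.zeros(3, device='cuda')                  # always reports an indexed device, e.g. cuda:0
    y = torch._efficientzerotensor(3, device='cuda')   # with this revert applied, the index may be left unset
    print(x.device, y.device)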

aten/src/ATen/native/TensorFactories.h

@@ -87,23 +87,6 @@ inline void check_supported_max_int_with_precision(int64_t n, const Tensor& tens
   }
 }
-// The ZeroTensor allocator ignores whatever allocation is requested and always
-// gives you nullptr
-struct ZeroTensorAllocator final : public at::Allocator {
-  ZeroTensorAllocator(at::Device device) : device_(device) {};
-  ~ZeroTensorAllocator() override = default;
-  static void deleter(void* const pointer) {
-    TORCH_INTERNAL_ASSERT(!pointer);
-  }
-  DataPtr allocate(const size_t nbytes) const override {
-    return {nullptr, nullptr, &deleter, device_};
-  }
-  DeleterFnPtr raw_deleter() const override {
-    return deleter;
-  }
-  at::Device device_;
-};
 using binary_fn = void (*)(TensorIterator&);
 
 DECLARE_DISPATCH(binary_fn, complex_stub);

aten/src/ATen/native/cuda/TensorFactories.cu

@@ -40,23 +40,6 @@ Tensor empty_cuda(IntArrayRef size, c10::optional<ScalarType> dtype_opt, c10::op
   return at::detail::empty_cuda(size, dtype_opt, layout_opt, device_opt, pin_memory_opt, memory_format_opt);
 }
-Tensor _efficientzerotensor_cuda(IntArrayRef size,
-    c10::optional<ScalarType> dtype,
-    c10::optional<Layout> layout,
-    c10::optional<Device> device,
-    c10::optional<bool> pin_memory) {
-  auto device_ = device_or_default(device);
-  if (!device_.has_index()) {
-    device_.set_index(at::cuda::current_device());
-  }
-  auto allocator = at::native::ZeroTensorAllocator(device_);
-  auto dtype_ = dtype_or_default(dtype);
-  auto zero_ks = at::DispatchKeySet(c10::DispatchKey::CUDA) | at::DispatchKeySet(c10::DispatchKey::ZeroTensor);
-  auto out = at::detail::empty_generic(size, &allocator, zero_ks, dtype_, c10::nullopt);
-  return out;
-}
 Tensor empty_strided_cuda(IntArrayRef size, IntArrayRef stride, c10::optional<ScalarType> dtype_opt, c10::optional<Layout> layout_opt, c10::optional<Device> device_opt, c10::optional<bool> pin_memory_opt) {
   return at::detail::empty_strided_cuda(size, stride, dtype_opt, layout_opt, device_opt, pin_memory_opt);
 }
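
The deleted CUDA overload differs from the CPU path mainly in the device_.set_index(at::cuda::current_device()) call, which pins an unindexed cuda device to the current GPU before building the tensor. The lines below are a rough Python illustration of that normalization step, offered only as a sketch and assuming a CUDA-enabled build; torch.cuda.current_device() stands in for at::cuda::current_device().

import torch

# Resolve a bare 'cuda' device to an explicitly indexed one, mirroring the
# has_index()/set_index() logic in the removed _efficientzerotensor_cuda.
dev = torch.device('cuda')
if dev.index is None and torch.cuda.is_available():
    dev = torch.device('cuda', torch.cuda.current_device())
print(dev)  # e.g. cuda:0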

aten/src/ATen/native/native_functions.yaml

@@ -4807,8 +4807,7 @@
 - func: _efficientzerotensor(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
   dispatch:
-    CPU: _efficientzerotensor
-    CUDA: _efficientzerotensor_cuda
+    CompositeExplicitAutograd: _efficientzerotensor
 
 - func: zeros(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
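
The dispatch change above replaces the separate CPU and CUDA registrations with a single CompositeExplicitAutograd kernel, i.e. one implementation registered for every backend. A tiny hedged example of invoking the operator through the schema declared above, assuming a build with this revert applied:

import torch

# One registered kernel now backs this schema for all backends.
z = torch._efficientzerotensor((2, 3), dtype=torch.float32)
print(z.shape, z.dtype, z.device)  # torch.Size([2, 3]) torch.float32 cpu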

test/test_torch.py

@@ -5477,10 +5477,6 @@ class TestDevicePrecision(TestCase):
         actual = x[..., :1].clamp(lb, ub)
         self.assertEqual(expect, actual)
 
-    def test_cuda_device_idx(self, device):
-        x = torch.zeros(3, device=device)
-        y = torch._efficientzerotensor(3, device=device)
-        self.assertEqual(x.device, y.device)
-
     # we implemented custom deallocation for subclasses, so it behooves
     # us to make sure all of these bits work. We'll use __del__ to

torch/testing/_internal/common_methods_invocations.py

@@ -9069,6 +9069,9 @@ op_db: List[OpInfo] = [
            assert_autodiffed=True,
            rhs_make_tensor_kwargs=dict(exclude_zero=True),
            skips=(
+               # 69913: RuntimeError: CUDA error: an illegal memory access was encountered
+               DecorateInfo(unittest.skip("Skipped!"), 'TestGradients', 'test_fn_fwgrad_bwgrad',
+                            device_type='cuda', dtypes=[torch.double, torch.cdouble]),
                DecorateInfo(unittest.skip("Skipped!"), 'TestGradients', 'test_forward_mode_AD',
                             device_type='cuda', dtypes=[torch.double, torch.cdouble]),
                DecorateInfo(unittest.skip("Skipped!"), 'TestGradients', 'test_inplace_forward_mode_AD',
@@ -9085,6 +9088,9 @@ op_db: List[OpInfo] = [
            assert_autodiffed=True,
            rhs_make_tensor_kwargs=dict(exclude_zero=True),
            skips=(
+               # 69913: RuntimeError: CUDA error: an illegal memory access was encountered
+               DecorateInfo(unittest.skip("Skipped!"), 'TestGradients', 'test_fn_fwgrad_bwgrad',
+                            device_type='cuda', dtypes=[torch.double, torch.cdouble]),
                DecorateInfo(unittest.skip("Skipped!"), 'TestGradients', 'test_forward_mode_AD',
                             device_type='cuda', dtypes=[torch.double, torch.cdouble]),
                DecorateInfo(unittest.skip("Skipped!"), 'TestGradients', 'test_inplace_forward_mode_AD',
@@ -9101,6 +9107,9 @@ op_db: List[OpInfo] = [
            assert_autodiffed=True,
            rhs_make_tensor_kwargs=dict(exclude_zero=True),
            skips=(
+               # 69913: RuntimeError: CUDA error: an illegal memory access was encountered
+               DecorateInfo(unittest.skip("Skipped!"), 'TestGradients', 'test_fn_fwgrad_bwgrad',
+                            device_type='cuda', dtypes=[torch.double, torch.cdouble]),
                DecorateInfo(unittest.skip("Skipped!"), 'TestGradients', 'test_forward_mode_AD',
                             device_type='cuda', dtypes=[torch.double, torch.cdouble]),
                DecorateInfo(unittest.skip("Skipped!"), 'TestGradients', 'test_inplace_forward_mode_AD',
@@ -9681,6 +9690,11 @@ op_db: List[OpInfo] = [
                # RuntimeError:
                # Arguments for call are not valid.
                DecorateInfo(unittest.skip("Skipped!"), 'TestJit', 'test_variant_consistency_jit', dtypes=(torch.float32, torch.complex64)),  # noqa: B950
+               # 69925: RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!
+               DecorateInfo(unittest.expectedFailure, 'TestGradients', 'test_fn_fwgrad_bwgrad', device_type='cuda'),
+               # (ROCm) Memory exception on virtual address 0x7f6f3deb7000, node id 4: Page not present
+               DecorateInfo(unittest.skip("Skipped! ROCm memory exception"), 'TestGradients', 'test_fn_fwgrad_bwgrad',
+                            device_type='cuda', dtypes=[torch.float64, torch.complex128], active_if=TEST_WITH_ROCM),
            ),
            supports_inplace_autograd=False,
            sample_inputs_func=sample_inputs_gradient),
@@ -14018,7 +14032,15 @@ op_db: List[OpInfo] = [
            supports_forward_ad=True,
            supports_fwgrad_bwgrad=True,
            supports_out=False,
-           sample_inputs_func=sample_cumulative_trapezoid,),
+           sample_inputs_func=sample_cumulative_trapezoid,
+           skips=(
+               # Two failures:
+               # 1. (CUDA) RuntimeError: Expected all tensors to be on the same device, but found at
+               #    least two devices, cuda:0 and cpu!
+               # 2. (ROCm) Memory exception on virtual address 0x7f6a2216f000, node id 4: Page not present
+               DecorateInfo(unittest.skip("Skipped! ROCm memory exception"), 'TestGradients',
+                            'test_fn_fwgrad_bwgrad', device_type='cuda'),
+           )),
     OpInfo('unsqueeze',
            dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16),
            supports_out=False,
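
All of the test changes above add the same kind of entry: a DecorateInfo that attaches a unittest decorator to one generated OpInfo test, filtered by test class, test name, device type and dtypes. Below is a standalone hedged sketch of that pattern, using the same arguments as the skips added above and assuming the internal DecorateInfo helper keeps this constructor (it lives in torch.testing._internal.common_methods_invocations in this revision).

import unittest
import torch
from torch.testing._internal.common_methods_invocations import DecorateInfo

# Hypothetical standalone skip entry, shaped like the ones added in this diff:
# disable TestGradients.test_fn_fwgrad_bwgrad for CUDA double/cdouble inputs.
skip_fwgrad_bwgrad = DecorateInfo(
    unittest.skip("Skipped!"), 'TestGradients', 'test_fn_fwgrad_bwgrad',
    device_type='cuda', dtypes=[torch.double, torch.cdouble])

Placed inside an OpInfo's skips tuple, the OpInfo machinery applies the decorator to every matching generated test, which is how a single entry here silences the failing fwgrad_bwgrad variants.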