diff --git a/aten/src/ATen/EmptyTensor.cpp b/aten/src/ATen/EmptyTensor.cpp
new file mode 100644
index 00000000000..a52ea72d915
--- /dev/null
+++ b/aten/src/ATen/EmptyTensor.cpp
@@ -0,0 +1,91 @@
+#define TORCH_ASSERT_NO_OPERATORS
+#include <ATen/EmptyTensor.h>
+#include <ATen/detail/CUDAHooksInterface.h>
+#include <c10/core/CPUAllocator.h>
+
+namespace at {
+namespace detail {
+
+static c10::Allocator* GetCPUAllocatorMaybePinned(bool pin_memory) {
+  if (pin_memory) {
+    return at::detail::getCUDAHooks().getPinnedMemoryAllocator();
+  }
+  return c10::GetCPUAllocator();
+}
+
+void check_size_nonnegative(IntArrayRef size) {
+  for (auto x: size) {
+    TORCH_CHECK(x >= 0, "Trying to create tensor with negative dimension ", x, ": ", size);
+  }
+}
+
+TensorBase empty_generic(
+    IntArrayRef size,
+    c10::Allocator* allocator,
+    c10::DispatchKeySet ks,
+    ScalarType scalar_type,
+    c10::optional<c10::MemoryFormat> memory_format_opt) {
+  at::detail::check_size_nonnegative(size);
+
+  int64_t nelements = c10::multiply_integers(size);
+  caffe2::TypeMeta dtype = scalarTypeToTypeMeta(scalar_type);
+  int64_t size_bytes = nelements * dtype.itemsize();
+  auto storage_impl = c10::make_intrusive<StorageImpl>(
+      c10::StorageImpl::use_byte_size_t(),
+      size_bytes,
+      allocator->allocate(size_bytes),
+      allocator,
+      /*resizeable=*/true);
+
+  auto tensor = detail::make_tensor_base<TensorImpl>(
+      std::move(storage_impl), ks, dtype);
+  // Default TensorImpl has size [0]
+  if (size.size() != 1 || size[0] != 0) {
+    tensor.unsafeGetTensorImpl()->set_sizes_contiguous(size);
+  }
+
+  if (memory_format_opt.has_value()) {
+    // Restriding a just-created empty contiguous tensor does nothing.
+    if (*memory_format_opt != MemoryFormat::Contiguous) {
+      tensor.unsafeGetTensorImpl()->empty_tensor_restride(*memory_format_opt);
+    }
+  }
+
+  return tensor;
+}
+
+TensorBase empty_cpu(IntArrayRef size, ScalarType dtype, bool pin_memory,
+                     c10::optional<c10::MemoryFormat> memory_format_opt) {
+  auto allocator = GetCPUAllocatorMaybePinned(pin_memory);
+  constexpr c10::DispatchKeySet cpu_ks(c10::DispatchKey::CPU);
+  return empty_generic(size, allocator, cpu_ks, dtype, memory_format_opt);
+}
+
+TensorBase empty_cpu(
+    IntArrayRef size,
+    c10::optional<ScalarType> dtype_opt,
+    c10::optional<Layout> layout_opt,
+    c10::optional<Device> device_opt,
+    c10::optional<bool> pin_memory_opt,
+    c10::optional<c10::MemoryFormat> memory_format_opt) {
+  auto device = device_or_default(device_opt);
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(device.type() == DeviceType::CPU);
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(layout_or_default(layout_opt) == Layout::Strided);
+
+  auto pin_memory = pinned_memory_or_default(pin_memory_opt);
+  auto dtype = dtype_or_default(dtype_opt);
+  return empty_cpu(size, dtype, pin_memory, memory_format_opt);
+}
+
+TensorBase empty_cpu(
+    IntArrayRef size, const TensorOptions &options) {
+  return at::detail::empty_cpu(
+      size,
+      optTypeMetaToScalarType(options.dtype_opt()),
+      options.layout_opt(),
+      options.device_opt(),
+      options.pinned_memory_opt(),
+      options.memory_format_opt());
+}
+
+}} // namespace at::detail
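For orientation, here is a minimal usage sketch of the relocated factories (not part of the patch; the helper names are hypothetical and the includes assume a checkout containing this change):

```cpp
// Hypothetical callers of the new TensorBase-only factories. EmptyTensor.h
// depends only on TensorBase, so these compile in translation units built
// with TORCH_ASSERT_NO_OPERATORS.
#include <ATen/EmptyTensor.h>
#include <c10/core/CPUAllocator.h>

// Convenience overload: pin_memory defaults to false and the memory
// format to c10::nullopt.
at::TensorBase cpu_scratch(c10::IntArrayRef sizes) {
  return at::detail::empty_cpu(sizes, c10::kFloat);
}

// The generic core: the allocator and dispatch keys are explicit, which is
// what lets the Meta and ZeroTensor hunks further down reuse it unchanged.
at::TensorBase generic_scratch(c10::IntArrayRef sizes) {
  constexpr c10::DispatchKeySet cpu_ks(c10::DispatchKey::CPU);
  return at::detail::empty_generic(
      sizes, c10::GetCPUAllocator(), cpu_ks, c10::kFloat, c10::nullopt);
}
```

Note that `empty_generic` now takes a full `c10::DispatchKeySet` rather than a single `DispatchKey`; the later hunks migrate each call site accordingly.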
diff --git a/aten/src/ATen/EmptyTensor.h b/aten/src/ATen/EmptyTensor.h
new file mode 100644
index 00000000000..0f2bc0c63ea
--- /dev/null
+++ b/aten/src/ATen/EmptyTensor.h
@@ -0,0 +1,34 @@
+#pragma once
+#include <ATen/core/TensorBase.h>
+
+namespace at {
+namespace detail {
+
+TORCH_API void check_size_nonnegative(IntArrayRef size);
+
+TORCH_API TensorBase empty_generic(
+    IntArrayRef size,
+    c10::Allocator* allocator,
+    c10::DispatchKeySet ks,
+    ScalarType scalar_type,
+    c10::optional<c10::MemoryFormat> memory_format_opt);
+
+TORCH_API TensorBase empty_cpu(
+    IntArrayRef size,
+    ScalarType dtype,
+    bool pin_memory=false,
+    c10::optional<c10::MemoryFormat> memory_format_opt=c10::nullopt);
+
+TORCH_API TensorBase empty_cpu(
+    IntArrayRef size,
+    c10::optional<ScalarType> dtype_opt,
+    c10::optional<Layout> layout_opt,
+    c10::optional<Device> device_opt,
+    c10::optional<bool> pin_memory_opt,
+    c10::optional<c10::MemoryFormat> memory_format_opt);
+
+TORCH_API TensorBase empty_cpu(
+    IntArrayRef size,
+    const TensorOptions &options);
+
+}} // namespace at::detail
diff --git a/aten/src/ATen/ScalarOps.cpp b/aten/src/ATen/ScalarOps.cpp
index 09d5034c4a2..8eb10266d78 100644
--- a/aten/src/ATen/ScalarOps.cpp
+++ b/aten/src/ATen/ScalarOps.cpp
@@ -1,14 +1,7 @@
-// FastPass
-#ifdef _MSC_VER
-#ifndef _USE_MATH_DEFINES
-#define _USE_MATH_DEFINES
-#endif
-#include <math.h>
-#endif
-
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#include <ATen/ScalarOps.h>
+#include <ATen/EmptyTensor.h>
 #include <ATen/Dispatch.h>
-#include <ATen/NativeFunctions.h>
-#include <ATen/ScalarOps.h>
 
 namespace at {
 namespace {
@@ -32,7 +25,8 @@ Tensor& scalar_fill(Tensor& self, const Scalar& value) {
 Tensor scalar_tensor_static(const Scalar& s, c10::optional<ScalarType> dtype_opt, c10::optional<Device> device_opt) {
   at::tracer::impl::NoTracerDispatchMode tracer_guard;
   at::AutoDispatchBelowAutograd mode;
-  auto result = at::detail::empty_cpu({}, dtype_opt, c10::nullopt, device_opt, c10::nullopt, c10::nullopt);
+  Tensor result = at::detail::empty_cpu(
+      {}, dtype_opt, c10::nullopt, device_opt, c10::nullopt, c10::nullopt);
   scalar_fill(result, s);
   return result;
 }
diff --git a/aten/src/ATen/Utils.cpp b/aten/src/ATen/Utils.cpp
index a6540f7a5b6..a0fbc499378 100644
--- a/aten/src/ATen/Utils.cpp
+++ b/aten/src/ATen/Utils.cpp
@@ -22,72 +22,6 @@ int _crash_if_asan(int arg) {
 }
 
 namespace detail {
-// empty_cpu is used in ScalarOps.h, which can be referenced by other ATen
-// files. Since we want to decouple direct referencing native symbols and only
-// access native symbols through dispatching, we move its implementation here.
-Tensor empty_cpu(
-    IntArrayRef size,
-    c10::optional<ScalarType> dtype_opt,
-    c10::optional<Layout> layout_opt,
-    c10::optional<Device> device_opt,
-    c10::optional<bool> pin_memory_opt,
-    c10::optional<c10::MemoryFormat> memory_format_opt) {
-
-  auto device = device_or_default(device_opt);
-  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(device.type() == DeviceType::CPU);
-  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(layout_or_default(layout_opt) == Layout::Strided);
-
-  bool pin_memory = pinned_memory_or_default(pin_memory_opt);
-  // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-  c10::Allocator* allocator;
-  if (pin_memory) {
-    allocator = detail::getCUDAHooks().getPinnedMemoryAllocator();
-  } else {
-    allocator = at::getCPUAllocator();
-  }
-  auto dtype = dtype_or_default(dtype_opt);
-
-  return empty_generic(size, allocator, at::DispatchKey::CPU, dtype, memory_format_opt);
-}
-
-Tensor empty_generic(
-    IntArrayRef size,
-    c10::Allocator* allocator,
-    // technically this can be inferred from the device, but usually the
-    // correct setting is obvious from the call site so just make callers
-    // pass it in
-    c10::DispatchKey dispatch_key,
-    ScalarType scalar_type,
-    c10::optional<c10::MemoryFormat> memory_format_opt) {
-
-  check_size_nonnegative(size);
-
-  int64_t nelements = c10::multiply_integers(size);
-  caffe2::TypeMeta dtype = scalarTypeToTypeMeta(scalar_type);
-  int64_t size_bytes = nelements * dtype.itemsize();
-  auto storage_impl = c10::make_intrusive<StorageImpl>(
-      c10::StorageImpl::use_byte_size_t(),
-      size_bytes,
-      allocator->allocate(size_bytes),
-      allocator,
-      /*resizeable=*/true);
-
-  auto tensor = detail::make_tensor<TensorImpl>(
-      std::move(storage_impl), dispatch_key, dtype);
-  // Default TensorImpl has size [0]
-  if (size.size() != 1 || size[0] != 0) {
-    tensor.unsafeGetTensorImpl()->set_sizes_contiguous(size);
-  }
-
-  if (memory_format_opt.has_value()) {
-    // Restriding a just-created empty contiguous tensor does nothing.
-    if (*memory_format_opt != MemoryFormat::Contiguous) {
-      tensor.unsafeGetTensorImpl()->empty_tensor_restride(*memory_format_opt);
-    }
-  }
-
-  return tensor;
-}
 
 template <typename T>
 Tensor tensor_cpu(ArrayRef<T> values, const TensorOptions& options) {
diff --git a/aten/src/ATen/Utils.h b/aten/src/ATen/Utils.h
index 0143e0c49b4..9160cbe2fed 100644
--- a/aten/src/ATen/Utils.h
+++ b/aten/src/ATen/Utils.h
@@ -2,6 +2,7 @@
 
 #include
 #include
+#include <ATen/EmptyTensor.h>
 #include
 #include
 #include
@@ -113,26 +114,9 @@ static inline T* get_generator_or_default(const c10::optional<Generator>& gen, const Generator& default_gen) {
   return gen.has_value() && gen->defined() ? check_generator<T>(gen) : check_generator<T>(default_gen);
 }
 
-inline void check_size_nonnegative(IntArrayRef size) {
-  for (auto x: size) {
-    TORCH_CHECK(x >= 0, "Trying to create tensor with negative dimension ", x, ": ", size);
-  }
-}
+using at::detail::check_size_nonnegative;
 
 namespace detail {
-TORCH_API
-Tensor empty_cpu(IntArrayRef size, c10::optional<ScalarType> dtype_opt, c10::optional<Layout> layout_opt,
-                 c10::optional<Device> device_opt, c10::optional<bool> pin_memory_opt, c10::optional<c10::MemoryFormat> memory_format_opt);
-
-TORCH_API
-Tensor empty_generic(
-    IntArrayRef size,
-    c10::Allocator* allocator,
-    c10::DispatchKey dispatch_key,
-    ScalarType dtype,
-    c10::optional<c10::MemoryFormat> memory_format
-);
-
 template <typename T>
 TORCH_API
diff --git a/aten/src/ATen/core/ivalue.h b/aten/src/ATen/core/ivalue.h
index d22ad5a1035..f6c9901d21c 100644
--- a/aten/src/ATen/core/ivalue.h
+++ b/aten/src/ATen/core/ivalue.h
@@ -386,7 +386,7 @@ public:
 
   // While some of these accessors could be generated through templates,
   // we prefer to write them manually for clarity
-  IValue(at::Tensor t) : tag(Tag::Tensor), is_intrusive_ptr(false) {
+  IValue(at::TensorBase t) : tag(Tag::Tensor), is_intrusive_ptr(false) {
     new (&payload.as_tensor) at::Tensor(std::move(t));
   }
   bool isTensor() const {
diff --git a/aten/src/ATen/native/MetaTensor.cpp b/aten/src/ATen/native/MetaTensor.cpp
index 224fc2ec80d..58e58044fe7 100644
--- a/aten/src/ATen/native/MetaTensor.cpp
+++ b/aten/src/ATen/native/MetaTensor.cpp
@@ -47,8 +47,9 @@ Tensor empty_meta(
 
   auto* allocator = GetMetaAllocator();
   auto dtype = dtype_or_default(dtype_opt);
+  constexpr c10::DispatchKeySet meta_ks(c10::DispatchKey::Meta);
   return at::detail::empty_generic(
-      size, allocator, at::DispatchKey::Meta, dtype, memory_format_opt);
+      size, allocator, meta_ks, dtype, memory_format_opt);
 }
 
 Tensor empty_strided_meta(
diff --git a/aten/src/ATen/native/TensorFactories.cpp b/aten/src/ATen/native/TensorFactories.cpp
index 5e0198931d5..9a360b2179e 100644
--- a/aten/src/ATen/native/TensorFactories.cpp
+++ b/aten/src/ATen/native/TensorFactories.cpp
@@ -1,7 +1,7 @@
 #include
 #include
-#include
 #include
+#include <ATen/EmptyTensor.h>
 #include
 #include
 #include
@@ -1081,8 +1081,9 @@ Tensor _efficientzerotensor(IntArrayRef size,
   auto device_ = device_or_default(device);
   auto allocator = ZeroTensorAllocator(device_);
   auto dtype_ = dtype_or_default(dtype);
+  constexpr auto zero_ks = at::DispatchKeySet(at::DispatchKey::ZeroTensor);
   return at::detail::empty_generic(
-      size, &allocator, at::DispatchKey::ZeroTensor, dtype_, c10::nullopt);
+      size, &allocator, zero_ks, dtype_, c10::nullopt);
 }
 
 Tensor& zeros_out(IntArrayRef size, Tensor& result) {
diff --git a/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp b/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp
index e52ec7609cb..224a66f8abf 100644
--- a/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp
+++ b/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp
@@ -284,7 +284,7 @@ Tensor& qembeddingbag_byte_prepack_out(Tensor& output, const Tensor& weight) {
 
 Tensor qembeddingbag_byte_prepack(const Tensor& weight) {
   const auto weight_contig = weight.expect_contiguous(weight.suggest_memory_format());
-  auto output = at::detail::empty_cpu(
+  Tensor output = at::detail::empty_cpu(
       {0},
       at::kByte,
       weight_contig->layout(),
diff --git a/test/cpp/jit/test_backend_compiler_lib.cpp b/test/cpp/jit/test_backend_compiler_lib.cpp
index ec756ba2c11..0db8bd428e9 100644
--- a/test/cpp/jit/test_backend_compiler_lib.cpp
+++ b/test/cpp/jit/test_backend_compiler_lib.cpp
@@ -125,8 +125,7 @@ class BackendWithCompiler : public PyTorchBackendInterface {
             (x.scalar_type() == c10::ScalarType::Float &&
              h.scalar_type() == c10::ScalarType::Float),
             "Only float tensors are compatible for add and sub.");
-        auto y = at::detail::empty_cpu(
-            x.sizes(), c10::ScalarType::Float, {}, {}, {}, c10::nullopt);
+        at::Tensor y = at::detail::empty_cpu(x.sizes(), at::kFloat);
         auto x_ptr = float_data_ptr(x);
         auto h_ptr = float_data_ptr(h);
         auto y_ptr = float_data_ptr(y);
diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl
index b6cc8d488c4..4d9aee2de2e 100644
--- a/tools/build_variables.bzl
+++ b/tools/build_variables.bzl
@@ -932,6 +932,7 @@ aten_cpu_source_non_codegen_list = [
     "aten/src/ATen/CPUGeneratorImpl.cpp",
     "aten/src/ATen/Context.cpp",
     "aten/src/ATen/DLConvertor.cpp",
+    "aten/src/ATen/EmptyTensor.cpp",
     "aten/src/ATen/ExpandUtils.cpp",
     "aten/src/ATen/FunctionalInverses.cpp",
     "aten/src/ATen/FunctionalStorageImpl.cpp",
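Taken together with the `IValue(at::TensorBase)` change, call sites can now allocate and box a buffer without the old six-argument form. A minimal sketch (hypothetical helper, assuming this patch is in the tree):

```cpp
// Hypothetical helper combining the simplified two-argument empty_cpu call
// from the test hunk with the widened IValue constructor: empty_cpu returns
// a TensorBase, and IValue now accepts at::TensorBase directly.
#include <ATen/EmptyTensor.h>
#include <ATen/core/ivalue.h>
#include <utility>

c10::IValue boxed_float_buffer(at::IntArrayRef sizes) {
  at::TensorBase y = at::detail::empty_cpu(sizes, at::kFloat);
  return c10::IValue(std::move(y));  // uses the new IValue(at::TensorBase) ctor
}
```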