Move THPStorage definitions out of torch/csrc/generic (#78032)

Fixes #77908

Pull Request resolved: https://github.com/pytorch/pytorch/pull/78032
Approved by: https://github.com/ezyang
This commit is contained in:
Kurt Mohler 2022-06-01 19:00:58 +00:00 committed by PyTorch MergeBot
parent 6a4997e66a
commit 272193d026
15 changed files with 726 additions and 708 deletions

View File

@ -843,6 +843,8 @@ libtorch_python_core_sources = [
"torch/csrc/python_dimname.cpp",
"torch/csrc/Size.cpp",
"torch/csrc/Storage.cpp",
"torch/csrc/StorageMethods.cpp",
"torch/csrc/StorageSharing.cpp",
"torch/csrc/Stream.cpp",
"torch/csrc/TypeInfo.cpp",
"torch/csrc/api/src/python/init.cpp",

View File

@ -864,7 +864,7 @@ class ThroughputBenchmark(object):
def run_once(self, *args: Any, **kwargs: Any) -> Any: ...
def benchmark(self, config: BenchmarkConfig) -> BenchmarkExecutionStats: ...
# Defined in torch/csrc/generic/Storage.cpp
# Defined in torch/csrc/Storage.cpp
${legacy_storage_base_hints}
# TODO: where

View File

@ -11,14 +11,12 @@
#include <torch/csrc/CudaIPCTypes.h>
#include <torch/csrc/Device.h>
#include <torch/csrc/autograd/utils/wrap_outputs.h>
#include <torch/csrc/utils/python_arg_parser.h>
#include <torch/csrc/StorageMethods.h>
#include <torch/csrc/StorageSharing.h>
#include <c10/core/CPUAllocator.h>
#include <fmt/format.h>
// NOLINTNEXTLINE(bugprone-suspicious-include)
#include <torch/csrc/generic/Storage.cpp>
#include <torch/csrc/THGenerateByteType.h>
#include <c10/util/intrusive_ptr.h>
template<>
@ -27,3 +25,358 @@ void THPPointer<c10::StorageImpl>::free() {
c10::raw::intrusive_ptr::decref(ptr);
}
}
// Python class object for torch._UntypedStorage; filled in by
// THPStorage_postInit once the Python side has defined the class.
PyObject *THPStorageClass = nullptr;

// Wraps a StorageImpl in a new _UntypedStorage Python object.
// Takes ownership of `ptr` (must be non-null): on successful allocation the
// reference is transferred into the wrapper's `cdata` field and dropped later
// by THPStorage_dealloc; if tp_alloc fails, the reference is released when
// `ptr` goes out of scope. Returns nullptr (with a Python exception set by
// tp_alloc) on allocation failure.
PyObject * THPStorage_New(c10::intrusive_ptr<c10::StorageImpl> ptr)
{
  AT_ASSERT(ptr);
  PyTypeObject *type = (PyTypeObject *)THPStorageClass;
  PyObject *obj = type->tp_alloc(type, 0);
  if (obj) {
    // Transfer the refcount held by the intrusive_ptr to the Python object.
    ((THPStorage *)obj)->cdata = ptr.release();
  }
  return obj;
}
// tp_dealloc: drops the wrapper's reference to the underlying StorageImpl
// (if one was ever attached) and then frees the Python object itself.
static void THPStorage_dealloc(THPStorage* self)
{
  if (self->cdata) {
    c10::raw::intrusive_ptr::decref(self->cdata);
  }
  Py_TYPE(self)->tp_free((PyObject*)self);
}
// tp_new for torch._UntypedStorage. Three call forms are accepted:
//   (*, allocator=None, device=None)           -> empty (0-byte) storage
//   (size, *, allocator=None, device=None)     -> storage of `size` bytes
//   (sequence, *, allocator=None, device=None) -> storage initialized from a
//                                                 Python sequence of byte
//                                                 values
// Only one (or neither) of `allocator`/`device` may be passed. Returns the
// new storage object, or nullptr with a Python exception set on error.
static PyObject * THPStorage_pynew(PyTypeObject *type, PyObject *args, PyObject *kwargs)
{
  HANDLE_TH_ERRORS
  static torch::PythonArgParser parser({
      THPStorageStr "(*, int64_t allocator=None, Device device=None)",
      THPStorageStr "(int64_t size, *, int64_t allocator=None, Device device=None)",
      THPStorageStr "(PyObject* sequence, *, int64_t allocator=None, Device device=None)",
  });
  torch::ParsedArgs<3> parsed_args;
  auto r = parser.parse(args, kwargs, parsed_args);

  // Overloads with a leading positional argument shift the keyword-only
  // allocator/device slots by one.
  int64_t allocator_arg_idx = 0;
  int64_t device_arg_idx = 1;

  if (r.idx > 0) {
    allocator_arg_idx = 1;
    device_arg_idx = 2;
  }

  c10::optional<int64_t> allocator_opt = r.toInt64Optional(allocator_arg_idx);
  c10::optional<at::Device> device_opt = r.deviceOptional(device_arg_idx);

  TORCH_CHECK(!allocator_opt.has_value() || !device_opt.has_value(),
      THPStorageStr, "(): only one or neither of 'allocator' or 'device' can ",
      "be given, but not both");

  THPStoragePtr self((THPStorage *)type->tp_alloc(type, 0));
  THPUtils_assert(self, "failed to allocate a " THPStorageStr " object");
  c10::Allocator* allocator = nullptr;
  at::OptionalDeviceGuard device_guard;

  if (allocator_opt.has_value()) {
    // NOTE(review): the Python-side `allocator` argument is an integer that
    // is reinterpreted as a c10::Allocator* — the caller is trusted to pass
    // a valid pointer value.
    allocator = reinterpret_cast<c10::Allocator*>(allocator_opt.value());
  } else if (device_opt.has_value()) {
    at::Device device = device_opt.value();
    if (device.type() == at::kCPU) {
      allocator = c10::GetDefaultCPUAllocator();
#ifdef USE_CUDA
    } else if (device.type() == at::kCUDA) {
      at::globalContext().lazyInitCUDA();
      allocator = c10::cuda::CUDACachingAllocator::get();
#endif
    } else if (device.type() == at::DeviceType::Meta) {
      allocator = c10::GetAllocator(device.type());
    } else {
      TORCH_CHECK(false,
          THPStorageStr, "(): Storage device not recognized: ", device.type());
    }
    device_guard.reset_device(device);
  } else {
    // Neither allocator nor device given: default to CPU.
    allocator = c10::GetDefaultCPUAllocator();
  }

  // torch.Storage(*, ...)
  if (r.idx == 0) {
    self->cdata = c10::make_intrusive<at::StorageImpl>(
        c10::StorageImpl::use_byte_size_t(),
        0,
        allocator,
        /*resizable=*/true).release();
    return (PyObject*)self.release();

  // torch.Storage(size, *, ...)
  } else if (r.idx == 1) {
    int64_t size = r.toInt64(0);
    self->cdata = c10::make_intrusive<at::StorageImpl>(
        c10::StorageImpl::use_byte_size_t(),
        size,
        allocator,
        /*resizable=*/true).release();
    return (PyObject*)self.release();

  // torch.Storage(sequence, *, ...)
  } else if (r.idx == 2) {
    PyObject *sequence = r.pyobject(0);
    // NOTE(review): PySequence_Length is called before PySequence_Check; on a
    // non-sequence it returns -1 (with a TypeError set) and the checks below
    // then report the failure — confirm this ordering is intentional.
    Py_ssize_t length = PySequence_Length(sequence);
    TORCH_CHECK(PySequence_Check(sequence),
        THPStorageStr, "(): Expected a sequence type, but got ",
        THPUtils_typename(sequence));
    TORCH_CHECK(length >= 0,
        THPStorageStr, "(): Could not obtain the length of sequence of type ",
        THPUtils_typename(sequence));
    self->cdata = c10::make_intrusive<at::StorageImpl>(
        c10::StorageImpl::use_byte_size_t(),
        length,
        allocator,
        /*resizable=*/true)
        .release();
    THPObjectPtr item;
    try {
      for (Py_ssize_t i = 0; i < length; i++) {
        item = PySequence_GetItem(sequence, i);
        // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
        uint8_t value = THPByteUtils_unpackReal(item.get());
        if (allocator == c10::GetDefaultCPUAllocator()) {
          // Fast path: write straight into default-allocated CPU memory.
          self->cdata->unsafe_data<uint8_t>()[i] = value;
        } else {
          // TODO: this might be slow - consider batched updates?
          storage_set(
              at::unsafeStorageFromTH(self->cdata, /*retain=*/true),
              i,
              value);
        }
      }
    } catch (const std::exception &e) {
      // An item failed byte conversion; report and abandon the half-built
      // storage (freed via the THPStoragePtr RAII holder).
      THPUtils_setError(THPStorageStr
          "(): tried to construct a storage from a sequence (%s), "
          "but one of the items was of type %s instead of %s",
          THPUtils_typename(sequence),
          THPUtils_typename(item.get()),
          THPUtils_typeTraits<uint8_t>::python_type_str);
      return nullptr;
    }
    return (PyObject*)self.release();
  }
  Py_RETURN_NONE;
  END_HANDLE_TH_ERRORS
}
// Mapping-protocol length: number of uint8 elements (i.e. bytes) held by
// the underlying StorageImpl.
static Py_ssize_t THPStorage_length(THPStorage *self)
{
  HANDLE_TH_ERRORS
  const auto total_bytes = self->cdata->nbytes();
  return static_cast<Py_ssize_t>(total_bytes / sizeof(uint8_t));
  END_HANDLE_TH_ERRORS_RET(-1)
}
// Implements storage[index] for integer and slice indices.
//  - int: bounds-checked (with negative-index wraparound) single-byte read,
//    returned as a Python int.
//  - slice (step 1 only): returns a NEW storage that aliases the same
//    underlying memory — no copy is made.
// Returns nullptr with a Python exception set on error.
static PyObject * THPStorage_get(THPStorage *self, PyObject *index)
{
  HANDLE_TH_ERRORS
  /* Integer index */
  if (THPUtils_checkLong(index)) {
    int64_t nindex = THPUtils_unpackLong(index);
    if (nindex < 0)
      nindex += (self->cdata->nbytes() / sizeof(uint8_t));
    if (nindex < 0 || nindex >= static_cast<int64_t>(self->cdata->nbytes() / sizeof(uint8_t))) {
      PyErr_SetString(PyExc_IndexError, fmt::format(
          "index {} out of range for storage of size {}",
          nindex, self->cdata->nbytes() / sizeof(uint8_t)));
      return nullptr;
    }
    uint8_t value = storage_get(at::unsafeStorageFromTH(self->cdata, /*retain=*/true), nindex);
    return THPByteUtils_newReal(value);
  /* Slice index */
  } else if (PySlice_Check(index)) {
    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
    Py_ssize_t start, stop, slicelength, step;
    int64_t len = self->cdata->nbytes() / sizeof(uint8_t);
    if (!THPUtils_parseSlice(index, len, &start, &stop, &step, &slicelength))
      return nullptr;
    if (step != 1) {
      THPUtils_setError("Trying to slice with a step of %lld, but only a step of "
          "1 is supported", (long long)step);
      return nullptr;
    }

    uint8_t *data = self->cdata->data<uint8_t>();

    // Keep the parent storage alive for as long as the slice exists: the
    // slice's DataPtr carries an extra refcount on `old_storage`, released
    // by the deleter lambda when the slice storage is destroyed.
    at::StorageImpl* old_storage = self->cdata;
    c10::raw::intrusive_ptr::incref(old_storage);
    auto new_storage = c10::make_intrusive<at::StorageImpl>(
        c10::StorageImpl::use_byte_size_t(),
#ifdef THQUANTIZED
        slicelength * sizeof(quantized_t),
#else
        slicelength * sizeof(uint8_t),
#endif
        at::DataPtr(
            static_cast<void*>(data + start),
            old_storage,
            [](void* s) {
              c10::raw::intrusive_ptr::decref(static_cast<at::StorageImpl*>(s));
            },
            old_storage->device()),
        old_storage->allocator(),
        /* resizable */ false);

    PyObject *_ret = THPStorage_New(std::move(new_storage));
    return _ret;
  }
  PyErr_Format(PyExc_TypeError, "can't index a " THPStorageStr " with %s",
      THPUtils_typename(index));
  return nullptr;
  END_HANDLE_TH_ERRORS
}
// Implements storage[index] = value and storage[slice] = value.
// `value` must be convertible to uint8_t; slices must have step 1.
// Returns 0 on success, -1 on error with a Python exception set, per the
// CPython mp_ass_subscript contract.
static int THPStorage_set(THPStorage *self, PyObject *index, PyObject *value)
{
  HANDLE_TH_ERRORS
  if (!THPByteUtils_checkReal(value)) {
    THPUtils_setError("can only set storage content with a %s, but got "
        "%s instead", THPUtils_typeTraits<uint8_t>::python_type_str,
        THPUtils_typename(value));
    return -1;
  }

  uint8_t rvalue = THPByteUtils_unpackReal(value);
  if (THPUtils_checkLong(index)) {
    int64_t nindex = THPUtils_unpackLong(index);
    storage_set(
        at::unsafeStorageFromTH(self->cdata, /*retain=*/true),
        nindex,
        rvalue);
    return 0;
  } else if (PySlice_Check(index)) {
    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
    Py_ssize_t start, stop, slicelength, step;
    int64_t len = self->cdata->nbytes() / sizeof(uint8_t);
    if (!THPUtils_parseSlice(index, len, &start, &stop, &step, &slicelength))
      return -1;
    if (step != 1) {
      THPUtils_setError("Trying to slice with a step of %lld, but only a step of "
          "1 is supported", (long long)step);
      // BUGFIX: this previously returned 0 (success) despite having set a
      // Python exception; mp_ass_subscript must return -1 on error, and
      // returning success with an exception pending raises SystemError.
      return -1;
    }
    // TODO: check the bounds only once
    // TODO: fill?
    for (;start < stop; start++)
      storage_set(
          at::unsafeStorageFromTH(self->cdata, /*retain=*/true),
          start,
          rvalue);
    return 0;
  }
  THPUtils_setError("can't index a " THPStorageStr " with %s",
      THPUtils_typename(index));
  return -1;
  END_HANDLE_TH_ERRORS_RET(-1)
}
// Mapping protocol table: len(storage), storage[i] / storage[slice] reads,
// and storage[i] = v / storage[slice] = v writes.
static PyMappingMethods THPStorage_mappingmethods = {
  (lenfunc)THPStorage_length,
  (binaryfunc)THPStorage_get,
  (objobjargproc)THPStorage_set
};
// TODO: implement equality
// Python type object for torch._C.StorageBase. tp_methods / tp_members /
// tp_getset are filled in later by THPStorage_init; instances are created
// via tp_new (THPStorage_pynew) or directly through THPStorage_New.
PyTypeObject THPStorageType = {
  PyVarObject_HEAD_INIT(nullptr, 0)
  "torch._C." THPStorageBaseStr,               /* tp_name */
  sizeof(THPStorage),                          /* tp_basicsize */
  0,                                           /* tp_itemsize */
  (destructor)THPStorage_dealloc,              /* tp_dealloc */
  0,                                           /* tp_vectorcall_offset */
  nullptr,                                     /* tp_getattr */
  nullptr,                                     /* tp_setattr */
  nullptr,                                     /* tp_reserved */
  nullptr,                                     /* tp_repr */
  nullptr,                                     /* tp_as_number */
  nullptr,                                     /* tp_as_sequence */
  &THPStorage_mappingmethods,                  /* tp_as_mapping */
  nullptr,                                     /* tp_hash */
  nullptr,                                     /* tp_call */
  nullptr,                                     /* tp_str */
  nullptr,                                     /* tp_getattro */
  nullptr,                                     /* tp_setattro */
  nullptr,                                     /* tp_as_buffer */
  Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE,    /* tp_flags */
  nullptr,                                     /* tp_doc */
  nullptr,                                     /* tp_traverse */
  nullptr,                                     /* tp_clear */
  nullptr,                                     /* tp_richcompare */
  0,                                           /* tp_weaklistoffset */
  nullptr,                                     /* tp_iter */
  nullptr,                                     /* tp_iternext */
  nullptr,   /* will be assigned in init */    /* tp_methods */
  nullptr,   /* will be assigned in init */    /* tp_members */
  nullptr,                                     /* tp_getset */
  nullptr,                                     /* tp_base */
  nullptr,                                     /* tp_dict */
  nullptr,                                     /* tp_descr_get */
  nullptr,                                     /* tp_descr_set */
  0,                                           /* tp_dictoffset */
  nullptr,                                     /* tp_init */
  nullptr,                                     /* tp_alloc */
  THPStorage_pynew,                            /* tp_new */
};
// NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays,cppcoreguidelines-avoid-non-const-global-variables)
// NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays,cppcoreguidelines-avoid-non-const-global-variables)
static struct PyMemberDef THPStorage_members[] = {
  // `_cdata` exposes the raw StorageImpl pointer value as a read-only
  // unsigned integer on the Python side.
  {(char*)"_cdata", T_ULONGLONG, offsetof(THPStorage, cdata), READONLY, nullptr},
  {nullptr}
};
// Getter for the `device` property: wraps the StorageImpl's device in a
// Python torch.device object.
static PyObject * THPStorage_device(THPStorage* self, void *unused) {
  HANDLE_TH_ERRORS
  const auto storage_device = self->cdata->device();
  return THPDevice_New(storage_device);
  END_HANDLE_TH_ERRORS
}
// Getter returning the element dtype of this storage: uint8 (or the
// quantized element type when built with THQUANTIZED), since this storage
// is byte-addressed.
static PyObject * THPStorage_dtype(THPStorage *self, void *unused)
{
  HANDLE_TH_ERRORS
  return torch::autograd::utils::wrap(
      torch::getTHPDtype(at::typeMetaToScalarType(
#ifdef THQUANTIZED
          caffe2::TypeMeta::Make<quantized_t>()
#else
          caffe2::TypeMeta::Make<uint8_t>()
#endif
      )));
  END_HANDLE_TH_ERRORS
}
typedef PyObject *(*getter)(PyObject *, void *);
// NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays,cppcoreguidelines-avoid-non-const-global-variables)
// NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays,cppcoreguidelines-avoid-non-const-global-variables)
static struct PyGetSetDef THPStorage_properties[] = {
  {"device", (getter)THPStorage_device, nullptr, nullptr, nullptr},
  // NOTE(review): THPStorage_dtype is defined above but has no entry here —
  // confirm whether a "dtype" getset entry was intentionally omitted (e.g.
  // because dtype is provided by the Python-side wrapper instead).
  {nullptr}
};
// Registers the StorageBase type on `module`. Returns true on success,
// false (with a Python exception set by PyType_Ready) on failure.
bool THPStorage_init(PyObject *module)
{
  // Function-local static so the PyMethodDef array outlives the type
  // object, which keeps a raw pointer into it via tp_methods.
  static std::vector<PyMethodDef> methods;
  THPUtils_addPyMethodDefs(methods, THPStorage_getMethods());
  THPUtils_addPyMethodDefs(methods, THPStorage_getSharingMethods());
  THPStorageType.tp_methods = methods.data();
  THPStorageType.tp_members = THPStorage_members;
  THPStorageType.tp_getset = THPStorage_properties;
  if (PyType_Ready(&THPStorageType) < 0)
    return false;
  Py_INCREF(&THPStorageType);
  // PyModule_AddObject steals the reference on success; its return value is
  // not checked here (a failure would leak the INCREF above).
  PyModule_AddObject(module, THPStorageBaseStr, (PyObject *)&THPStorageType);
  return true;
}
// Caches the Python-defined _UntypedStorage class from `module` into the
// THPStorageClass global so THPStorage_New can allocate instances of it.
// Throws python_error (with the Python exception set) if the attribute is
// missing.
void THPStorage_postInit(PyObject *module)
{
  PyObject *storage_class = PyObject_GetAttrString(module, "_UntypedStorage");
  if (storage_class == nullptr) {
    throw python_error();
  }
  THPStorageClass = storage_class;
}

View File

@ -1,18 +1,22 @@
#ifndef THP_STORAGE_INC
#define THP_STORAGE_INC
#include <torch/csrc/THConcat.h>
#include <torch/csrc/Types.h>
#define THPStorageStr "torch._UntypedStorage"
#define THPStorage_(NAME) TH_CONCAT_2(THPStorage_,NAME)
#define THPStorage_Check(obj) \
PyObject_IsInstance(obj, THPStorageClass)
#define THPStorage_CData(obj) (obj)->cdata
#define THPStorageBaseStr "StorageBase"
#include <torch/csrc/generic/Storage.h>
#include <torch/csrc/THGenerateByteType.h>
// Python object layout for storage wrappers: the standard PyObject header
// plus a raw owning pointer to the underlying c10::StorageImpl, whose
// refcount is managed manually via c10::raw::intrusive_ptr incref/decref.
struct THPStorage {
  PyObject_HEAD
  c10::StorageImpl *cdata;
};
TORCH_PYTHON_API PyObject * THPStorage_New(c10::intrusive_ptr<c10::StorageImpl> ptr);
extern PyObject *THPStorageClass;
bool THPStorage_init(PyObject *module);
void THPStorage_postInit(PyObject *module);
extern PyTypeObject THPStorageType;
#endif

View File

@ -1,5 +0,0 @@
#pragma once
struct THPStorage {
PyObject_HEAD
c10::StorageImpl *cdata;
};

View File

@ -1,3 +1,24 @@
#include <torch/csrc/python_headers.h>
#ifdef _MSC_VER
#include <c10/util/win32-headers.h>
#endif
#include <structmember.h>
#include <libshm.h>
#include <torch/csrc/THP.h>
#include <torch/csrc/copy_utils.h>
#include <torch/csrc/DynamicTypes.h>
#include <torch/csrc/CudaIPCTypes.h>
#include <torch/csrc/Device.h>
#include <torch/csrc/autograd/utils/wrap_outputs.h>
#include <c10/core/CPUAllocator.h>
#include <fmt/format.h>
#include <c10/util/intrusive_ptr.h>
#include <torch/csrc/StorageMethods.h>
#include <torch/csrc/Storage.h>
#include <ATen/ATen.h>
#include <ATen/MapAllocator.h>
#include <torch/csrc/utils/pycfunction_helpers.h>
@ -9,7 +30,6 @@
#include <ATen/native/cuda/Resize.h>
#endif
#include <c10/core/CPUAllocator.h>
#include <ATen/native/Resize.h>
#ifdef _MSC_VER
@ -18,7 +38,7 @@
#define LSEEK lseek
#endif
static PyObject * THPStorage_(nbytes)(PyObject *_self, PyObject *noargs)
static PyObject * THPStorage_nbytes(PyObject *_self, PyObject *noargs)
{
HANDLE_TH_ERRORS
auto self = (THPStorage*)_self;
@ -26,15 +46,15 @@ static PyObject * THPStorage_(nbytes)(PyObject *_self, PyObject *noargs)
END_HANDLE_TH_ERRORS
}
static PyObject * THPStorage_(dataPtr)(PyObject *_self, PyObject *noargs)
static PyObject * THPStorage_dataPtr(PyObject *_self, PyObject *noargs)
{
HANDLE_TH_ERRORS
auto self = (THPStorage*)_self;
return PyLong_FromVoidPtr(self->cdata->data<scalar_t>());
return PyLong_FromVoidPtr(self->cdata->data<uint8_t>());
END_HANDLE_TH_ERRORS
}
static PyObject * THPStorage_(copy_)(PyObject *self, PyObject *args, PyObject *kwargs)
static PyObject * THPStorage_copy_(PyObject *self, PyObject *args, PyObject *kwargs)
{
HANDLE_TH_ERRORS
@ -59,27 +79,27 @@ static PyObject * THPStorage_(copy_)(PyObject *self, PyObject *args, PyObject *k
END_HANDLE_TH_ERRORS
}
static PyObject * THPStorage_(isPinned)(PyObject *_self, PyObject *noargs)
static PyObject * THPStorage_isPinned(PyObject *_self, PyObject *noargs)
{
HANDLE_TH_ERRORS
auto self = (THPStorage*)_self;
#if defined(USE_CUDA)
return PyBool_FromLong(at::globalContext().isPinnedPtr(self->cdata->data<scalar_t>()));
return PyBool_FromLong(at::globalContext().isPinnedPtr(self->cdata->data<uint8_t>()));
#else
Py_RETURN_FALSE;
#endif
END_HANDLE_TH_ERRORS
}
static PyObject * THPStorage_(elementSize)(PyObject *_self, PyObject *noargs)
static PyObject * THPStorage_elementSize(PyObject *_self, PyObject *noargs)
{
HANDLE_TH_ERRORS
auto self = (THPStorage*)_self;
return THPUtils_packInt64(sizeof(scalar_t));
return THPUtils_packInt64(sizeof(uint8_t));
END_HANDLE_TH_ERRORS
}
static PyObject * THPStorage_(new)(PyObject *_self, PyObject *noargs)
static PyObject * THPStorage_new(PyObject *_self, PyObject *noargs)
{
HANDLE_TH_ERRORS
auto self = (THPStorage*)_self;
@ -91,11 +111,11 @@ static PyObject * THPStorage_(new)(PyObject *_self, PyObject *noargs)
/*resizable=*/true);
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
return THPStorage_(New)(std::move(new_storage));
return THPStorage_New(std::move(new_storage));
END_HANDLE_TH_ERRORS
}
static PyObject * THPStorage_(resize_)(PyObject *_self, PyObject *number_arg)
static PyObject * THPStorage_resize_(PyObject *_self, PyObject *number_arg)
{
HANDLE_TH_ERRORS
auto self = (THPStorage*)_self;
@ -123,22 +143,22 @@ static PyObject * THPStorage_(resize_)(PyObject *_self, PyObject *number_arg)
END_HANDLE_TH_ERRORS
}
static PyObject * THPStorage_(fill_)(PyObject *_self, PyObject *number_arg)
static PyObject * THPStorage_fill_(PyObject *_self, PyObject *number_arg)
{
HANDLE_TH_ERRORS
auto self = (THPStorage*)_self;
THPUtils_assert(THPUtils_(checkReal)(number_arg), "fill_ expects %s, "
"but got %s", THPUtils_typeTraits<scalar_t>::python_type_str,
THPUtils_assert(THPByteUtils_checkReal(number_arg), "fill_ expects %s, "
"but got %s", THPUtils_typeTraits<uint8_t>::python_type_str,
THPUtils_typename(number_arg));
storage_fill(
at::unsafeStorageFromTH(self->cdata, /*retain=*/true),
THPUtils_(unpackReal)(number_arg));
THPByteUtils_unpackReal(number_arg));
Py_INCREF(self);
return (PyObject*)self;
END_HANDLE_TH_ERRORS
}
static PyObject * THPStorage_(fromBuffer)(PyObject *_unused, PyObject *args, PyObject *keywds)
static PyObject * THPStorage_fromBuffer(PyObject *_unused, PyObject *args, PyObject *keywds)
{
HANDLE_TH_ERRORS
PyObject *obj = nullptr;
@ -276,11 +296,11 @@ static PyObject * THPStorage_(fromBuffer)(PyObject *_unused, PyObject *args, PyO
}
PyBuffer_Release(&buffer);
return (PyObject*)THPStorage_(New)(storage);
return (PyObject*)THPStorage_New(storage);
END_HANDLE_TH_ERRORS
}
static PyObject * THPStorage_(fromFile)(PyObject *_unused, PyObject *args, PyObject *keywds)
static PyObject * THPStorage_fromFile(PyObject *_unused, PyObject *args, PyObject *keywds)
{
HANDLE_TH_ERRORS
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
@ -309,11 +329,11 @@ static PyObject * THPStorage_(fromFile)(PyObject *_unused, PyObject *args, PyObj
storage->set_nbytes(actual_nbytes);
}
return (PyObject*)THPStorage_(New)(std::move(storage));
return (PyObject*)THPStorage_New(std::move(storage));
END_HANDLE_TH_ERRORS
}
PyObject * THPStorage_(writeFile)(PyObject *_self, PyObject *args)
PyObject * THPStorage_writeFile(PyObject *_self, PyObject *args)
{
HANDLE_TH_ERRORS
auto self = (THPStorage*)_self;
@ -327,19 +347,19 @@ PyObject * THPStorage_(writeFile)(PyObject *_self, PyObject *args)
uint64_t element_size = THPUtils_unpackUInt64(element_size_obj);
if (!is_real_file) {
THPStorage_(writeFileRaw<PyObject*>)(self->cdata, file, save_size, element_size);
THPStorage_writeFileRaw<PyObject*>(self->cdata, file, save_size, element_size);
Py_RETURN_NONE;
}
int fd = PyObject_AsFileDescriptor(file);
THPUtils_assert(fd != -1, "_write_file couldn't retrieve a file descriptor "
"from given object");
THPStorage_(writeFileRaw)(self->cdata, fd, save_size, element_size);
THPStorage_writeFileRaw(self->cdata, fd, save_size, element_size);
Py_RETURN_NONE;
END_HANDLE_TH_ERRORS
}
PyObject * THPStorage_(newWithFile)(PyObject *_unused, PyObject *args)
PyObject * THPStorage_newWithFile(PyObject *_unused, PyObject *args)
{
HANDLE_TH_ERRORS
TORCH_CHECK(PyTuple_Size(args) == 2,
@ -353,14 +373,14 @@ PyObject * THPStorage_(newWithFile)(PyObject *_unused, PyObject *args)
"_new_with_file: need to specify element size");
uint64_t element_size = THPUtils_unpackUInt64(element_size_obj);
auto storage = THPStorage_(readFileRaw<int>)(fd, {}, element_size);
auto storage = THPStorage_readFileRaw<int>(fd, {}, element_size);
if (!storage.defined())
return nullptr;
return THPStorage_(New)(std::move(storage));
return THPStorage_New(std::move(storage));
END_HANDLE_TH_ERRORS
}
static PyObject *THPStorage_(setFromFile)(PyObject *_self, PyObject *args)
static PyObject *THPStorage_setFromFile(PyObject *_self, PyObject *args)
{
HANDLE_TH_ERRORS
auto self = (THPStorage*)_self;
@ -381,7 +401,7 @@ static PyObject *THPStorage_(setFromFile)(PyObject *_self, PyObject *args)
"_set_from_file: offset is NYI for filelike objects");
auto self_storage = c10::intrusive_ptr<c10::StorageImpl>::reclaim_copy(self->cdata);
auto storage = THPStorage_(readFileRaw<PyObject*>)(file, std::move(self_storage), element_size);
auto storage = THPStorage_readFileRaw<PyObject*>(file, std::move(self_storage), element_size);
if (!storage.defined()) {
return nullptr;
}
@ -398,7 +418,7 @@ static PyObject *THPStorage_(setFromFile)(PyObject *_self, PyObject *args)
THPUtils_assert(fd != -1, "_set_from_file couldn't retrieve a file "
"descriptor from given object");
auto self_storage = c10::intrusive_ptr<c10::StorageImpl>::reclaim_copy(self->cdata);
auto storage = THPStorage_(readFileRaw<int>)(fd, self_storage, element_size);
auto storage = THPStorage_readFileRaw<int>(fd, self_storage, element_size);
if (!storage.defined())
return nullptr;
Py_INCREF(self);
@ -418,7 +438,7 @@ static PyObject *THPStorage_(setFromFile)(PyObject *_self, PyObject *args)
END_HANDLE_TH_ERRORS
}
PyObject * THPStorage_(_setCdata)(PyObject *_self, PyObject *new_cdata)
PyObject * THPStorage__setCdata(PyObject *_self, PyObject *new_cdata)
{
HANDLE_TH_ERRORS
auto self = (THPStorage*)_self;
@ -439,23 +459,27 @@ PyObject * THPStorage_(_setCdata)(PyObject *_self, PyObject *new_cdata)
}
// NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays,cppcoreguidelines-avoid-non-const-global-variables)
static PyMethodDef THPStorage_(methods)[] = {
{"copy_", castPyCFunctionWithKeywords(THPStorage_(copy_)),
static PyMethodDef THPStorage_methods[] = {
{"copy_", castPyCFunctionWithKeywords(THPStorage_copy_),
METH_VARARGS | METH_KEYWORDS, nullptr},
{"element_size", THPStorage_(elementSize), METH_NOARGS, nullptr},
{"fill_", THPStorage_(fill_), METH_O, nullptr},
{"new", THPStorage_(new), METH_NOARGS, nullptr},
{"resize_", THPStorage_(resize_), METH_O, nullptr},
{"nbytes", THPStorage_(nbytes), METH_NOARGS, nullptr},
{"data_ptr", THPStorage_(dataPtr), METH_NOARGS, nullptr},
{"is_pinned", THPStorage_(isPinned), METH_NOARGS, nullptr},
{"_write_file", THPStorage_(writeFile), METH_VARARGS, nullptr},
{"_new_with_file", THPStorage_(newWithFile), METH_VARARGS | METH_STATIC, nullptr},
{"_set_from_file", THPStorage_(setFromFile), METH_VARARGS, nullptr},
{"from_buffer", castPyCFunctionWithKeywords(THPStorage_(fromBuffer)),
{"element_size", THPStorage_elementSize, METH_NOARGS, nullptr},
{"fill_", THPStorage_fill_, METH_O, nullptr},
{"new", THPStorage_new, METH_NOARGS, nullptr},
{"resize_", THPStorage_resize_, METH_O, nullptr},
{"nbytes", THPStorage_nbytes, METH_NOARGS, nullptr},
{"data_ptr", THPStorage_dataPtr, METH_NOARGS, nullptr},
{"is_pinned", THPStorage_isPinned, METH_NOARGS, nullptr},
{"_write_file", THPStorage_writeFile, METH_VARARGS, nullptr},
{"_new_with_file", THPStorage_newWithFile, METH_VARARGS | METH_STATIC, nullptr},
{"_set_from_file", THPStorage_setFromFile, METH_VARARGS, nullptr},
{"from_buffer", castPyCFunctionWithKeywords(THPStorage_fromBuffer),
METH_VARARGS | METH_KEYWORDS | METH_STATIC, nullptr},
{"from_file", castPyCFunctionWithKeywords(THPStorage_(fromFile)),
{"from_file", castPyCFunctionWithKeywords(THPStorage_fromFile),
METH_VARARGS | METH_KEYWORDS | METH_STATIC, nullptr},
{"_set_cdata", THPStorage_(_setCdata), METH_O, nullptr},
{"_set_cdata", THPStorage__setCdata, METH_O, nullptr},
{nullptr}
};
// Accessor for this file's static method table, so Storage.cpp can splice
// these methods into the StorageBase type without seeing the table directly.
PyMethodDef* THPStorage_getMethods() {
  return THPStorage_methods;
}

View File

@ -0,0 +1,8 @@
#ifndef THP_STORAGE_METHODS_INC
#define THP_STORAGE_METHODS_INC
#include <Python.h>
// Returns the PyMethodDef table implemented in StorageMethods.cpp
// (copy_, fill_, resize_, nbytes, from_buffer, from_file, ...), for
// registration on the StorageBase type by THPStorage_init.
PyMethodDef* THPStorage_getMethods();
#endif

View File

@ -1,3 +1,24 @@
#include <torch/csrc/python_headers.h>
#ifdef _MSC_VER
#include <c10/util/win32-headers.h>
#endif
#include <structmember.h>
#include <libshm.h>
#include <torch/csrc/THP.h>
#include <torch/csrc/copy_utils.h>
#include <torch/csrc/DynamicTypes.h>
#include <torch/csrc/CudaIPCTypes.h>
#include <torch/csrc/Device.h>
#include <torch/csrc/autograd/utils/wrap_outputs.h>
#include <c10/core/CPUAllocator.h>
#include <fmt/format.h>
#include <c10/util/intrusive_ptr.h>
#include <torch/csrc/StorageSharing.h>
#include <torch/csrc/Storage.h>
#ifdef USE_CUDA
#include <cuda.h>
#include <cuda_runtime.h>
@ -9,7 +30,7 @@
#include <atomic>
#include <string>
static PyObject * THPStorage_(sharedDecref)(PyObject *_self, PyObject *noargs)
static PyObject * THPStorage_sharedDecref(PyObject *_self, PyObject *noargs)
{
HANDLE_TH_ERRORS
auto self = (THPStorage*)_self;
@ -26,7 +47,7 @@ static PyObject * THPStorage_(sharedDecref)(PyObject *_self, PyObject *noargs)
END_HANDLE_TH_ERRORS
}
static PyObject * THPStorage_(sharedIncref)(PyObject *_self, PyObject *noargs)
static PyObject * THPStorage_sharedIncref(PyObject *_self, PyObject *noargs)
{
HANDLE_TH_ERRORS
auto self = (THPStorage*)_self;
@ -42,7 +63,7 @@ static PyObject * THPStorage_(sharedIncref)(PyObject *_self, PyObject *noargs)
END_HANDLE_TH_ERRORS
}
static PyObject * THPStorage_(pyNewFilenameStorage)(PyObject *_unused, PyObject *args)
static PyObject * THPStorage_pyNewFilenameStorage(PyObject *_unused, PyObject *args)
{
HANDLE_TH_ERRORS
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
@ -53,7 +74,7 @@ static PyObject * THPStorage_(pyNewFilenameStorage)(PyObject *_unused, PyObject
int flags = at::ALLOCATOR_MAPPED_SHAREDMEM | at::ALLOCATOR_MAPPED_EXCLUSIVE;
std::string handle = at::NewProcessWideShmHandle();
return THPStorage_(New)(c10::make_intrusive<at::StorageImpl>(
return THPStorage_New(c10::make_intrusive<at::StorageImpl>(
c10::StorageImpl::use_byte_size_t(),
size,
THManagedMapAllocator::makeDataPtr("", handle.c_str(), flags, size),
@ -62,7 +83,7 @@ static PyObject * THPStorage_(pyNewFilenameStorage)(PyObject *_unused, PyObject
END_HANDLE_TH_ERRORS
}
static PyObject * THPStorage_(shareFilename)(PyObject *_self, PyObject *noargs)
static PyObject * THPStorage_shareFilename(PyObject *_self, PyObject *noargs)
{
HANDLE_TH_ERRORS
TORCH_CHECK(reinterpret_cast<THPStorage*>(_self)->cdata->device_type() == at::kCPU,
@ -98,7 +119,7 @@ static PyObject * THPStorage_(shareFilename)(PyObject *_self, PyObject *noargs)
if (!manager_handle) return nullptr;
THPObjectPtr storage_handle(PyBytes_FromString(ctx->filename()));
if (!storage_handle) return nullptr;
THPObjectPtr size(THPUtils_packUInt64(storage->nbytes() / sizeof(scalar_t)));
THPObjectPtr size(THPUtils_packUInt64(storage->nbytes() / sizeof(uint8_t)));
if (!size) return nullptr;
THPObjectPtr tuple(PyTuple_New(3));
@ -110,7 +131,7 @@ static PyObject * THPStorage_(shareFilename)(PyObject *_self, PyObject *noargs)
END_HANDLE_TH_ERRORS
}
static PyObject * THPStorage_(newSharedFilename)(PyObject *_unused, PyObject *args)
static PyObject * THPStorage_newSharedFilename(PyObject *_unused, PyObject *args)
{
HANDLE_TH_ERRORS
THPUtils_assert(PyTuple_GET_SIZE(args) == 3, "tuple of 3 items expected");
@ -127,7 +148,7 @@ static PyObject * THPStorage_(newSharedFilename)(PyObject *_unused, PyObject *ar
int64_t size = THPUtils_unpackLong(_size);
int flags = at::ALLOCATOR_MAPPED_SHAREDMEM |
at::ALLOCATOR_MAPPED_NOCREATE;
return THPStorage_(New)(
return THPStorage_New(
c10::make_intrusive<at::StorageImpl>(
c10::StorageImpl::use_byte_size_t(),
size,
@ -137,14 +158,14 @@ static PyObject * THPStorage_(newSharedFilename)(PyObject *_unused, PyObject *ar
END_HANDLE_TH_ERRORS
}
static c10::intrusive_ptr<c10::StorageImpl> THPStorage_(newFdStorage)(ptrdiff_t size)
static c10::intrusive_ptr<c10::StorageImpl> THPStorage_newFdStorage(ptrdiff_t size)
{
int flags = at::ALLOCATOR_MAPPED_SHAREDMEM |
at::ALLOCATOR_MAPPED_EXCLUSIVE |
at::ALLOCATOR_MAPPED_KEEPFD |
at::ALLOCATOR_MAPPED_UNLINK;
std::string handle = at::NewProcessWideShmHandle();
auto sptr = at::MapAllocator::makeDataPtr(handle.c_str(), flags, size * sizeof(scalar_t), nullptr);
auto sptr = at::MapAllocator::makeDataPtr(handle.c_str(), flags, size * sizeof(uint8_t), nullptr);
return c10::make_intrusive<at::StorageImpl>(
c10::StorageImpl::use_byte_size_t(),
size,
@ -153,7 +174,7 @@ static c10::intrusive_ptr<c10::StorageImpl> THPStorage_(newFdStorage)(ptrdiff_t
/*resizable=*/false);
}
static PyObject * THPStorage_(pyNewFdStorage)(PyObject *_unused, PyObject *args)
static PyObject * THPStorage_pyNewFdStorage(PyObject *_unused, PyObject *args)
{
HANDLE_TH_ERRORS
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
@ -161,11 +182,11 @@ static PyObject * THPStorage_(pyNewFdStorage)(PyObject *_unused, PyObject *args)
if (!PyArg_ParseTuple(args, "L", &size)) {
return nullptr;
}
return THPStorage_(New)(THPStorage_(newFdStorage)(size));
return THPStorage_New(THPStorage_newFdStorage(size));
END_HANDLE_TH_ERRORS
}
static PyObject * THPStorage_(shareFd)(PyObject *_self, PyObject *noargs)
static PyObject * THPStorage_shareFd(PyObject *_self, PyObject *noargs)
{
HANDLE_TH_ERRORS
TORCH_CHECK(reinterpret_cast<THPStorage*>(_self)->cdata->device_type() == at::kCPU,
@ -178,7 +199,7 @@ static PyObject * THPStorage_(shareFd)(PyObject *_self, PyObject *noargs)
if ((ctx = at::MapAllocator::fromDataPtr(storage->data_ptr()))) {
// done
} else {
at::Storage new_storage(THPStorage_(newFdStorage)(storage->nbytes()));
at::Storage new_storage(THPStorage_newFdStorage(storage->nbytes()));
at::Storage _self_aten = torch::createStorage(_self);
storage_copy(new_storage, _self_aten);
@ -189,7 +210,7 @@ static PyObject * THPStorage_(shareFd)(PyObject *_self, PyObject *noargs)
THPObjectPtr storage_handle(THPUtils_packInt32(ctx->fd()));
if (!storage_handle) return nullptr;
THPObjectPtr size(THPUtils_packUInt64(storage->nbytes() / sizeof(scalar_t)));
THPObjectPtr size(THPUtils_packUInt64(storage->nbytes() / sizeof(uint8_t)));
if (!size) return nullptr;
THPObjectPtr tuple(PyTuple_New(2));
@ -200,7 +221,7 @@ static PyObject * THPStorage_(shareFd)(PyObject *_self, PyObject *noargs)
END_HANDLE_TH_ERRORS
}
static PyObject * THPStorage_(newSharedFd)(PyObject *_unused, PyObject *args)
static PyObject * THPStorage_newSharedFd(PyObject *_unused, PyObject *args)
{
HANDLE_TH_ERRORS
THPUtils_assert(PyTuple_GET_SIZE(args) == 2, "tuple of 2 items expected");
@ -224,7 +245,7 @@ static PyObject * THPStorage_(newSharedFd)(PyObject *_unused, PyObject *args)
at::ALLOCATOR_MAPPED_NOCREATE |
at::ALLOCATOR_MAPPED_KEEPFD |
at::ALLOCATOR_MAPPED_FROMFD;
return THPStorage_(New)(
return THPStorage_New(
c10::make_intrusive<at::StorageImpl>(
c10::StorageImpl::use_byte_size_t(),
size,
@ -234,7 +255,7 @@ static PyObject * THPStorage_(newSharedFd)(PyObject *_unused, PyObject *args)
END_HANDLE_TH_ERRORS
}
static PyObject * THPStorage_(shareCuda)(PyObject *_self, PyObject *noargs)
static PyObject * THPStorage_shareCuda(PyObject *_self, PyObject *noargs)
{
HANDLE_TH_ERRORS
#ifdef USE_CUDA
@ -262,11 +283,11 @@ static PyObject * THPStorage_(shareCuda)(PyObject *_self, PyObject *noargs)
Py_INCREF(Py_None);
THPObjectPtr _event_sync_required(Py_None);
Py_INCREF(Py_None);
if (storage->data<scalar_t>()) {
if (storage->data<uint8_t>()) {
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
size_t base_size;
void *base_ptr = c10::cuda::CUDACachingAllocator::getBaseAllocation(storage->data<scalar_t>(), &base_size);
ptrdiff_t offset_bytes = (char*)storage->data<scalar_t>() - (char*)base_ptr;
void *base_ptr = c10::cuda::CUDACachingAllocator::getBaseAllocation(storage->data<uint8_t>(), &base_size);
ptrdiff_t offset_bytes = (char*)storage->data<uint8_t>() - (char*)base_ptr;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
cudaIpcMemHandle_t handle;
@ -320,7 +341,7 @@ static PyObject * THPStorage_(shareCuda)(PyObject *_self, PyObject *noargs)
END_HANDLE_TH_ERRORS
}
static PyObject * THPStorage_(releaseIPCCounter)(PyObject *_unused, PyObject *args)
static PyObject * THPStorage_releaseIPCCounter(PyObject *_unused, PyObject *args)
{
HANDLE_TH_ERRORS
#ifdef USE_CUDA
@ -363,7 +384,7 @@ static PyObject * THPStorage_(releaseIPCCounter)(PyObject *_unused, PyObject *ar
}
#ifdef USE_CUDA
static std::string THPStorage_(bytesAsHandleString)(PyObject *handle) {
static std::string THPStorage_bytesAsHandleString(PyObject *handle) {
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
char* buffer;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
@ -379,7 +400,7 @@ static std::string THPStorage_(bytesAsHandleString)(PyObject *handle) {
}
#endif
static PyObject * THPStorage_(newSharedCuda)(PyObject *_unused, PyObject *args)
static PyObject * THPStorage_newSharedCuda(PyObject *_unused, PyObject *args)
{
HANDLE_TH_ERRORS
#ifdef USE_CUDA
@ -405,7 +426,7 @@ static PyObject * THPStorage_(newSharedCuda)(PyObject *_unused, PyObject *args)
return nullptr;
}
size_t storage_size = (size_t)THPUtils_unpackLong(_size_bytes) / sizeof(scalar_t);
size_t storage_size = (size_t)THPUtils_unpackLong(_size_bytes) / sizeof(uint8_t);
ptrdiff_t storage_offset_bytes = (ptrdiff_t)THPUtils_unpackLong(_offset_bytes);
int64_t device = THPUtils_unpackLong(_device);
@ -414,7 +435,7 @@ static PyObject * THPStorage_(newSharedCuda)(PyObject *_unused, PyObject *args)
if (PyObject_IsTrue(_event_sync_required)) {
// Ensure that producer prepared all tensor's data
std::string s_ipc_event_handle =
THPStorage_(bytesAsHandleString)(_event_handle);
THPStorage_bytesAsHandleString(_event_handle);
auto ipc_event_handle = reinterpret_cast<const cudaIpcEventHandle_t*>(
s_ipc_event_handle.c_str());
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
@ -424,7 +445,7 @@ static PyObject * THPStorage_(newSharedCuda)(PyObject *_unused, PyObject *args)
cudaStreamWaitEvent(c10::cuda::getCurrentCUDAStream(device), event, 0));
}
std::string s_handle = THPStorage_(bytesAsHandleString)(_handle);
std::string s_handle = THPStorage_bytesAsHandleString(_handle);
std::shared_ptr<void> basePtr = c10::cuda::CUDACachingAllocator::getIpcDevPtr(s_handle);
// Offset the basePtr to reconstruct the real storage
@ -499,7 +520,7 @@ static PyObject * THPStorage_(newSharedCuda)(PyObject *_unused, PyObject *args)
base->set_resizable(false);
base->set_received_cuda(true);
return THPStorage_(New)(std::move(base));
return THPStorage_New(std::move(base));
#else
TORCH_CHECK(false, "CUDA is not available");
#endif
@ -511,7 +532,7 @@ static PyObject * THPStorage_(newSharedCuda)(PyObject *_unused, PyObject *args)
// pointer.
//
// NB: This does NOT preserve object identity when you call it multiple times
static PyObject * THPStorage_(weakRef)(PyObject *_self, PyObject *args) {
static PyObject * THPStorage_weakRef(PyObject *_self, PyObject *args) {
HANDLE_TH_ERRORS
auto self = (THPStorage*)_self;
c10::StorageImpl* storage = self->cdata;
@ -519,20 +540,20 @@ static PyObject * THPStorage_(weakRef)(PyObject *_self, PyObject *args) {
END_HANDLE_TH_ERRORS
}
PyObject * THPStorage_(newWithWeakPtr)(PyObject *_unused, PyObject *arg)
PyObject * THPStorage_newWithWeakPtr(PyObject *_unused, PyObject *arg)
{
HANDLE_TH_ERRORS
THPUtils_assert(THPUtils_checkLong(arg),
"_new_with_weak_ptr(): arg must be an 'int'");
c10::StorageImpl *weak_storage = (c10::StorageImpl*)PyLong_AsVoidPtr(arg);
if (auto* storage = c10::raw::weak_intrusive_ptr::lock(weak_storage)) {
return THPStorage_(New)(c10::intrusive_ptr<c10::StorageImpl>::reclaim(storage));
return THPStorage_New(c10::intrusive_ptr<c10::StorageImpl>::reclaim(storage));
}
Py_RETURN_NONE;
END_HANDLE_TH_ERRORS
}
PyObject * THPStorage_(freeWeakRef)(PyObject *_unused, PyObject *arg)
PyObject * THPStorage_freeWeakRef(PyObject *_unused, PyObject *arg)
{
HANDLE_TH_ERRORS
if (arg == Py_None) {
@ -547,7 +568,7 @@ PyObject * THPStorage_(freeWeakRef)(PyObject *_unused, PyObject *arg)
END_HANDLE_TH_ERRORS
}
PyObject * THPStorage_(expired)(PyObject *_unused, PyObject *arg)
PyObject * THPStorage_expired(PyObject *_unused, PyObject *arg)
{
HANDLE_TH_ERRORS
THPUtils_assert(THPUtils_checkLong(arg), "_expired(): arg must be an 'int'");
@ -556,7 +577,7 @@ PyObject * THPStorage_(expired)(PyObject *_unused, PyObject *arg)
END_HANDLE_TH_ERRORS
}
PyObject * THPStorage_(sharedFd)(PyObject *_self, PyObject *noargs)
PyObject * THPStorage_sharedFd(PyObject *_self, PyObject *noargs)
{
HANDLE_TH_ERRORS
auto self = (THPStorage*)_self;
@ -571,7 +592,7 @@ PyObject * THPStorage_(sharedFd)(PyObject *_self, PyObject *noargs)
END_HANDLE_TH_ERRORS
}
PyObject * THPStorage_(isShared)(PyObject *_self, PyObject *noargs)
PyObject * THPStorage_isShared(PyObject *_self, PyObject *noargs)
{
auto self = (THPStorage*)_self;
if (self->cdata->device_type() == at::kCUDA) {
@ -586,23 +607,27 @@ PyObject * THPStorage_(isShared)(PyObject *_self, PyObject *noargs)
}
// NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays,cppcoreguidelines-avoid-non-const-global-variables)
static PyMethodDef THPStorage_(sharingMethods)[] = {
{"_new_with_weak_ptr", THPStorage_(newWithWeakPtr), METH_O | METH_CLASS, nullptr},
{"_share_cuda_", THPStorage_(shareCuda), METH_NOARGS, nullptr},
{"_new_shared_cuda", THPStorage_(newSharedCuda), METH_VARARGS | METH_STATIC, nullptr},
{"_release_ipc_counter_cuda", THPStorage_(releaseIPCCounter), METH_VARARGS | METH_STATIC, nullptr},
{"_share_fd_cpu_", THPStorage_(shareFd), METH_NOARGS, nullptr},
{"_new_shared_fd_cpu", THPStorage_(newSharedFd), METH_VARARGS | METH_STATIC, nullptr},
{"_new_using_fd_cpu", THPStorage_(pyNewFdStorage), METH_VARARGS | METH_STATIC, nullptr},
{"_share_filename_cpu_", THPStorage_(shareFilename), METH_NOARGS, nullptr},
{"_new_shared_filename_cpu", THPStorage_(newSharedFilename), METH_VARARGS | METH_STATIC, nullptr},
{"_new_using_filename_cpu", THPStorage_(pyNewFilenameStorage), METH_VARARGS | METH_STATIC, nullptr},
{"_weak_ref", THPStorage_(weakRef), METH_NOARGS, nullptr},
{"_free_weak_ref", THPStorage_(freeWeakRef), METH_O | METH_STATIC, nullptr},
{"_expired", THPStorage_(expired), METH_O | METH_STATIC, nullptr},
{"_shared_decref", THPStorage_(sharedDecref), METH_NOARGS, nullptr},
{"_shared_incref", THPStorage_(sharedIncref), METH_NOARGS, nullptr},
{"_get_shared_fd", THPStorage_(sharedFd), METH_NOARGS, nullptr},
{"is_shared", THPStorage_(isShared), METH_NOARGS, nullptr},
// Python-visible method table for storage sharing / IPC support:
// file-descriptor and filename based sharing for CPU storages,
// CUDA IPC memory/event handles for CUDA storages, and weak-reference
// bookkeeping helpers.  The "_cpu" / "_cuda" suffixes in the exposed
// names indicate which device each entry supports.
static PyMethodDef THPStorage_sharingMethods[] = {
{"_new_with_weak_ptr", THPStorage_newWithWeakPtr, METH_O | METH_CLASS, nullptr},
{"_share_cuda_", THPStorage_shareCuda, METH_NOARGS, nullptr},
{"_new_shared_cuda", THPStorage_newSharedCuda, METH_VARARGS | METH_STATIC, nullptr},
{"_release_ipc_counter_cuda", THPStorage_releaseIPCCounter, METH_VARARGS | METH_STATIC, nullptr},
{"_share_fd_cpu_", THPStorage_shareFd, METH_NOARGS, nullptr},
{"_new_shared_fd_cpu", THPStorage_newSharedFd, METH_VARARGS | METH_STATIC, nullptr},
{"_new_using_fd_cpu", THPStorage_pyNewFdStorage, METH_VARARGS | METH_STATIC, nullptr},
{"_share_filename_cpu_", THPStorage_shareFilename, METH_NOARGS, nullptr},
{"_new_shared_filename_cpu", THPStorage_newSharedFilename, METH_VARARGS | METH_STATIC, nullptr},
{"_new_using_filename_cpu", THPStorage_pyNewFilenameStorage, METH_VARARGS | METH_STATIC, nullptr},
{"_weak_ref", THPStorage_weakRef, METH_NOARGS, nullptr},
{"_free_weak_ref", THPStorage_freeWeakRef, METH_O | METH_STATIC, nullptr},
{"_expired", THPStorage_expired, METH_O | METH_STATIC, nullptr},
{"_shared_decref", THPStorage_sharedDecref, METH_NOARGS, nullptr},
{"_shared_incref", THPStorage_sharedIncref, METH_NOARGS, nullptr},
{"_get_shared_fd", THPStorage_sharedFd, METH_NOARGS, nullptr},
{"is_shared", THPStorage_isShared, METH_NOARGS, nullptr},
{nullptr}  // sentinel terminating the table
};
// Accessor allowing other translation units (via StorageSharing.h) to
// retrieve this file-local table, e.g. to append the methods onto the
// storage type during module initialization.
PyMethodDef* THPStorage_getSharingMethods() {
return THPStorage_sharingMethods;
}

View File

@ -0,0 +1,8 @@
#ifndef THP_STORAGE_SHARING_INC
#define THP_STORAGE_SHARING_INC

// Public interface of StorageSharing.cpp: exposes the (file-local)
// PyMethodDef table of storage sharing/IPC methods so the storage
// type can register them.
#include <Python.h>

PyMethodDef* THPStorage_getSharingMethods();

#endif

View File

@ -1,366 +0,0 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "torch/csrc/generic/Storage.cpp"
#else
#include <torch/csrc/utils/python_arg_parser.h>
PyObject *THPStorageClass = nullptr;
PyObject * THPStorage_(New)(c10::intrusive_ptr<c10::StorageImpl> ptr)
{
AT_ASSERT(ptr);
PyTypeObject *type = (PyTypeObject *)THPStorageClass;
PyObject *obj = type->tp_alloc(type, 0);
if (obj) {
((THPStorage *)obj)->cdata = ptr.release();
}
return obj;
}
static void THPStorage_(dealloc)(THPStorage* self)
{
if (self->cdata) {
c10::raw::intrusive_ptr::decref(self->cdata);
}
Py_TYPE(self)->tp_free((PyObject*)self);
}
static PyObject * THPStorage_(pynew)(PyTypeObject *type, PyObject *args, PyObject *kwargs)
{
HANDLE_TH_ERRORS
static torch::PythonArgParser parser({
THPStorageStr "(*, int64_t allocator=None, Device device=None)",
THPStorageStr "(int64_t size, *, int64_t allocator=None, Device device=None)",
THPStorageStr "(PyObject* sequence, *, int64_t allocator=None, Device device=None)",
});
torch::ParsedArgs<3> parsed_args;
auto r = parser.parse(args, kwargs, parsed_args);
int64_t allocator_arg_idx = 0;
int64_t device_arg_idx = 1;
if (r.idx > 0) {
allocator_arg_idx = 1;
device_arg_idx = 2;
}
c10::optional<int64_t> allocator_opt = r.toInt64Optional(allocator_arg_idx);
c10::optional<at::Device> device_opt = r.deviceOptional(device_arg_idx);
TORCH_CHECK(!allocator_opt.has_value() || !device_opt.has_value(),
THPStorageStr, "(): only one or neither of 'allocator' or 'device' can ",
"be given, but not both");
THPStoragePtr self((THPStorage *)type->tp_alloc(type, 0));
THPUtils_assert(self, "failed to allocate a " THPStorageStr " object");
c10::Allocator* allocator = nullptr;
at::OptionalDeviceGuard device_guard;
if (allocator_opt.has_value()) {
allocator = reinterpret_cast<c10::Allocator*>(allocator_opt.value());
} else if (device_opt.has_value()) {
at::Device device = device_opt.value();
if (device.type() == at::kCPU) {
allocator = c10::GetDefaultCPUAllocator();
#ifdef USE_CUDA
} else if (device.type() == at::kCUDA) {
at::globalContext().lazyInitCUDA();
allocator = c10::cuda::CUDACachingAllocator::get();
#endif
} else if (device.type() == at::DeviceType::Meta) {
allocator = c10::GetAllocator(device.type());
} else {
TORCH_CHECK(false,
THPStorageStr, "(): Storage device not recognized: ", device.type());
}
device_guard.reset_device(device);
} else {
allocator = c10::GetDefaultCPUAllocator();
}
// torch.Storage(*, ...)
if (r.idx == 0) {
self->cdata = c10::make_intrusive<at::StorageImpl>(
c10::StorageImpl::use_byte_size_t(),
0,
allocator,
/*resizable=*/true).release();
return (PyObject*)self.release();
// torch.Storage(size, *, ...)
} else if (r.idx == 1) {
int64_t size = r.toInt64(0);
self->cdata = c10::make_intrusive<at::StorageImpl>(
c10::StorageImpl::use_byte_size_t(),
size,
allocator,
/*resizable=*/true).release();
return (PyObject*)self.release();
// torch.Storage(sequence, *, ...)
} else if (r.idx == 2) {
PyObject *sequence = r.pyobject(0);
Py_ssize_t length = PySequence_Length(sequence);
TORCH_CHECK(PySequence_Check(sequence),
THPStorageStr, "(): Expected a sequence type, but got ",
THPUtils_typename(sequence));
TORCH_CHECK(length >= 0,
THPStorageStr, "(): Could not obtain the length of sequence of type ",
THPUtils_typename(sequence));
self->cdata = c10::make_intrusive<at::StorageImpl>(
c10::StorageImpl::use_byte_size_t(),
length,
allocator,
/*resizable=*/true)
.release();
THPObjectPtr item;
try {
for (Py_ssize_t i = 0; i < length; i++) {
item = PySequence_GetItem(sequence, i);
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
scalar_t value = THPUtils_(unpackReal)(item.get());
if (allocator == c10::GetDefaultCPUAllocator()) {
self->cdata->unsafe_data<scalar_t>()[i] = value;
} else {
// TODO: this might be slow - consider batched updates?
storage_set(
at::unsafeStorageFromTH(self->cdata, /*retain=*/true),
i,
value);
}
}
} catch (const std::exception &e) {
THPUtils_setError(THPStorageStr
"(): tried to construct a storage from a sequence (%s), "
"but one of the items was of type %s instead of %s",
THPUtils_typename(sequence),
THPUtils_typename(item.get()),
THPUtils_typeTraits<scalar_t>::python_type_str);
return nullptr;
}
return (PyObject*)self.release();
}
Py_RETURN_NONE;
END_HANDLE_TH_ERRORS
}
static Py_ssize_t THPStorage_(length)(THPStorage *self)
{
HANDLE_TH_ERRORS
return self->cdata->nbytes() / sizeof(scalar_t);
END_HANDLE_TH_ERRORS_RET(-1)
}
static PyObject * THPStorage_(get)(THPStorage *self, PyObject *index)
{
HANDLE_TH_ERRORS
/* Integer index */
if (THPUtils_checkLong(index)) {
int64_t nindex = THPUtils_unpackLong(index);
if (nindex < 0)
nindex += (self->cdata->nbytes() / sizeof(scalar_t));
if (nindex < 0 || nindex >= static_cast<int64_t>(self->cdata->nbytes() / sizeof(scalar_t))) {
PyErr_SetString(PyExc_IndexError, fmt::format(
"index {} out of range for storage of size {}",
nindex, self->cdata->nbytes() / sizeof(scalar_t)));
return nullptr;
}
scalar_t value = storage_get(at::unsafeStorageFromTH(self->cdata, /*retain=*/true), nindex);
return THPUtils_(newReal)(value);
/* Slice index */
} else if (PySlice_Check(index)) {
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
Py_ssize_t start, stop, slicelength, step;
int64_t len = self->cdata->nbytes() / sizeof(scalar_t);
if (!THPUtils_parseSlice(index, len, &start, &stop, &step, &slicelength))
return nullptr;
if (step != 1) {
THPUtils_setError("Trying to slice with a step of %lld, but only a step of "
"1 is supported", (long long)step);
return nullptr;
}
scalar_t *data = self->cdata->data<scalar_t>();
at::StorageImpl* old_storage = self->cdata;
c10::raw::intrusive_ptr::incref(old_storage);
auto new_storage = c10::make_intrusive<at::StorageImpl>(
c10::StorageImpl::use_byte_size_t(),
#ifdef THQUANTIZED
slicelength * sizeof(quantized_t),
#else
slicelength * sizeof(scalar_t),
#endif
at::DataPtr(
static_cast<void*>(data + start),
old_storage,
[](void* s) {
c10::raw::intrusive_ptr::decref(static_cast<at::StorageImpl*>(s));
},
old_storage->device()),
old_storage->allocator(),
/* resizable */ false);
PyObject *_ret = THPStorage_(New)(std::move(new_storage));
return _ret;
}
PyErr_Format(PyExc_TypeError, "can't index a " THPStorageStr " with %s",
THPUtils_typename(index));
return nullptr;
END_HANDLE_TH_ERRORS
}
static int THPStorage_(set)(THPStorage *self, PyObject *index, PyObject *value)
{
HANDLE_TH_ERRORS
if (!THPUtils_(checkReal)(value)) {
THPUtils_setError("can only set storage content with a %s, but got "
"%s instead", THPUtils_typeTraits<scalar_t>::python_type_str,
THPUtils_typename(value));
return -1;
}
scalar_t rvalue = THPUtils_(unpackReal)(value);
if (THPUtils_checkLong(index)) {
int64_t nindex = THPUtils_unpackLong(index);
storage_set(
at::unsafeStorageFromTH(self->cdata, /*retain=*/true),
nindex,
rvalue);
return 0;
} else if (PySlice_Check(index)) {
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
Py_ssize_t start, stop, slicelength, step;
int64_t len = self->cdata->nbytes() / sizeof(scalar_t);
if (!THPUtils_parseSlice(index, len, &start, &stop, &step, &slicelength))
return -1;
if (step != 1) {
THPUtils_setError("Trying to slice with a step of %lld, but only a step of "
"1 is supported", (long long)step);
return 0;
}
// TODO: check the bounds only once
// TODO: fill?
for (;start < stop; start++)
storage_set(
at::unsafeStorageFromTH(self->cdata, /*retain=*/true),
start,
rvalue);
return 0;
}
THPUtils_setError("can't index a " THPStorageStr " with %s",
THPUtils_typename(index));
return -1;
END_HANDLE_TH_ERRORS_RET(-1)
}
static PyMappingMethods THPStorage_(mappingmethods) = {
(lenfunc)THPStorage_(length),
(binaryfunc)THPStorage_(get),
(objobjargproc)THPStorage_(set)
};
// TODO: implement equality
PyTypeObject THPStorageType = {
PyVarObject_HEAD_INIT(nullptr, 0)
"torch._C." THPStorageBaseStr, /* tp_name */
sizeof(THPStorage), /* tp_basicsize */
0, /* tp_itemsize */
(destructor)THPStorage_(dealloc), /* tp_dealloc */
0, /* tp_vectorcall_offset */
nullptr, /* tp_getattr */
nullptr, /* tp_setattr */
nullptr, /* tp_reserved */
nullptr, /* tp_repr */
nullptr, /* tp_as_number */
nullptr, /* tp_as_sequence */
&THPStorage_(mappingmethods), /* tp_as_mapping */
nullptr, /* tp_hash */
nullptr, /* tp_call */
nullptr, /* tp_str */
nullptr, /* tp_getattro */
nullptr, /* tp_setattro */
nullptr, /* tp_as_buffer */
Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
nullptr, /* tp_doc */
nullptr, /* tp_traverse */
nullptr, /* tp_clear */
nullptr, /* tp_richcompare */
0, /* tp_weaklistoffset */
nullptr, /* tp_iter */
nullptr, /* tp_iternext */
nullptr, /* will be assigned in init */ /* tp_methods */
nullptr, /* will be assigned in init */ /* tp_members */
nullptr, /* tp_getset */
nullptr, /* tp_base */
nullptr, /* tp_dict */
nullptr, /* tp_descr_get */
nullptr, /* tp_descr_set */
0, /* tp_dictoffset */
nullptr, /* tp_init */
nullptr, /* tp_alloc */
THPStorage_(pynew), /* tp_new */
};
// NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays,cppcoreguidelines-avoid-non-const-global-variables)
static struct PyMemberDef THPStorage_(members)[] = {
{(char*)"_cdata", T_ULONGLONG, offsetof(THPStorage, cdata), READONLY, nullptr},
{nullptr}
};
static PyObject * THPStorage_(device)(THPStorage* self, void *unused) {
HANDLE_TH_ERRORS
return THPDevice_New(self->cdata->device());
END_HANDLE_TH_ERRORS
}
static PyObject * THPStorage_(dtype)(THPStorage *self, void *unused)
{
HANDLE_TH_ERRORS
return torch::autograd::utils::wrap(
torch::getTHPDtype(at::typeMetaToScalarType(
#ifdef THQUANTIZED
caffe2::TypeMeta::Make<quantized_t>()
#else
caffe2::TypeMeta::Make<scalar_t>()
#endif
)));
END_HANDLE_TH_ERRORS
}
typedef PyObject *(*getter)(PyObject *, void *);
// NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays,cppcoreguidelines-avoid-non-const-global-variables)
static struct PyGetSetDef THPStorage_(properties)[] = {
{"device", (getter)THPStorage_(device), nullptr, nullptr, nullptr},
{nullptr}
};
// NOLINTNEXTLINE(bugprone-suspicious-include)
#include <torch/csrc/generic/StorageMethods.cpp>
// NOLINTNEXTLINE(bugprone-suspicious-include)
#include <torch/csrc/generic/StorageSharing.cpp>
bool THPStorage_(init)(PyObject *module)
{
static std::vector<PyMethodDef> methods;
THPUtils_addPyMethodDefs(methods, THPStorage_(methods));
THPUtils_addPyMethodDefs(methods, THPStorage_(sharingMethods));
THPStorageType.tp_methods = methods.data();
THPStorageType.tp_members = THPStorage_(members);
THPStorageType.tp_getset = THPStorage_(properties);
if (PyType_Ready(&THPStorageType) < 0)
return false;
Py_INCREF(&THPStorageType);
PyModule_AddObject(module, THPStorageBaseStr, (PyObject *)&THPStorageType);
return true;
}
void THPStorage_(postInit)(PyObject *module)
{
THPStorageClass = PyObject_GetAttrString(module, "_UntypedStorage");
if (!THPStorageClass) throw python_error();
}
#endif

View File

@ -1,17 +0,0 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "torch/csrc/generic/Storage.h"
#else
#include <torch/csrc/StorageDefs.h>
TORCH_PYTHON_API PyObject * THPStorage_(New)(c10::intrusive_ptr<c10::StorageImpl> ptr);
extern PyObject *THPStorageClass;
#include <torch/csrc/Types.h>
bool THPStorage_(init)(PyObject *module);
void THPStorage_(postInit)(PyObject *module);
extern PyTypeObject THPStorageType;
#endif

View File

@ -1,188 +0,0 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "torch/csrc/generic/serialization.cpp"
#else
#include <c10/core/CPUAllocator.h>
// save_save is necessary since the old eager format saved storages as
// [size + data], but the v1.5 eager format removes this since size is saved in
// the filesize.
template <class io>
void THPStorage_(writeFileRaw)(c10::StorageImpl *self, io fd, bool save_size, uint64_t element_size)
{
c10::DeviceGuard guard(self->device());
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
scalar_t *data;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
std::unique_ptr<char[]> cpu_data;
int64_t size_bytes = self->nbytes();
int64_t numel = size_bytes / element_size;
if (self->device_type() == at::kCPU) {
data = self->data<scalar_t>();
#ifdef USE_CUDA
} else if (self->device_type() == at::kCUDA) {
cpu_data = std::unique_ptr<char[]>(new char[size_bytes]);
data = (scalar_t*)cpu_data.get();
C10_CUDA_CHECK(cudaMemcpy(
data,
self->data<scalar_t>(),
size_bytes,
cudaMemcpyDeviceToHost));
#endif
} else {
TORCH_CHECK(false, "writeFileRaw: Device not recognized: ", self->device_type());
}
if (save_size) {
if (torch::utils::THP_nativeByteOrder() ==
torch::utils::THPByteOrder::THP_LITTLE_ENDIAN)
doWrite(fd, &numel, sizeof(int64_t));
else {
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int64_t nsize; // convert big endian cpu to little endian storage
torch::utils::THP_encodeInt64Buffer(
(uint8_t*)&nsize,
(const int64_t*)&numel,
torch::utils::THPByteOrder::THP_LITTLE_ENDIAN,
1);
doWrite(fd, &nsize, sizeof(int64_t));
}
}
// fast track for bytes and little endian
if (element_size == 1 ||
torch::utils::THP_nativeByteOrder() ==
torch::utils::THPByteOrder::THP_LITTLE_ENDIAN) {
doWrite(fd, data, size_bytes);
} else {
int64_t buffer_size = std::min(numel, (int64_t)5000);
// NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
std::unique_ptr<uint8_t[]> le_buffer(new uint8_t[buffer_size * element_size]);
for (int64_t i = 0; i < numel; i += buffer_size) {
size_t to_convert = std::min(numel - i, buffer_size);
// NOLINTNEXTLINE(bugprone-branch-clone)
if (element_size == 2) {
torch::utils::THP_encodeInt16Buffer(
(uint8_t*)le_buffer.get(),
(const int16_t*)data + i,
torch::utils::THPByteOrder::THP_LITTLE_ENDIAN,
to_convert);
} else if (element_size == 4) {
torch::utils::THP_encodeInt32Buffer(
(uint8_t*)le_buffer.get(),
(const int32_t*)data + i,
torch::utils::THPByteOrder::THP_LITTLE_ENDIAN,
to_convert);
} else if (element_size == 8) {
torch::utils::THP_encodeInt64Buffer(
(uint8_t*)le_buffer.get(),
(const int64_t*)data + i,
torch::utils::THPByteOrder::THP_LITTLE_ENDIAN,
to_convert);
}
doWrite(fd, le_buffer.get(), to_convert * element_size);
}
}
}
template void THPStorage_(writeFileRaw<int>)(c10::StorageImpl *self, int fd, bool save_size, uint64_t element_size);
template void THPStorage_(writeFileRaw<PyObject*>)(c10::StorageImpl *self, PyObject* fd, bool save_size, uint64_t element_size);
template <class io>
c10::intrusive_ptr<c10::StorageImpl> THPStorage_(readFileRaw)(
io file, c10::intrusive_ptr<c10::StorageImpl> storage, uint64_t element_size)
{
c10::OptionalDeviceGuard guard;
if (storage.defined()) {
guard.reset_device(storage->device());
}
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
scalar_t *data;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int64_t size;
doRead(file, &size, sizeof(int64_t));
int64_t nbytes = element_size * size;
if (torch::utils::THP_nativeByteOrder() ==
torch::utils::THPByteOrder::THP_BIG_ENDIAN) {
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int64_t nsize; // convert little endian storage to big endian cpu
nsize = nbytes;
torch::utils::THP_decodeInt64Buffer(
&nbytes, (const uint8_t*)&nsize, torch::utils::THP_nativeByteOrder(), 1);
}
if (!storage.defined()) {
storage = c10::make_intrusive<at::StorageImpl>(
c10::StorageImpl::use_byte_size_t(),
nbytes,
c10::GetDefaultCPUAllocator(),
/*resizable=*/true);
} else {
int64_t _storage_nbytes = storage->nbytes();
TORCH_CHECK(
_storage_nbytes == nbytes,
"storage has wrong byte size: expected %ld got %ld",
nbytes,
_storage_nbytes);
}
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
std::unique_ptr<char[]> cpu_data;
if (storage->device_type() == at::kCPU) {
data = storage->data<scalar_t>();
} else {
cpu_data = std::unique_ptr<char[]>(new char[nbytes]);
data = (scalar_t*)cpu_data.get();
}
// fast track for bytes and little endian
if (element_size == 1 ||
torch::utils::THP_nativeByteOrder() ==
torch::utils::THPByteOrder::THP_LITTLE_ENDIAN) {
doRead(file, data, storage->nbytes());
} else {
int64_t buffer_size = std::min(size, (int64_t)5000);
// NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
std::unique_ptr<uint8_t[]> le_buffer(new uint8_t[buffer_size * element_size]);
for (int64_t i = 0; i < size; i += buffer_size) {
size_t to_convert = std::min(size - i, buffer_size);
doRead(file, le_buffer.get(), element_size * to_convert);
// NOLINTNEXTLINE(bugprone-branch-clone)
if (element_size == 2) {
torch::utils::THP_decodeInt16Buffer(
(int16_t*)data + i,
le_buffer.get(),
torch::utils::THP_nativeByteOrder(),
to_convert);
} else if (element_size == 4) {
torch::utils::THP_decodeInt32Buffer(
(int32_t*)data + i,
le_buffer.get(),
torch::utils::THP_nativeByteOrder(),
to_convert);
} else if (element_size == 8) {
torch::utils::THP_decodeInt64Buffer(
(int64_t*)data + i,
le_buffer.get(),
torch::utils::THP_nativeByteOrder(),
to_convert);
}
}
}
#ifdef USE_CUDA
if (storage->device_type() == at::kCUDA) {
C10_CUDA_CHECK(cudaMemcpy(storage->data<scalar_t>(), data, nbytes, cudaMemcpyHostToDevice));
}
#endif
return storage;
}
template c10::intrusive_ptr<c10::StorageImpl> THPStorage_(readFileRaw<int>)(
int fd, c10::intrusive_ptr<c10::StorageImpl> storage, uint64_t element_size);
template c10::intrusive_ptr<c10::StorageImpl> THPStorage_(readFileRaw<PyObject*>)(
PyObject* fd, c10::intrusive_ptr<c10::StorageImpl> storage, uint64_t element_size);
#endif

View File

@ -1,12 +0,0 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "torch/csrc/generic/serialization.h"
#else
template <class io>
void THPStorage_(writeFileRaw)(c10::StorageImpl *self, io fd, bool save_size, uint64_t element_size);
template <class io>
c10::intrusive_ptr<c10::StorageImpl> THPStorage_(readFileRaw)(
io fd, c10::intrusive_ptr<c10::StorageImpl> storage, uint64_t element_size);
#endif

View File

@ -3,6 +3,7 @@
#include <torch/csrc/THP.h>
#include <torch/csrc/serialization.h>
#include <c10/core/CPUAllocator.h>
template <class io>
Py_ssize_t doPartialRead(io fildes, void* buf, size_t nbytes);
@ -167,6 +168,183 @@ void doWrite(io fildes, void* raw_buf, size_t nbytes) {
}
}
// NOLINTNEXTLINE(bugprone-suspicious-include)
#include <torch/csrc/generic/serialization.cpp>
#include <torch/csrc/THGenerateByteType.h>
// save_size is necessary since the old eager format saved storages as
// [size + data], but the v1.5 eager format removes this since size is saved in
// the filesize.
// Serializes the raw bytes of `self` to `fd`.
//
// Layout: when `save_size` is true, an int64 element count
// (nbytes / element_size) is written first, always little-endian;
// the payload bytes follow.  On big-endian hosts, multi-byte
// elements (element_size 2/4/8) are byte-swapped to little-endian
// in chunks of up to 5000 elements before writing.
//
// CUDA storages are staged into a temporary host buffer via
// cudaMemcpy; any other non-CPU device is rejected.
//
// `io` is the descriptor type understood by doWrite() — see the
// explicit instantiations for int (POSIX fd) and PyObject*
// (Python file-like object) below.
template <class io>
void THPStorage_writeFileRaw(c10::StorageImpl *self, io fd, bool save_size, uint64_t element_size)
{
c10::DeviceGuard guard(self->device());
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
uint8_t *data;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
std::unique_ptr<char[]> cpu_data;
int64_t size_bytes = self->nbytes();
int64_t numel = size_bytes / element_size;
if (self->device_type() == at::kCPU) {
data = self->data<uint8_t>();
#ifdef USE_CUDA
} else if (self->device_type() == at::kCUDA) {
// Copy device memory to a host-side staging buffer before writing.
cpu_data = std::unique_ptr<char[]>(new char[size_bytes]);
data = (uint8_t*)cpu_data.get();
C10_CUDA_CHECK(cudaMemcpy(
data,
self->data<uint8_t>(),
size_bytes,
cudaMemcpyDeviceToHost));
#endif
} else {
TORCH_CHECK(false, "writeFileRaw: Device not recognized: ", self->device_type());
}
if (save_size) {
if (torch::utils::THP_nativeByteOrder() ==
torch::utils::THPByteOrder::THP_LITTLE_ENDIAN)
doWrite(fd, &numel, sizeof(int64_t));
else {
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int64_t nsize; // convert big endian cpu to little endian storage
torch::utils::THP_encodeInt64Buffer(
(uint8_t*)&nsize,
(const int64_t*)&numel,
torch::utils::THPByteOrder::THP_LITTLE_ENDIAN,
1);
doWrite(fd, &nsize, sizeof(int64_t));
}
}
// fast track for bytes and little endian
if (element_size == 1 ||
torch::utils::THP_nativeByteOrder() ==
torch::utils::THPByteOrder::THP_LITTLE_ENDIAN) {
doWrite(fd, data, size_bytes);
} else {
// Big-endian host with multi-byte elements: byte-swap through a
// bounded scratch buffer and write chunk by chunk.
int64_t buffer_size = std::min(numel, (int64_t)5000);
// NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
std::unique_ptr<uint8_t[]> le_buffer(new uint8_t[buffer_size * element_size]);
for (int64_t i = 0; i < numel; i += buffer_size) {
size_t to_convert = std::min(numel - i, buffer_size);
// NOLINTNEXTLINE(bugprone-branch-clone)
if (element_size == 2) {
torch::utils::THP_encodeInt16Buffer(
(uint8_t*)le_buffer.get(),
(const int16_t*)data + i,
torch::utils::THPByteOrder::THP_LITTLE_ENDIAN,
to_convert);
} else if (element_size == 4) {
torch::utils::THP_encodeInt32Buffer(
(uint8_t*)le_buffer.get(),
(const int32_t*)data + i,
torch::utils::THPByteOrder::THP_LITTLE_ENDIAN,
to_convert);
} else if (element_size == 8) {
torch::utils::THP_encodeInt64Buffer(
(uint8_t*)le_buffer.get(),
(const int64_t*)data + i,
torch::utils::THPByteOrder::THP_LITTLE_ENDIAN,
to_convert);
}
doWrite(fd, le_buffer.get(), to_convert * element_size);
}
}
}
// Explicit instantiations: POSIX file descriptor and Python file object.
template void THPStorage_writeFileRaw<int>(c10::StorageImpl *self, int fd, bool save_size, uint64_t element_size);
template void THPStorage_writeFileRaw<PyObject*>(c10::StorageImpl *self, PyObject* fd, bool save_size, uint64_t element_size);
// Deserializes storage bytes from `file` (the inverse of
// THPStorage_writeFileRaw with save_size=true).
//
// Reads an int64 element count, computes nbytes = element_size * size,
// then reads the payload.  If `storage` is undefined, a new CPU
// storage of `nbytes` is allocated; otherwise its byte size must match
// exactly.  Non-CPU destination storages are filled through a host
// staging buffer and copied back with cudaMemcpy at the end.
//
// `io` is the descriptor type understood by doRead() — see the
// explicit instantiations for int and PyObject* below.
template <class io>
c10::intrusive_ptr<c10::StorageImpl> THPStorage_readFileRaw(
io file, c10::intrusive_ptr<c10::StorageImpl> storage, uint64_t element_size)
{
c10::OptionalDeviceGuard guard;
if (storage.defined()) {
guard.reset_device(storage->device());
}
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
uint8_t *data;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int64_t size;
doRead(file, &size, sizeof(int64_t));
int64_t nbytes = element_size * size;
if (torch::utils::THP_nativeByteOrder() ==
torch::utils::THPByteOrder::THP_BIG_ENDIAN) {
// NOTE(review): nbytes is computed from the raw (little-endian
// on disk) `size` and only then byte-swapped, i.e.
// swap(element_size * raw) rather than element_size * swap(raw).
// These agree only when element_size == 1 — confirm intended.
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int64_t nsize; // convert little endian storage to big endian cpu
nsize = nbytes;
torch::utils::THP_decodeInt64Buffer(
&nbytes, (const uint8_t*)&nsize, torch::utils::THP_nativeByteOrder(), 1);
}
if (!storage.defined()) {
storage = c10::make_intrusive<at::StorageImpl>(
c10::StorageImpl::use_byte_size_t(),
nbytes,
c10::GetDefaultCPUAllocator(),
/*resizable=*/true);
} else {
int64_t _storage_nbytes = storage->nbytes();
// NOTE(review): TORCH_CHECK concatenates its message arguments; the
// printf-style "%ld" placeholders here are emitted literally rather
// than substituted — consider rewording the message.
TORCH_CHECK(
_storage_nbytes == nbytes,
"storage has wrong byte size: expected %ld got %ld",
nbytes,
_storage_nbytes);
}
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
std::unique_ptr<char[]> cpu_data;
if (storage->device_type() == at::kCPU) {
data = storage->data<uint8_t>();
} else {
// Non-CPU destination: read into a host staging buffer first.
cpu_data = std::unique_ptr<char[]>(new char[nbytes]);
data = (uint8_t*)cpu_data.get();
}
// fast track for bytes and little endian
if (element_size == 1 ||
torch::utils::THP_nativeByteOrder() ==
torch::utils::THPByteOrder::THP_LITTLE_ENDIAN) {
doRead(file, data, storage->nbytes());
} else {
// Big-endian host with multi-byte elements: read little-endian
// chunks and byte-swap them in place, up to 5000 elements at a time.
int64_t buffer_size = std::min(size, (int64_t)5000);
// NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
std::unique_ptr<uint8_t[]> le_buffer(new uint8_t[buffer_size * element_size]);
for (int64_t i = 0; i < size; i += buffer_size) {
size_t to_convert = std::min(size - i, buffer_size);
doRead(file, le_buffer.get(), element_size * to_convert);
// NOLINTNEXTLINE(bugprone-branch-clone)
if (element_size == 2) {
torch::utils::THP_decodeInt16Buffer(
(int16_t*)data + i,
le_buffer.get(),
torch::utils::THP_nativeByteOrder(),
to_convert);
} else if (element_size == 4) {
torch::utils::THP_decodeInt32Buffer(
(int32_t*)data + i,
le_buffer.get(),
torch::utils::THP_nativeByteOrder(),
to_convert);
} else if (element_size == 8) {
torch::utils::THP_decodeInt64Buffer(
(int64_t*)data + i,
le_buffer.get(),
torch::utils::THP_nativeByteOrder(),
to_convert);
}
}
}
#ifdef USE_CUDA
if (storage->device_type() == at::kCUDA) {
C10_CUDA_CHECK(cudaMemcpy(storage->data<uint8_t>(), data, nbytes, cudaMemcpyHostToDevice));
}
#endif
return storage;
}
// Explicit instantiations: POSIX file descriptor and Python file object.
template c10::intrusive_ptr<c10::StorageImpl> THPStorage_readFileRaw<int>(
int fd, c10::intrusive_ptr<c10::StorageImpl> storage, uint64_t element_size);
template c10::intrusive_ptr<c10::StorageImpl> THPStorage_readFileRaw<PyObject*>(
PyObject* fd, c10::intrusive_ptr<c10::StorageImpl> storage, uint64_t element_size);

View File

@ -1,13 +1,17 @@
#ifndef THP_SERIALIZATION_INC
#define THP_SERIALIZATION_INC
#include <torch/csrc/generic/serialization.h>
#include <torch/csrc/THGenerateByteType.h>
template <class io>
void doRead(io fildes, void* buf, size_t nbytes);
template <class io>
void doWrite(io fildes, void* buf, size_t nbytes);
template <class io>
void THPStorage_writeFileRaw(c10::StorageImpl *self, io fd, bool save_size, uint64_t element_size);
template <class io>
c10::intrusive_ptr<c10::StorageImpl> THPStorage_readFileRaw(
io fd, c10::intrusive_ptr<c10::StorageImpl> storage, uint64_t element_size);
#endif