[aoti][mps] mps constants support (#154287)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/154287 Approved by: https://github.com/malfet ghstack dependencies: #155752
2025-12-06 12:20:52 +01:00 · 2025-06-12 11:51:51 -07:00 · 2025-06-12 11:51:51 -07:00 · a4ab392251
commit a4ab392251
parent 8821a9dc4e
13 changed files with 126 additions and 4 deletions
--- a/aten/src/ATen/detail/MPSHooksInterface.h
+++ b/aten/src/ATen/detail/MPSHooksInterface.h
@ -78,6 +78,9 @@ struct TORCH_API MPSHooksInterface : AcceleratorHooksInterface {
  // Default hook: hands this function's name to FAIL_MPSHOOKS_FUNC.
  // NOTE(review): presumably raises because no MPS backend is loaded --
  // confirm against the macro definition, which is outside this hunk.
  virtual uint32_t acquireEvent(bool enable_timing) const {
    FAIL_MPSHOOKS_FUNC(__func__);
  }
+  // Interface-level default for pointer-to-device lookup. It never returns:
+  // the check below fails unconditionally with the message shown. The
+  // pointer argument is intentionally unused.
+  Device getDeviceFromPtr(void* /*data*/) const override {
+    TORCH_CHECK(
+        false,
+        "Cannot get device of pointer on MPS without ATen_mps library. ");
+  }
  // Default hook: hands this function's name to FAIL_MPSHOOKS_FUNC.
  // NOTE(review): presumably raises because no MPS backend is loaded --
  // confirm against the macro definition, which is outside this hunk.
  virtual void releaseEvent(uint32_t event_id) const {
    FAIL_MPSHOOKS_FUNC(__func__);
  }
--- a/aten/src/ATen/mps/MPSDevice.h
+++ b/aten/src/ATen/mps/MPSDevice.h
@ -1,6 +1,7 @@
 //  Copyright © 2022 Apple Inc.

 #pragma once
+#include <ATen/Device.h>
 #include <c10/core/Allocator.h>
 #include <c10/macros/Macros.h>
 #include <c10/util/Exception.h>
@ -70,4 +71,8 @@ TORCH_API bool is_available();
 TORCH_API bool is_macos_13_or_newer(MacOSVersion version);
 TORCH_API at::Allocator* GetMPSAllocator(bool useSharedAllocator = false);

+// Resolves a pointer to the device it belongs to. The pointer value is not
+// inspected: the answer is always the MPS device with index 0.
+inline Device getDeviceFromPtr(void* /*ptr*/) {
+  return Device(c10::DeviceType::MPS, 0);
+}
+
 } // namespace at::mps
--- a/aten/src/ATen/mps/MPSHooks.h
+++ b/aten/src/ATen/mps/MPSHooks.h
@ -18,6 +18,8 @@ struct MPSHooks : public at::MPSHooksInterface {
  bool hasMPS() const override;
  bool isOnMacOSorNewer(unsigned major, unsigned minor) const override;

+  Device getDeviceFromPtr(void* data) const override;
+
  // MPSGeneratorImpl interface
  const Generator& getDefaultGenerator(
      DeviceIndex device_index = -1) const override;
--- a/aten/src/ATen/mps/MPSHooks.mm
+++ b/aten/src/ATen/mps/MPSHooks.mm
@ -129,6 +129,10 @@ void MPSHooks::recordEvent(uint32_t event_id) const {
  at::mps::getMPSEventPool()->recordEvent(event_id, /* syncEvent*/ true);
 }

+// Hook implementation: defers to the free function
+// at::mps::getDeviceFromPtr and returns its result unchanged.
+Device MPSHooks::getDeviceFromPtr(void* data) const {
+  const Device resolved = at::mps::getDeviceFromPtr(data);
+  return resolved;
+}
+
 // Forwards to waitForEvent on the pool returned by
 // at::mps::getMPSEventPool(), passing `event_id` and syncEvent = true.
 void MPSHooks::waitForEvent(uint32_t event_id) const {
  at::mps::getMPSEventPool()->waitForEvent(event_id, /* syncEvent*/ true);
 }
--- a/caffe2/CMakeLists.txt
+++ b/caffe2/CMakeLists.txt
@ -709,6 +709,7 @@ list(APPEND Caffe2_CPU_SRCS ${TORCH_SRCS})
 if(USE_MPS)
  list(APPEND Caffe2_CPU_SRCS ${Caffe2_MPS_SRCS})
  list(APPEND Caffe2_CPU_SRCS ${TORCH_SRC_DIR}/csrc/inductor/aoti_torch/shim_mps.cpp)
+  list(APPEND Caffe2_CPU_SRCS ${TORCH_SRC_DIR}/csrc/inductor/aoti_torch/shim_mps.mm)
  list(APPEND Caffe2_CPU_SRCS ${TORCH_SRC_DIR}/csrc/inductor/aoti_runner/model_container_runner_mps.cpp)
  if(CAN_COMPILE_METAL)
      file(TOUCH ${CMAKE_BINARY_DIR}/aten/src/ATen/metallib_dummy.cpp)
--- a/test/inductor/test_mps_basic.py
+++ b/test/inductor/test_mps_basic.py
@ -223,6 +223,20 @@ class MPSBasicTestsAOTI(TestCase):
        m = M().to("mps")
        self.check_model(m, inp)

+    def test_two_const(self):
+        class Model(torch.nn.Module):
+            def __init__(self) -> None:
+                super().__init__()
+                self.y = torch.ones(3, 3, device="mps")
+                self.z = torch.full((3, 3), 2, device="mps")
+
+            def forward(self, x):
+                return x + self.y + self.z
+
+        inp = (torch.ones(3, 3, device="mps"),)
+        m = Model().to(device="mps")
+        self.check_model(m, inp)
+

 if __name__ == "__main__":
    from torch._dynamo.test_case import run_tests
--- a/torch/_inductor/cpp_builder.py
+++ b/torch/_inductor/cpp_builder.py
@ -1313,6 +1313,9 @@ def get_cpp_torch_device_options(
                "in https://github.com/pytorch/pytorch?tab=readme-ov-file#intel-gpu-support."
            )

+    if device_type == "mps":
+        definitions.append(" USE_MPS")
+
    if config.is_fbcode():
        include_dirs.append(build_paths.sdk_include)

--- a/torch/csrc/inductor/aoti_runtime/model.h
+++ b/torch/csrc/inductor/aoti_runtime/model.h
@ -15,11 +15,14 @@
 // C ABI defined in torch/csrc/inductor/aoti_torch/c/shim.h. The same rule
 // applies to other files under torch/csrc/inductor/aoti_runtime/.
 #include <torch/csrc/inductor/aoti_runtime/device_utils.h>
+#ifdef USE_MPS
+#include <torch/csrc/inductor/aoti_torch/c/shim_mps.h>
+#endif // USE_MPS
 #ifdef USE_XPU
 #include <torch/csrc/inductor/aoti_runtime/utils_xpu.h>
 #else
 #include <torch/csrc/inductor/aoti_runtime/utils.h>
-#endif
+#endif // USE_XPU
 #include <torch/csrc/inductor/aoti_runtime/constant_type.h>

 #define AOTI_RUNTIME_CHECK(EXPR, MSG) \
@ -74,6 +77,15 @@ RAIIDataPtr RAII_gpuMalloc(size_t num_bytes) {
  return RAIIDataPtr(data_ptr, deleter);
 }

+#elif defined(USE_MPS)
+
+// MPS variant of RAII_gpuMalloc: requests `num_bytes` from the AOTI MPS shim
+// and wraps the returned handle in a RAIIDataPtr whose deleter gives it back
+// through aoti_torch_mps_free.
+//
+// The shim's status code is checked with AOTI_RUNTIME_CHECK so that a failed
+// allocation is reported here, rather than returning a null handle that is
+// only discovered when the constant blob is first written.
+RAIIDataPtr RAII_gpuMalloc(size_t num_bytes) {
+  void* data_ptr = nullptr;
+  AOTI_RUNTIME_CHECK(
+      aoti_torch_mps_malloc(&data_ptr, num_bytes) == AOTI_TORCH_SUCCESS,
+      "aoti_torch_mps_malloc failed");
+  auto deleter = [](void* ptr) { aoti_torch_mps_free(ptr); };
+  return RAIIDataPtr(data_ptr, deleter);
+}
+
 #else

 RAIIDataPtr RAII_cpuMalloc(size_t num_bytes) {
@ -113,7 +125,7 @@ inline void parse_device_str(
  } else if (sm[1].str() == "xpu") {
    device_type = aoti_torch_device_type_xpu();
 #endif
-#ifdef __APPLE__
+#ifdef USE_MPS
  } else if (sm[1].str() == "mps") {
    device_type = aoti_torch_device_type_mps();
 #endif
@ -165,6 +177,11 @@ class AOTInductorModelBase {
      aoti_torch_set_current_xpu_device(device_idx_);
    }
 #endif // USE_XPU
+#ifdef USE_MPS
+    if (device_idx_ == -1) {
+      device_idx_ = 0;
+    }
+#endif // USE_MPS
  }

  // NOLINTNEXTLINE(modernize-use-equals-default)
@ -299,7 +316,7 @@ class AOTInductorModelBase {
    if (!include_weights) {
      return;
    }
-#if defined(USE_CUDA) || defined(USE_XPU)
+#if defined(USE_CUDA) || defined(USE_XPU) || defined(USE_MPS)
    constant_blob_ = RAII_gpuMalloc(blob_size);
 #else
    constant_blob_ = RAII_cpuMalloc(blob_size);
@ -327,7 +344,12 @@ class AOTInductorModelBase {
      auto ndim = this->constant_ndim(i);
      auto size = this->constant_shape(i);
      auto stride = this->constant_stride(i);
+#ifdef USE_MPS
+      auto offset = this->constant_offset(i) +
+          (constants_internal_offset[i] / aoti_torch_dtype_element_size(dtype));
+#else
      auto offset = this->constant_offset(i);
+#endif
      auto layout = this->constant_layout(i);
      auto opaque_metadata_ptr = this->opaque_metadata(i);
      auto opaque_metadata_size = this->opaque_metadata_size(i);
@ -390,6 +412,14 @@ class AOTInductorModelBase {
          _get_constants_start() + bytes_read,
          data_size,
          cudaMemcpyHostToDevice));
+#elif USE_MPS
+      aoti_torch_mps_memcpy(
+          constants_ptr,
+          constant_offset,
+          bytes_read,
+          data_size,
+          _get_constants_start());
+      return constants_ptr;
 #else
      memcpy(internal_ptr, _get_constants_start() + bytes_read, data_size);
 #endif
--- a/torch/csrc/inductor/aoti_runtime/model_container.h
+++ b/torch/csrc/inductor/aoti_runtime/model_container.h
@ -666,7 +666,7 @@ class AOTInductorModelContainer {
  std::shared_mutex model_exec_mutex_;

  RAIIDataPtr allocate_constant_blob() {
-#if defined(USE_CUDA) || defined(USE_XPU)
+#if defined(USE_CUDA) || defined(USE_XPU) || defined(USE_MPS)
    return RAII_gpuMalloc(blob_size_);
 #else
    return RAII_cpuMalloc(blob_size_);
--- a/torch/csrc/inductor/aoti_torch/c/shim.h
+++ b/torch/csrc/inductor/aoti_torch/c/shim.h
@ -129,6 +129,7 @@ AOTI_TORCH_EXPORT int32_t aoti_torch_dtype_bool();
 AOTI_TORCH_EXPORT int32_t aoti_torch_dtype_complex32();
 AOTI_TORCH_EXPORT int32_t aoti_torch_dtype_complex64();
 AOTI_TORCH_EXPORT int32_t aoti_torch_dtype_complex128();
+AOTI_TORCH_EXPORT size_t aoti_torch_dtype_element_size(int32_t dtype);

 AOTI_TORCH_EXPORT int32_t aoti_torch_layout_strided();
 AOTI_TORCH_EXPORT int32_t aoti_torch_layout_sparse_coo();
--- a/torch/csrc/inductor/aoti_torch/c/shim_mps.h
+++ b/torch/csrc/inductor/aoti_torch/c/shim_mps.h
@ -15,6 +15,18 @@ AOTI_TORCH_EXPORT AOTITorchError aoti_torch_mps_set_arg(
    unsigned idx,
    AtenTensorHandle tensor);

+AOTI_TORCH_EXPORT AOTITorchError
+aoti_torch_mps_malloc(void** buffer, size_t num_bytes);
+
+AOTI_TORCH_EXPORT AOTITorchError aoti_torch_mps_free(void* ptr);
+
+AOTI_TORCH_EXPORT AOTITorchError aoti_torch_mps_memcpy(
+    void* buffer,
+    size_t constant_offset,
+    size_t bytes_read,
+    size_t data_size,
+    uint8_t* constants_start);
+
 #ifdef __cplusplus
 } // extern "C"
 #endif
--- a/torch/csrc/inductor/aoti_torch/shim_common.cpp
+++ b/torch/csrc/inductor/aoti_torch/shim_common.cpp
@ -253,6 +253,11 @@ void aoti_torch_grad_mode_set_enabled(bool enabled) {
  return c10::GradMode::set_enabled(enabled);
 }

+// Converts the integer dtype code to at::ScalarType and returns whatever
+// c10::elementSize reports for it.
+size_t aoti_torch_dtype_element_size(int32_t dtype) {
+  return c10::elementSize(static_cast<at::ScalarType>(dtype));
+}
+
 AOTITorchError aoti_torch_delete_tensor_object(AtenTensorHandle tensor) {
  AOTI_TORCH_CONVERT_EXCEPTION_TO_ERROR_CODE({
    at::Tensor* t = tensor_handle_to_tensor_pointer(tensor);
--- a/torch/csrc/inductor/aoti_torch/shim_mps.mm
+++ b/torch/csrc/inductor/aoti_torch/shim_mps.mm
@ -0,0 +1,42 @@
+#include <ATen/native/mps/MetalShaderLibrary.h>
+#include <torch/csrc/inductor/aoti_torch/c/shim_mps.h>
+#include <torch/csrc/inductor/aoti_torch/utils.h>
+#include <ATen/mps/MPSAllocatorInterface.h>
+#include <ATen/mps/MPSDevice.h>
+
+
+using namespace torch::aot_inductor;
+
+// Allocates a Metal buffer of `num_bytes` (shared storage, write-combined CPU
+// cache mode) on the MPS device and stores the buffer handle -- the
+// id<MTLBuffer> itself, not its contents pointer -- in `*buffer`.
+//
+// A request for zero bytes succeeds and stores nullptr. On any failure
+// `*buffer` is left as nullptr and an error code is returned. The handle is
+// released by aoti_torch_mps_free.
+AOTITorchError aoti_torch_mps_malloc(
+    void** buffer,
+    size_t num_bytes) {
+  AOTI_TORCH_CONVERT_EXCEPTION_TO_ERROR_CODE({
+      // Validate the out-parameter inside the wrapper so a bad argument is
+      // reported as an error code instead of a null dereference.
+      TORCH_CHECK(buffer != nullptr, "aoti_torch_mps_malloc: buffer must not be null");
+      // Give the caller a defined value even if a check below fails.
+      *buffer = nullptr;
+      if (num_bytes > 0) {
+        id<MTLDevice> device = at::mps::MPSDevice::getInstance()->device();
+        TORCH_CHECK(device, "Failed to get MPS device");
+        id<MTLBuffer> metal_buffer = [device newBufferWithLength:num_bytes options:MTLResourceCPUCacheModeWriteCombined | MTLResourceStorageModeShared];
+        TORCH_CHECK(metal_buffer, "Failed to allocate memory on MPS device");
+        *buffer = (void*)metal_buffer;
+      }
+  });
+}
+
+// Releases a buffer handle: `ptr` is reinterpreted as an id<MTLBuffer> and
+// sent `release`. Exceptions are turned into an error code by the wrapper.
+AOTITorchError aoti_torch_mps_free(void* ptr) {
+  AOTI_TORCH_CONVERT_EXCEPTION_TO_ERROR_CODE({
+    id<MTLBuffer> buffer_handle = (id<MTLBuffer>)ptr;
+    [buffer_handle release];
+  });
+}
+
+
+// Copies `data_size` bytes from `constants_start + bytes_read` into the
+// contents of the Metal buffer `buffer`, starting at byte `constant_offset`.
+//
+// `buffer` is a handle as produced by aoti_torch_mps_malloc (an
+// id<MTLBuffer>), not a raw data pointer. A zero-length copy is a no-op. A
+// null handle, a null source, or a destination range that does not fit in
+// the buffer is reported as an error code instead of writing through a bad
+// address.
+AOTITorchError aoti_torch_mps_memcpy(
+    void* buffer,
+    size_t constant_offset,
+    size_t bytes_read,
+    size_t data_size,
+    uint8_t* constants_start) {
+  AOTI_TORCH_CONVERT_EXCEPTION_TO_ERROR_CODE({
+    if (data_size > 0) {
+      TORCH_CHECK(buffer != nullptr, "aoti_torch_mps_memcpy: buffer must not be null");
+      TORCH_CHECK(constants_start != nullptr, "aoti_torch_mps_memcpy: constants_start must not be null");
+      auto metal_buffer = (id<MTLBuffer>)buffer;
+      auto buffer_pointer = static_cast<uint8_t*>([metal_buffer contents]);
+      TORCH_CHECK(buffer_pointer != nullptr, "aoti_torch_mps_memcpy: buffer has no CPU-accessible contents");
+      const size_t buffer_length = [metal_buffer length];
+      // Written as two comparisons so the range test cannot overflow.
+      TORCH_CHECK(
+          constant_offset <= buffer_length && data_size <= buffer_length - constant_offset,
+          "aoti_torch_mps_memcpy: destination range exceeds buffer length");
+      memcpy(buffer_pointer + constant_offset, constants_start + bytes_read, data_size);
+    }
+  });
+}