From 2e440e39a6cc99057b01051aa8ef565e08ac67d1 Mon Sep 17 00:00:00 2001
From: Shangdi Yu <shangdiy@meta.com>
Date: Wed, 14 May 2025 15:26:50 +0000
Subject: [PATCH] [nativert] Move Placement to pytorch core (#152953)

Summary:
Move Placement to pytorch core.

Using `torch::nativert::isSameDevice` explicitly in code to avoid confusion with the `isSameDevice` in torch namespace.

Test Plan:
```
buck run fbcode//mode/dev-nosan  //caffe2/test/cpp/nativert:placement_test

./bin/test_nativert
```

OSS and internal CI

Differential Revision: D74190745

Pull Request resolved: https://github.com/pytorch/pytorch/pull/152953
Approved by: https://github.com/Skylion007, https://github.com/swolchok, https://github.com/zhxchen17, https://github.com/cyyever
---
 build_variables.bzl                        |   2 +
 test/cpp/nativert/CMakeLists.txt           |   2 +
 test/cpp/nativert/test_placement.cpp       | 104 +++++++++++++++++++++
 torch/nativert/executor/Placement.cpp      |  61 ++++++++++++
 torch/nativert/executor/Placement.h        |  57 +++++++++++
 torch/nativert/executor/PlacementUtils.cpp |  37 ++++++++
 6 files changed, 263 insertions(+)
 create mode 100644 test/cpp/nativert/test_placement.cpp
 create mode 100644 torch/nativert/executor/Placement.cpp
 create mode 100644 torch/nativert/executor/Placement.h
 create mode 100644 torch/nativert/executor/PlacementUtils.cpp

diff --git a/build_variables.bzl b/build_variables.bzl
index 02d12c17000..7cac3da1210 100644
--- a/build_variables.bzl
+++ b/build_variables.bzl
@@ -590,6 +590,8 @@ libtorch_core_jit_sources = sorted(jit_sources_full)
 
 libtorch_nativert_sources = [
     "torch/nativert/graph/TensorMeta.cpp",
+    "torch/nativert/executor/Placement.cpp",
+    "torch/nativert/executor/PlacementUtils.cpp",
 ]
 
 torch_mobile_tracer_sources = [
diff --git a/test/cpp/nativert/CMakeLists.txt b/test/cpp/nativert/CMakeLists.txt
index 48c4ed7adf5..f702e996836 100644
--- a/test/cpp/nativert/CMakeLists.txt
+++ b/test/cpp/nativert/CMakeLists.txt
@@ -6,6 +6,7 @@ file(GLOB_RECURSE NATIVERT_ALL_TEST_FILES "${NATIVERT_TEST_ROOT}/test_*.cpp")
 set(NATIVERT_TEST_SRCS
   ${NATIVERT_ALL_TEST_FILES}
   ${TORCH_ROOT}/torch/nativert/graph/TensorMeta.cpp
+  ${TORCH_ROOT}/torch/nativert/executor/PlacementUtils.cpp
 )
 
 add_executable(test_nativert
@@ -19,6 +20,7 @@ target_compile_definitions(test_nativert PRIVATE USE_GTEST)
 set(NATIVERT_TEST_DEPENDENCIES torch gtest)
 
 target_link_libraries(test_nativert PRIVATE ${NATIVERT_TEST_DEPENDENCIES})
+target_link_libraries(test_nativert PRIVATE fmt::fmt-header-only)
 target_include_directories(test_nativert PRIVATE ${ATen_CPU_INCLUDE})
 
 if(USE_CUDA)
diff --git a/test/cpp/nativert/test_placement.cpp b/test/cpp/nativert/test_placement.cpp
new file mode 100644
index 00000000000..e88ae20e1de
--- /dev/null
+++ b/test/cpp/nativert/test_placement.cpp
@@ -0,0 +1,104 @@
+
+#include <c10/core/Device.h>
+#include <gtest/gtest.h>
+#include <unordered_map>
+
+#include <torch/nativert/executor/Placement.h>
+
+using namespace ::testing;
+
+namespace torch::nativert {
+TEST(PlacementTest, NormalizeDevice) {
+  // CPU devices normalize to the index-less CPU device.
+  const c10::Device plainCpu(c10::DeviceType::CPU);
+  c10::Device indexedCpu(c10::DeviceType::CPU);
+  indexedCpu.set_index(1);
+  EXPECT_EQ(normalizeDevice(plainCpu), plainCpu);
+  EXPECT_NE(normalizeDevice(indexedCpu), indexedCpu);
+
+  // CUDA devices keep an explicit index and default to index 0 otherwise.
+  const c10::Device bareCuda(c10::DeviceType::CUDA);
+  const c10::Device cuda0(c10::DeviceType::CUDA, 0);
+  const c10::Device cuda1(c10::DeviceType::CUDA, 1);
+
+  EXPECT_EQ(normalizeDevice(bareCuda), cuda0);
+  EXPECT_EQ(normalizeDevice(cuda1), cuda1);
+  EXPECT_NE(normalizeDevice(cuda1), cuda0);
+}
+
+TEST(PlacementTest, IsSameDevice) {
+  // Any two CPU devices compare as the same device, whatever their index.
+  const c10::Device plainCpu(c10::DeviceType::CPU);
+  c10::Device indexedCpu(c10::DeviceType::CPU);
+  indexedCpu.set_index(1);
+  EXPECT_TRUE(isSameDevice(plainCpu, plainCpu));
+  EXPECT_TRUE(isSameDevice(plainCpu, indexedCpu));
+
+  // CUDA devices are compared by index; a missing index counts as 0.
+  const c10::Device bareCuda(c10::DeviceType::CUDA);
+  const c10::Device cuda0(c10::DeviceType::CUDA, 0);
+  const c10::Device cuda1(c10::DeviceType::CUDA, 1);
+  EXPECT_TRUE(isSameDevice(bareCuda, cuda0));
+  EXPECT_FALSE(isSameDevice(cuda0, cuda1));
+  EXPECT_FALSE(isSameDevice(cuda0, plainCpu));
+}
+
+TEST(PlacementTest, PlacementDefaultOnly) {
+  const c10::Device cuda0(c10::DeviceType::CUDA, 0);
+  const c10::Device cuda1(c10::DeviceType::CUDA, 1);
+  const c10::Device cuda2(c10::DeviceType::CUDA, 2);
+  const Placement defaultOnly(cuda0);
+
+  // Without explicit mappings only the default entry is printed ...
+  std::ostringstream rendered;
+  rendered << defaultOnly;
+  EXPECT_EQ(rendered.str(), "|cuda:0");
+  // ... and every source device resolves to that default.
+  EXPECT_EQ(defaultOnly.getMappedDevice(cuda0), cuda0);
+  EXPECT_EQ(defaultOnly.getMappedDevice(cuda1), cuda0);
+  EXPECT_EQ(defaultOnly.getMappedDevice(cuda2), cuda0);
+}
+
+TEST(PlacementTest, PlacementBasic) {
+  const c10::Device cpu(c10::DeviceType::CPU);
+  const c10::Device cuda0(c10::DeviceType::CUDA, 0);
+  const c10::Device cuda1(c10::DeviceType::CUDA, 1);
+  const c10::Device cuda2(c10::DeviceType::CUDA, 2);
+  const c10::Device cuda3(c10::DeviceType::CUDA, 3);
+
+  // Explicit mappings for cpu, cuda:0 and cuda:1, with cuda:0 as the default.
+  const std::unordered_map<c10::Device, c10::Device> mappings = {
+      {cpu, cpu}, {cuda0, cuda1}, {cuda1, cuda2}};
+  const Placement placement(mappings, cuda0);
+
+  // Entries are printed sorted by source device, followed by the default.
+  std::ostringstream rendered;
+  rendered << placement;
+  EXPECT_EQ(rendered.str(), "cpu|cpu,cuda:0|cuda:1,cuda:1|cuda:2,|cuda:0");
+
+  // Mapped sources resolve to their entry; unmapped sources (cuda:2, cuda:3)
+  // fall back to the default device.
+  EXPECT_EQ(placement.getMappedDevice(cpu), cpu);
+  EXPECT_EQ(placement.getMappedDevice(cuda0), cuda1);
+  EXPECT_EQ(placement.getMappedDevice(cuda1), cuda2);
+  EXPECT_EQ(placement.getMappedDevice(cuda2), cuda0);
+  EXPECT_EQ(placement.getMappedDevice(cuda3), cuda0);
+}
+
+TEST(PlacementTest, Placement) {
+  using DeviceMap = std::unordered_map<c10::Device, c10::Device>;
+  const c10::Device cpu("cpu"), cuda0("cuda:0"), cuda1("cuda:1");
+  // No default device: an index-less "cuda" source matches the cuda:0 entry.
+  const Placement cudaRemap(DeviceMap{{cuda0, cuda1}});
+  EXPECT_EQ(cudaRemap.getMappedDevice(cpu), cpu);
+  EXPECT_EQ(cudaRemap.getMappedDevice(c10::Device("cuda")), cuda1);
+  EXPECT_EQ(cudaRemap.getMappedDevice(cuda0), cuda1);
+
+  // An index-less "cuda" target is stored as cuda:0.
+  const Placement cpuToCuda(DeviceMap{{cpu, c10::Device("cuda")}});
+  EXPECT_EQ(cpuToCuda.getMappedDevice(cpu), cuda0);
+  EXPECT_EQ(cpuToCuda.getMappedDevice(cuda0), cuda0);
+  EXPECT_EQ(cpuToCuda.getMappedDevice(cuda1), cuda1);
+}
+
+} // namespace torch::nativert
diff --git a/torch/nativert/executor/Placement.cpp b/torch/nativert/executor/Placement.cpp
new file mode 100644
index 00000000000..be8b6e6df96
--- /dev/null
+++ b/torch/nativert/executor/Placement.cpp
@@ -0,0 +1,61 @@
+#include <torch/nativert/executor/Placement.h>
+
+#include <fmt/ostream.h>
+#include <ostream>
+
+namespace torch::nativert {
+
+// Writes `placement` as comma-separated "src|dst" entries ordered by the source
+// device string, then the default device (if any) as a trailing "|dst" entry.
+std::ostream& operator<<(std::ostream& os, const Placement& placement) {
+  std::vector<std::pair<std::string, std::string>> entries;
+  entries.reserve(placement.deviceMap_.size());
+  for (const auto& [src, dst] : placement.deviceMap_) {
+    entries.emplace_back(src.str(), dst.str());
+  }
+  // Hash-map iteration order is unspecified; sort so the output is stable.
+  std::sort(
+      entries.begin(), entries.end(), [](const auto& lhs, const auto& rhs) {
+        return lhs.first < rhs.first;
+      });
+  bool comma = false;
+  for (const auto& [srcStr, dstStr] : entries) {
+    if (comma) {
+      fmt::print(os, ",");
+    }
+    comma = true;
+    fmt::print(os, "{}|{}", srcStr, dstStr);
+  }
+  if (placement.defaultDevice_.has_value()) {
+    fmt::print(os, "{}|{}", comma ? "," : "", placement.defaultDevice_->str());
+  }
+  return os;
+}
+
+Placement::Placement(std::optional<c10::Device> defaultDevice)
+    : Placement({}, defaultDevice) {}
+
+// Normalizes every entry; keys that collide after normalization are skipped.
+Placement::Placement(
+    const std::unordered_map<c10::Device, c10::Device>& deviceMap,
+    std::optional<c10::Device> defaultDevice) {
+  for (const auto& [src, dst] : deviceMap) {
+    deviceMap_.try_emplace(normalizeDevice(src), normalizeDevice(dst));
+  }
+  if (defaultDevice) {
+    defaultDevice_ = normalizeDevice(*defaultDevice);
+  }
+}
+
+// Resolves srcDevice: an explicit mapping wins, then the default device.
+// Throws via normalizeDevice() for device types other than CPU/CUDA.
+c10::Device Placement::getMappedDevice(const c10::Device& srcDevice) const {
+  const auto match = deviceMap_.find(normalizeDevice(srcDevice));
+  if (match != deviceMap_.end()) {
+    return match->second;
+  }
+  // No entry: use the default device if set, else srcDevice as given.
+  return defaultDevice_.value_or(srcDevice);
+}
+
+} // namespace torch::nativert
diff --git a/torch/nativert/executor/Placement.h b/torch/nativert/executor/Placement.h
new file mode 100644
index 00000000000..9f9a2c627d2
--- /dev/null
+++ b/torch/nativert/executor/Placement.h
@@ -0,0 +1,57 @@
+#pragma once
+
+#include <c10/core/Device.h>
+#include <c10/util/Logging.h>
+
+#include <optional>
+#include <unordered_map>
+
+namespace torch::nativert {
+
+/**
+ * This function returns a normalized version of the input device:
+ * - For CPU devices, the returned device will have no index (i.e., the default
+ *   CPU device).
+ * - For CUDA devices, an explicit index is kept; if no index is specified,
+ *   index 0 is assumed.
+ * - For other device types, the function will raise an error.
+ *
+ * @param device The input c10::Device to normalize.
+ * @return A normalized c10::Device with standardized indexing.
+ *
+ * @throws c10::Error If the device type is not CPU or CUDA.
+ */
+c10::Device normalizeDevice(const c10::Device& device);
+
+/**
+ * Returns true if both devices are CPU, or both are CUDA with the same device
+ * index (a missing index counts as 0). Throws if device1 is not CPU or CUDA.
+ */
+bool isSameDevice(const c10::Device& device1, const c10::Device& device2);
+
+/**
+ * @brief A utility class for managing device placement mappings.
+ *
+ * Maps a source device (as recorded in the model artifact) to the device it
+ * should run on in NativeRT. Lookup order in getMappedDevice(): an explicit
+ * per-device mapping, then the default device, then the source device itself.
+ * All stored devices are normalized with normalizeDevice().
+ */
+struct TORCH_API Placement {
+  Placement() = default;
+  explicit Placement(std::optional<c10::Device> defaultDevice);
+  explicit Placement(
+      const std::unordered_map<c10::Device, c10::Device>& deviceMap,
+      std::optional<c10::Device> defaultDevice = std::nullopt);
+  c10::Device getMappedDevice(const c10::Device& srcDevice) const;
+
+  TORCH_API friend std::ostream& operator<<(
+      std::ostream& os,
+      const Placement& obj);
+
+ protected:
+  std::unordered_map<c10::Device, c10::Device> deviceMap_;
+  std::optional<c10::Device> defaultDevice_;
+};
+
+} // namespace torch::nativert
diff --git a/torch/nativert/executor/PlacementUtils.cpp b/torch/nativert/executor/PlacementUtils.cpp
new file mode 100644
index 00000000000..988c9997ed0
--- /dev/null
+++ b/torch/nativert/executor/PlacementUtils.cpp
@@ -0,0 +1,37 @@
+#include <torch/nativert/executor/Placement.h>
+
+#include <fmt/ostream.h>
+
+namespace torch::nativert {
+
+c10::Device normalizeDevice(const c10::Device& device) {
+  // CPU has no meaningful index: always return the index-less CPU device.
+  if (device.is_cpu()) {
+    return c10::Device(c10::DeviceType::CPU);
+  }
+  // CUDA must carry an index: keep an explicit one, otherwise assume 0.
+  if (device.is_cuda()) {
+    const c10::DeviceIndex index =
+        device.has_index() ? device.index() : static_cast<c10::DeviceIndex>(0);
+    return c10::Device(c10::DeviceType::CUDA, index);
+  }
+  TORCH_CHECK(false, "Unsupported device type: ", device);
+}
+
+// Returns true when `a` and `b` are both CPU, or both CUDA with equal indices
+// (a missing CUDA index is treated as 0). A CUDA `a` never matches a non-CUDA
+// `b`. Throws c10::Error when `a` is neither CPU nor CUDA.
+bool isSameDevice(const c10::Device& a, const c10::Device& b) {
+  if (a.is_cpu()) {
+    return b.is_cpu();
+  }
+  if (a.is_cuda()) {
+    const auto indexA = a.has_index() ? a.index() : 0;
+    const auto indexB = b.has_index() ? b.index() : 0;
+    return b.is_cuda() && indexA == indexB;
+  }
+  // `a` is an unsupported device type.
+  TORCH_CHECK(false, "Unsupported device type: ", a, " and ", b);
+  return false;
+}
+} // namespace torch::nativert