From d98fa4a1033079ae19591aff5c3c354617bff482 Mon Sep 17 00:00:00 2001
From: Dylan Maloy <dmaloy@meta.com>
Date: Wed, 25 Jun 2025 22:43:40 +0000
Subject: [PATCH] implement SR's storage group planning algorithm (#156715)

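Summary: As the title says: adds DisjointStorageGroupsPlanner, an implementation of SR's storage group planning algorithm, as a new layout planner algorithm.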

Test Plan:
Tested on a local net. The memory plan it produces is roughly 15-20% larger than greedy-by-size (planned sizes below), but it is more performant.

local:
gbs (greedy-by-size): 110656b
dsg (disjoint storage groups): 131584b

local_ro:
gbs (greedy-by-size): 38208b
dsg (disjoint storage groups): 44544b

Differential Revision: D75653840

Pull Request resolved: https://github.com/pytorch/pytorch/pull/156715
Approved by: https://github.com/zhxchen17
---
 build_variables.bzl                           |   1 +
 test/cpp/nativert/CMakeLists.txt              |   1 +
 .../test_layout_planner_algorithm.cpp         |  23 +++
 .../executor/memory/DisjointStorageGroups.cpp | 175 ++++++++++++++++++
 .../executor/memory/DisjointStorageGroups.h   |  10 +
 .../executor/memory/LayoutPlannerSettings.h   |   1 +
 6 files changed, 211 insertions(+)
 create mode 100644 torch/nativert/executor/memory/DisjointStorageGroups.cpp
 create mode 100644 torch/nativert/executor/memory/DisjointStorageGroups.h
diff --git a/build_variables.bzl b/build_variables.bzl
index 788c7820cc6..da49ed05dad 100644
--- a/build_variables.bzl
+++ b/build_variables.bzl
@@ -617,6 +617,7 @@ libtorch_nativert_sources = [
     "torch/nativert/executor/ParallelGraphExecutor.cpp",
     "torch/nativert/kernels/CallTorchBindKernel.cpp",
     "torch/nativert/kernels/PrimKernelRegistry.cpp",
+    "torch/nativert/executor/memory/DisjointStorageGroups.cpp",
 ]
 
 torch_mobile_tracer_sources = [
diff --git a/test/cpp/nativert/CMakeLists.txt b/test/cpp/nativert/CMakeLists.txt
index 9f2ad858dfd..81ca5869774 100644
--- a/test/cpp/nativert/CMakeLists.txt
+++ b/test/cpp/nativert/CMakeLists.txt
@@ -20,6 +20,7 @@ set(NATIVERT_TEST_SRCS
   ${TORCH_ROOT}/torch/nativert/kernels/C10Kernel.cpp
   ${TORCH_ROOT}/torch/nativert/executor/memory/GreedyBySize.cpp
   ${TORCH_ROOT}/torch/nativert/executor/memory/Bump.cpp
+  ${TORCH_ROOT}/torch/nativert/executor/memory/DisjointStorageGroups.cpp
 )
 
 add_executable(test_nativert
diff --git a/test/cpp/nativert/test_layout_planner_algorithm.cpp b/test/cpp/nativert/test_layout_planner_algorithm.cpp
index 72b29db861e..0d4f8fb0d27 100644
--- a/test/cpp/nativert/test_layout_planner_algorithm.cpp
+++ b/test/cpp/nativert/test_layout_planner_algorithm.cpp
@@ -2,6 +2,7 @@
 #include <gtest/gtest.h>
 
 #include <torch/nativert/executor/memory/Bump.h>
+#include <torch/nativert/executor/memory/DisjointStorageGroups.h>
 #include <torch/nativert/executor/memory/GreedyBySize.h>
 
 using namespace ::testing;
@@ -61,3 +62,25 @@ TEST(LayoutPlannerAlgorithmTests, TestBump) {
 
   EXPECT_EQ(result.total_size, offset);
 }
+
+TEST(LayoutPlannerAlgorithmTests, TestStorageGroup) {
+  // Plan the very vector whose sizes are compared against the result below.
+  auto specs = create_test_allocation_specs();
+  auto result = DisjointStorageGroupsPlanner(specs);
+  const auto& allocations = result.allocations;
+  // Stop here rather than index out of range if the plan is short.
+  ASSERT_EQ(allocations.size(), specs.size());
+  EXPECT_EQ(allocations[0].offset, 0);
+  EXPECT_EQ(allocations[1].offset, 36);
+  EXPECT_EQ(allocations[2].offset, 0);
+  EXPECT_EQ(allocations[3].offset, 100);
+  EXPECT_EQ(allocations[4].offset, 140);
+  EXPECT_EQ(allocations[5].offset, 36);
+  EXPECT_EQ(allocations[6].offset, 140);
+  EXPECT_EQ(allocations[7].offset, 100);
+
+  for (auto&& [i, spec] : c10::enumerate(specs)) {
+    EXPECT_EQ(allocations[i].size, spec.size);
+  }
+  EXPECT_EQ(result.total_size, 150);
+}
diff --git a/torch/nativert/executor/memory/DisjointStorageGroups.cpp b/torch/nativert/executor/memory/DisjointStorageGroups.cpp
new file mode 100644
index 00000000000..3d9fac55a7c
--- /dev/null
+++ b/torch/nativert/executor/memory/DisjointStorageGroups.cpp
@@ -0,0 +1,178 @@
+#include <torch/nativert/executor/memory/DisjointStorageGroups.h>
+
+#include <algorithm>
+#include <list>
+#include <set>
+#include <vector>
+
+#include <c10/util/FbcodeMaps.h>
+#include <c10/util/Logging.h>
+#include <c10/util/irange.h>
+
+namespace {
+
+using namespace torch::nativert;
+
+// StorageGroup: a set of allocation specs that share one backing storage.
+// Tracks the largest member size and the lifetime its members span.
+class StorageGroup {
+ public:
+  // A group is seeded with its first spec, so it is never empty.
+  explicit StorageGroup(const AllocationSpec* spec)
+      : max_spec_size_{spec->size},
+        lifetime_{spec->lifetime},
+        spec_group_{spec} {}
+
+  // Adds `spec`, extends the lifetime to its end and marks the group busy.
+  void add_spec(const AllocationSpec* spec) {
+    // Debug-checked: each added spec must end later than the group does now.
+    TORCH_DCHECK_LT(lifetime_.end, spec->lifetime.end);
+    lifetime_.end = spec->lifetime.end;
+    max_spec_size_ = std::max(spec->size, max_spec_size_);
+    spec_group_.push_back(spec);
+    is_free_ = false;
+  }
+
+  const std::vector<const AllocationSpec*>& spec_group() const {
+    return spec_group_;
+  }
+
+  size_t num_specs() const {
+    return spec_group_.size();
+  }
+
+  size_t max_spec_size() const {
+    return max_spec_size_;
+  }
+
+  const AllocationLifetime& lifetime() const {
+    return lifetime_;
+  }
+
+  bool is_free() const {
+    return is_free_;
+  }
+
+  void set_free(bool is_free) {
+    is_free_ = is_free;
+  }
+
+ private:
+  // Whether the group is currently free to take on another spec.
+  bool is_free_{false};
+  // Size of the largest spec in the group, i.e. the memory the group needs.
+  size_t max_spec_size_;
+  // From the first spec's start to the end of the most recently added spec.
+  AllocationLifetime lifetime_;
+  // All specs assigned to this group, in the order they were added.
+  std::vector<const AllocationSpec*> spec_group_;
+};
+
+} // namespace
+
+namespace torch::nativert {
+
+// Plans memory by packing allocation specs into storage groups: every spec
+// assigned to a group is placed at that group's offset, and a group is as
+// large as its largest spec. Returns one Allocation per input spec, in input
+// order, together with the combined size of all groups.
+LayoutPlan DisjointStorageGroupsPlanner(
+    const std::vector<AllocationSpec>& allocation_specs) {
+  // Strict weak ordering that places larger specs first.
+  struct LargestSpecFirst {
+    bool operator()(const AllocationSpec* lhs, const AllocationSpec* rhs)
+        const {
+      return rhs->size < lhs->size;
+    }
+  };
+  using SpecsBySize = std::multiset<const AllocationSpec*, LargestSpecFirst>;
+  using SpecList = std::vector<const AllocationSpec*>;
+
+  // starting_at[i]: specs whose lifetime starts at index i, largest first.
+  // ending_at[i]: specs whose lifetime ends at index i, in input order.
+  std::vector<SpecsBySize> starting_at;
+  std::vector<SpecList> ending_at;
+
+  for (const AllocationSpec& spec : allocation_specs) {
+    const size_t alloc_index = spec.lifetime.start;
+    const size_t dealloc_index = spec.lifetime.end;
+    TORCH_DCHECK_LT(alloc_index, dealloc_index);
+
+    // Grow the buckets on demand so both indices are addressable.
+    if (starting_at.size() <= alloc_index) {
+      starting_at.resize(alloc_index + 1);
+    }
+    if (ending_at.size() <= dealloc_index) {
+      ending_at.resize(dealloc_index + 1);
+    }
+
+    starting_at[alloc_index].insert(&spec);
+    ending_at[dealloc_index].push_back(&spec);
+  }
+
+  // std::list never relocates its elements, so the raw StorageGroup pointers
+  // kept in the containers below stay valid while groups are appended.
+  std::list<StorageGroup> groups;
+  // The group each spec ended up in.
+  c10::FastMap<const AllocationSpec*, StorageGroup*> group_of;
+  // Groups whose last spec has ended; used as a stack (most recent first).
+  std::vector<StorageGroup*> reusable;
+
+  for (const auto index : c10::irange(starting_at.size())) {
+    // Place every spec starting here, reusing a free group when one exists.
+    for (const AllocationSpec* spec : starting_at[index]) {
+      TORCH_DCHECK_NOTNULL(spec);
+      StorageGroup* storage_group = nullptr;
+      if (reusable.empty()) {
+        storage_group = &groups.emplace_back(spec);
+      } else {
+        storage_group = reusable.back();
+        TORCH_DCHECK_NOTNULL(storage_group);
+        TORCH_DCHECK_EQ(storage_group->is_free(), true);
+        storage_group->add_spec(spec);
+        reusable.pop_back();
+      }
+      group_of.emplace(spec, storage_group);
+    }
+
+    // Releases are handled after placements, so a group released at this
+    // index can only be picked up by specs that start at a later index.
+    if (index >= ending_at.size()) {
+      continue;
+    }
+    for (const AllocationSpec* spec : ending_at[index]) {
+      TORCH_DCHECK_NOTNULL(spec);
+      StorageGroup* owner = group_of.at(spec);
+      // Only the spec that currently ends the group's lifetime releases it.
+      const bool ends_group = owner->lifetime().end == spec->lifetime.end;
+      if (ends_group && !owner->is_free()) {
+        owner->set_free(true);
+        reusable.push_back(owner);
+      }
+    }
+  }
+
+  // Lay the groups out back to back in creation order; each one gets room
+  // for its largest spec.
+  c10::FastMap<const StorageGroup*, size_t> offset_of;
+  size_t next_offset = 0;
+  for (const StorageGroup& group : groups) {
+    offset_of.emplace(&group, next_offset);
+    next_offset += group.max_spec_size();
+  }
+
+  LayoutPlan plan;
+  plan.total_size = next_offset;
+  plan.allocations.reserve(allocation_specs.size());
+
+  // Specs sharing a group are not expected to overlap in lifetime, so each
+  // spec is simply placed at its group's offset.
+  for (const AllocationSpec& spec : allocation_specs) {
+    const StorageGroup* group = group_of.at(&spec);
+    plan.allocations.emplace_back(Allocation{spec.size, offset_of.at(group)});
+  }
+
+  return plan;
+}
+
+} // namespace torch::nativert
diff --git a/torch/nativert/executor/memory/DisjointStorageGroups.h b/torch/nativert/executor/memory/DisjointStorageGroups.h
new file mode 100644
index 00000000000..8131a7000da
--- /dev/null
+++ b/torch/nativert/executor/memory/DisjointStorageGroups.h
@@ -0,0 +1,10 @@
+#pragma once
+
+#include <torch/nativert/executor/memory/LayoutPlannerAlgorithm.h>
+
+namespace torch::nativert {
+// Builds a LayoutPlan in which specs are assigned to shared storage groups.
+LayoutPlan DisjointStorageGroupsPlanner(
+    const std::vector<AllocationSpec>& allocation_specs);
+
+} // namespace torch::nativert
diff --git a/torch/nativert/executor/memory/LayoutPlannerSettings.h b/torch/nativert/executor/memory/LayoutPlannerSettings.h
index 2c6a75cfd86..8ade27997bd 100644
--- a/torch/nativert/executor/memory/LayoutPlannerSettings.h
+++ b/torch/nativert/executor/memory/LayoutPlannerSettings.h
@@ -7,6 +7,7 @@ namespace torch::nativert {
 enum class LayoutPlannerAlgorithmType {
   Bump,
   GreedyBySize,
+  DisjointStorageGroups,
 };
 
 class LayoutManagerSettings {