[Static Runtime] Support recordio format input for benchmark (#67530)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/67530

Currently `ptvsc2_predictor_bench` uses only the first input of a given recordio file, even when the recordio file contains many inputs.

This change extends `StaticRuntime::benchmark` to accept multiple input entries, so that we can benchmark more realistically by exercising all the inputs in the recordio file rather than just the first one.

Test Plan:
Tested `ptvsc2_predictor_bench` with / without this change executing the following command:
```
MKL_NUM_THREADS=1 OMP_NUM_THREADS=1 numactl -m 0 -C 3 ./buck-out/opt/gen/caffe2/caffe2/fb/predictor/ptvsc2_predictor_bench --scripted_model=/home/djang/ads/adfinder/ctr_mobilefeed/302008423/302008423_0.predictor.disagg.local  --recordio_inputs=/home/djang/ads/adfinder/ctr_mobilefeed/302008423/302008423.local.inputs.recordio --pt_enable_static_runtime=1 --compare_results=0 --iters=1 --warmup_iters=1 --num_threads=1 --do_profile=1 --method_name=local.forward --set_compatibility --do_benchmark=1 --recordio_use_ivalue_format=1
```

Reviewed By: hlu1

Differential Revision: D31947382

fbshipit-source-id: 4188271613aad201f8cad5f566e0dfed26680968
This commit is contained in:
Don Jang 2021-10-29 14:36:54 -07:00 committed by Facebook GitHub Bot
parent 2cac92f470
commit ad89d994c9
3 changed files with 115 additions and 78 deletions

View File

@ -1000,18 +1000,26 @@ std::string generate_latency_json(const std::string& label, double millis) {
} // namespace
void StaticRuntime::benchmark(
const std::vector<c10::IValue>& args,
const std::unordered_map<std::string, c10::IValue>& kwargs,
const std::vector<std::vector<c10::IValue>>& args_list,
const std::vector<std::unordered_map<std::string, c10::IValue>>&
kwargs_list,
const int warmup_runs,
const int main_runs,
bool print_per_node_time,
bool generate_ai_pep_output) {
float time_per_iter = benchmark_model(args, kwargs, warmup_runs, main_runs);
TORCH_CHECK(
kwargs_list.size() == 0 || args_list.size() == kwargs_list.size());
std::cout << "Input size: " << args_list.size() << std::endl;
if (args_list.size() == 0) {
return;
}
float time_per_iter =
benchmark_model(args_list, kwargs_list, warmup_runs, main_runs);
std::cout << "Static runtime ms per iter: " << time_per_iter
<< ". Iters per second: " << 1000.0 / time_per_iter << std::endl;
IndividualMetrics results =
benchmark_individual_ops(args, kwargs, warmup_runs, main_runs);
benchmark_individual_ops(args_list, kwargs_list, warmup_runs, main_runs);
if (print_per_node_time) {
for (const auto i : c10::irange(nodes_.size())) {
@ -1089,28 +1097,39 @@ void StaticRuntime::benchmark(
check_for_memory_leak();
#ifndef NDEBUG
display_nodes(args, kwargs);
std::unordered_map<std::string, c10::IValue> empty_kwargs;
display_nodes(
args_list[0], kwargs_list.size() > 0 ? kwargs_list[0] : empty_kwargs);
#endif
}
float StaticRuntime::benchmark_model(
const std::vector<c10::IValue>& args,
const std::unordered_map<std::string, c10::IValue>& kwargs,
const std::vector<std::vector<c10::IValue>>& args_list,
const std::vector<std::unordered_map<std::string, c10::IValue>>&
kwargs_list,
const int warmup_runs,
const int main_runs) {
TORCH_CHECK(warmup_runs >= 0 && main_runs >= 1);
TORCH_CHECK(
kwargs_list.size() == 0 || args_list.size() == kwargs_list.size());
const bool is_kwargs_empty = kwargs_list.size() == 0;
const std::unordered_map<std::string, c10::IValue> empty_kwargs;
for (const auto i : c10::irange(warmup_runs)) {
(void)i; // Suppress unused variable warning
operator()(args, kwargs);
for (const auto j : c10::irange(args_list.size())) {
operator()(args_list[j], is_kwargs_empty ? empty_kwargs : kwargs_list[j]);
}
}
caffe2::Timer timer;
for (const auto i : c10::irange(main_runs)) {
(void)i; // Suppress unused variable warning
operator()(args, kwargs);
for (const auto j : c10::irange(args_list.size())) {
operator()(args_list[j], is_kwargs_empty ? empty_kwargs : kwargs_list[j]);
}
}
float millis = timer.MilliSeconds();
return millis / static_cast<float>(main_runs);
return millis / (static_cast<float>(main_runs) * args_list.size());
}
bool display_ivalue(const IValue& iv) {
@ -1190,11 +1209,20 @@ void StaticRuntime::display_nodes(
}
StaticRuntime::IndividualMetrics StaticRuntime::benchmark_individual_ops(
const std::vector<c10::IValue>& args,
const std::unordered_map<std::string, c10::IValue>& kwargs,
const std::vector<std::vector<c10::IValue>>& args_list,
const std::vector<std::unordered_map<std::string, c10::IValue>>&
kwargs_list,
const int warmup_runs,
const int main_runs) {
TORCH_CHECK(
kwargs_list.size() == 0 || args_list.size() == kwargs_list.size());
TORCH_CHECK(warmup_runs >= 1 && main_runs >= 1);
if (args_list.size() == 0) {
return {};
}
const bool is_kwargs_empty = kwargs_list.size() == 0;
const std::unordered_map<std::string, c10::IValue> empty_kwargs;
// See comment on above use of InferenceMode for
// explanation.
@ -1206,7 +1234,7 @@ StaticRuntime::IndividualMetrics StaticRuntime::benchmark_individual_ops(
// setup time
caffe2::Timer timer;
set_inputs(args, kwargs);
set_inputs(args_list[0], is_kwargs_empty ? empty_kwargs : kwargs_list[0]);
results.setup_time = timer.MilliSeconds();
@ -1214,74 +1242,80 @@ StaticRuntime::IndividualMetrics StaticRuntime::benchmark_individual_ops(
// initializes the memory planner with the profile information. Following
// iterations just use the already established memory planning.
timer.Start();
operator()(args, kwargs);
operator()(args_list[0], is_kwargs_empty ? empty_kwargs : kwargs_list[0]);
results.first_iter_time = timer.MilliSeconds();
// warmup runs
for (const auto i : c10::irange(warmup_runs - 1)) {
(void)i; // Suppress unused variable warning
operator()(args, kwargs);
for (const auto j : c10::irange(args_list.size())) {
operator()(args_list[j], is_kwargs_empty ? empty_kwargs : kwargs_list[j]);
}
}
// main runs
for (const auto k : c10::irange(main_runs)) {
(void)k; // Suppress unused variable warning
for (const auto i : c10::irange(main_runs)) {
(void)i; // Suppress unused variable warning
set_inputs(args, kwargs);
for (const auto j : c10::irange(args_list.size())) {
set_inputs(args_list[j], is_kwargs_empty ? empty_kwargs : kwargs_list[j]);
timer.Start();
if (planner_) {
planner_->allocate();
}
float millis = timer.MilliSeconds();
results.memory_alloc_time += millis;
for (const auto i : c10::irange(nodes_.size())) {
timer.Start();
nodes_[i].run();
millis = timer.MilliSeconds();
results.time_per_node[i] += millis;
}
timer.Start();
if (static_module_.opts().cleanup_activations) {
create_memory_planner();
planner_->deallocate();
// clean up owning refs of input tensors
clean_up_input_ivalues();
}
millis = timer.MilliSeconds();
results.memory_dealloc_time += millis;
timer.Start();
// no need to keep references of outputs in static runtime anymore
c10::IValue output;
if (static_module_.num_outputs() > 1) {
std::vector<c10::IValue> outputs;
outputs.reserve(static_module_.num_outputs());
for (const auto i : c10::irange(static_module_.num_outputs())) {
// use move here. Otherwise, clean up outputs_[i] explicitly
outputs.emplace_back(std::move(*outputs_[i]));
if (planner_) {
planner_->allocate();
}
float millis = timer.MilliSeconds();
results.memory_alloc_time += millis;
for (const auto k : c10::irange(nodes_.size())) {
timer.Start();
nodes_[k].run();
millis = timer.MilliSeconds();
results.time_per_node[k] += millis;
}
timer.Start();
if (static_module_.opts().cleanup_activations) {
create_memory_planner();
planner_->deallocate();
// clean up owning refs of input tensors
clean_up_input_ivalues();
}
millis = timer.MilliSeconds();
results.memory_dealloc_time += millis;
timer.Start();
// no need to keep references of outputs in static runtime anymore
c10::IValue output;
if (static_module_.num_outputs() > 1) {
std::vector<c10::IValue> outputs;
outputs.reserve(static_module_.num_outputs());
for (const auto k : c10::irange(static_module_.num_outputs())) {
// use move here. Otherwise, clean up outputs_[i] explicitly
outputs.emplace_back(std::move(*outputs_[k]));
}
output = c10::ivalue::Tuple::create(std::move(outputs));
}
output = c10::ivalue::Tuple::create(std::move(outputs));
}
#ifndef NDEBUG
check_for_memory_leak(false);
check_for_memory_leak(false);
#endif
// use move here. Otherwise, clean up outputs_[0] explicitly
output = std::move(*outputs_[0]);
// release outputs explicitly to measure the time it takes
output = IValue();
millis = timer.MilliSeconds();
results.output_dealloc_time += millis;
// use move here. Otherwise, clean up outputs_[0] explicitly
output = std::move(*outputs_[0]);
// release outputs explicitly to measure the time it takes
output = IValue();
millis = timer.MilliSeconds();
results.output_dealloc_time += millis;
}
}
// post processing
const float num_total_iters =
(static_cast<float>(main_runs) * args_list.size());
for (const auto i : c10::irange(nodes_.size())) {
const Node* node = nodes_[i].node();
std::string kind = std::string(node->kind().toQualString());
results.time_per_node[i] /= static_cast<float>(main_runs);
results.time_per_node[i] /= num_total_iters;
results.time_per_node_type[kind] += results.time_per_node[i];
results.instances_per_node_type[kind]++;
if (nodes_[i].has_out_variant()) {
@ -1293,9 +1327,9 @@ StaticRuntime::IndividualMetrics StaticRuntime::benchmark_individual_ops(
results.total_time += results.time_per_node[i];
}
results.total_nodes_count = nodes_.size();
results.memory_alloc_time /= static_cast<float>(main_runs);
results.memory_dealloc_time /= static_cast<float>(main_runs);
results.output_dealloc_time /= static_cast<float>(main_runs);
results.memory_alloc_time /= num_total_iters;
results.memory_dealloc_time /= num_total_iters;
results.output_dealloc_time /= num_total_iters;
for (const auto& p : results.time_per_node_type) {
const std::string& kind = p.first;
results.percent_per_node_type[kind] = p.second / results.total_time * 100;

View File

@ -277,24 +277,15 @@ class TORCH_API StaticRuntime {
std::vector<c10::IValue>&& args,
const std::unordered_map<std::string, c10::IValue>& kwargs);
void display_nodes(
const std::vector<c10::IValue>& args,
const std::unordered_map<std::string, c10::IValue>& kwargs);
void benchmark(
const std::vector<c10::IValue>& args,
const std::unordered_map<std::string, c10::IValue>& kwargs,
const std::vector<std::vector<c10::IValue>>& args_list,
const std::vector<std::unordered_map<std::string, c10::IValue>>&
kwargs_list,
const int warmup_runs,
const int main_runs,
bool print_per_node_time = false,
bool generate_ai_pep_output = false);
float benchmark_model(
const std::vector<c10::IValue>& args,
const std::unordered_map<std::string, c10::IValue>& kwargs,
const int warmup_runs,
const int main_runs);
struct IndividualMetrics {
float setup_time{0.0};
float memory_alloc_time{0.0};
@ -313,8 +304,9 @@ class TORCH_API StaticRuntime {
};
IndividualMetrics benchmark_individual_ops(
const std::vector<c10::IValue>& args,
const std::unordered_map<std::string, c10::IValue>& kwargs,
const std::vector<std::vector<c10::IValue>>& args_list,
const std::vector<std::unordered_map<std::string, c10::IValue>>&
kwargs_list,
const int warmup_runs,
const int main_runs);
@ -383,6 +375,17 @@ class TORCH_API StaticRuntime {
void create_memory_planner();
float benchmark_model(
const std::vector<std::vector<c10::IValue>>& args_list,
const std::vector<std::unordered_map<std::string, c10::IValue>>&
kwargs_list,
const int warmup_runs,
const int main_runs);
void display_nodes(
const std::vector<c10::IValue>& args,
const std::unordered_map<std::string, c10::IValue>& kwargs);
// Memory planning is only enabled if sm->opts().cleanup_activations is true.
// Otherwise, the memory used by activations is cached inside the static
// runtime.

View File

@ -68,7 +68,7 @@ void initStaticModuleBindings(PyObject* module) {
std::unordered_map<std::string, c10::IValue> kwarg_ivalues{
kwargs.begin(), kwargs.end()};
self.runtime().benchmark(
arg_ivalues, kwarg_ivalues, warmup_runs, main_runs);
{arg_ivalues}, {kwarg_ivalues}, warmup_runs, main_runs);
})
.def(
"benchmark_individual_ops",
@ -81,7 +81,7 @@ void initStaticModuleBindings(PyObject* module) {
std::unordered_map<std::string, c10::IValue> kwarg_ivalues{
kwargs.begin(), kwargs.end()};
return self.runtime().benchmark_individual_ops(
arg_ivalues, kwarg_ivalues, warmup_runs, main_runs);
{arg_ivalues}, {kwarg_ivalues}, warmup_runs, main_runs);
});
m.def(
"_jit_to_static_module",