[JIT] log extract tool - dump NVFuser fallbacks instead of fusion groups (#73881)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/73881

NVFuser fusion groups can contain nvfuser-only ops, e.g. `prim::reshape_copy`. Previously, we couldn't get a baseline performance measurement because the nvfuser-only ops would error out on NNC and no-fusion runs. Instead, dump the fallback graphs, after the fallbacks have been corrected into runnable fallbacks.

Test Plan: Imported from OSS

Reviewed By: eellison

Differential Revision: D34698307

Pulled By: davidberard98

fbshipit-source-id: c357b2736b789bfd347afe9c83a1b610b64881e0
(cherry picked from commit 5918d826502ff75fbc22d242844ae6435dd7d22a)
2025-12-06 12:20:52 +01:00 · 2022-03-08 08:31:53 -08:00 · 2022-03-08 08:31:53 -08:00 · 31b64fc3e6
commit 31b64fc3e6
parent 56164c07c4
2 changed files with 21 additions and 10 deletions
--- a/scripts/jit/log_extract.py
+++ b/scripts/jit/log_extract.py
@@ -2,7 +2,9 @@ from contextlib import contextmanager
 from torch.testing import make_tensor
 from typing import Any, List, Tuple
 import argparse
+import random
 import torch
+import traceback

 '''
 Usage:
@@ -52,9 +54,9 @@ def load_graph_and_inputs(ir: str) -> Tuple[Any, List[Any]]:
    inputs = []
    for inp in graph.inputs():
        if isinstance(inp.type(), torch._C.FloatType):
-            inputs.append(.5)
+            inputs.append(random.uniform(.1, 100))
        elif isinstance(inp.type(), torch._C.IntType):
-            inputs.append(2)
+            inputs.append(random.randint(1, 100))
        elif isinstance(inp.type(), torch._C.TensorType):
            inputs.append(make_tensor_from_type(inp.type()))
        else:
@@ -123,10 +125,13 @@ def run_nvfuser(ir, inputs) -> float:
 def test_nvfuser(graphs: List[str], baseline_fn, nvfuser_fn):
    for i, ir in enumerate(graphs):
        _, inputs = load_graph_and_inputs(ir)
-        baseline = baseline_fn(ir, inputs)
-        nvfuser = nvfuser_fn(ir, inputs)
-        improvement = (baseline / nvfuser - 1) * 100
-        print(f"  Graph {i}; baseline: {baseline:.2f} ms; nvfuser: {nvfuser:.2f} ms; improvement: {improvement:.2f}%")
+        try:
+            baseline = baseline_fn(ir, inputs)
+            nvfuser = nvfuser_fn(ir, inputs)
+            improvement = (baseline / nvfuser - 1) * 100
+            print(f"  Graph {i}; baseline: {baseline:.2f} ms; nvfuser: {nvfuser:.2f} ms; improvement: {improvement:.2f}%")
+        except RuntimeError:
+            print(f"  Graph {i} failed:", traceback.format_exc())


 def run():
--- a/torch/csrc/jit/codegen/cuda/graph_fuser.cpp
+++ b/torch/csrc/jit/codegen/cuda/graph_fuser.cpp
@@ -1710,11 +1710,15 @@ void guardFusionGroups(
    //         c. restore conditional constant to non-constant for fallback
    guardFusionGroup(fusion, fusion_value_to_runtime_size);
  }
+}

-  if (GRAPH_DEBUG_ENABLED) {
-    GRAPH_DEBUG("Exporting all NVFuser fusions:");
-    for (Node* fusion : fusions) {
-      GRAPH_EXPORT("", fusion->g(attr::Subgraph));
+void dumpFusionGroups(std::shared_ptr<Graph>& g) {
+  DepthFirstGraphNodeIterator it(g);
+  Node* n = nullptr;
+  GRAPH_DEBUG("Exporting all NVFuser fusions:");
+  while ((n = it.next()) != nullptr) {
+    if (n->kind() == prim::FallbackGraph) {
+      GRAPH_EXPORT("", n->g(attr::Subgraph));
    }
  }
 }
@@ -2305,6 +2309,8 @@ void CudaFuseGraph(std::shared_ptr<Graph>& graph) {
  revertAliasCopyOps(graph, graph->block());
  GRAPH_DEBUG("revert alias_copy ops by nvfuser: ", *graph);

+  dumpFusionGroups(graph);
+
  // After FuseGraph some common subexpressions may come back
  EliminateCommonSubexpression(graph);
  // We might have emitted a fair amount of useless shape propagating code, so