From 31b64fc3e6e9574ad472b9093fb0db4902b0abc5 Mon Sep 17 00:00:00 2001
From: David Berard
Date: Tue, 8 Mar 2022 08:31:53 -0800
Subject: [PATCH] [JIT] log extract tool - dump NVFuser fallbacks instead of
 fusion groups (#73881)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/73881

NVFuser fusion groups can contain nvfuser-only ops, e.g.
`prim::reshape_copy`. Previously, we couldn't get a baseline performance
measurement because the nvfuser-only ops would error out on nnc- and
no-fusion- runs. Instead, dump the fallback graphs, after the fallbacks
are corrected into runnable fallbacks.

Test Plan: Imported from OSS

Reviewed By: eellison

Differential Revision: D34698307

Pulled By: davidberard98

fbshipit-source-id: c357b2736b789bfd347afe9c83a1b610b64881e0
(cherry picked from commit 5918d826502ff75fbc22d242844ae6435dd7d22a)
---
 scripts/jit/log_extract.py                  | 17 +++++++++++------
 torch/csrc/jit/codegen/cuda/graph_fuser.cpp | 14 ++++++++++----
 2 files changed, 21 insertions(+), 10 deletions(-)

diff --git a/scripts/jit/log_extract.py b/scripts/jit/log_extract.py
index de9f983745c..aad64c9b46e 100644
--- a/scripts/jit/log_extract.py
+++ b/scripts/jit/log_extract.py
@@ -2,7 +2,9 @@ from contextlib import contextmanager
 from torch.testing import make_tensor
 from typing import Any, List, Tuple
 import argparse
+import random
 import torch
+import traceback
 
 '''
 Usage:
@@ -52,9 +54,9 @@ def load_graph_and_inputs(ir: str) -> Tuple[Any, List[Any]]:
     inputs = []
     for inp in graph.inputs():
         if isinstance(inp.type(), torch._C.FloatType):
-            inputs.append(.5)
+            inputs.append(random.uniform(.1, 100))
         elif isinstance(inp.type(), torch._C.IntType):
-            inputs.append(2)
+            inputs.append(random.randint(1, 100))
         elif isinstance(inp.type(), torch._C.TensorType):
             inputs.append(make_tensor_from_type(inp.type()))
         else:
@@ -123,10 +125,13 @@ def run_nvfuser(ir, inputs) -> float:
 def test_nvfuser(graphs: List[str], baseline_fn, nvfuser_fn):
     for i, ir in enumerate(graphs):
         _, inputs = load_graph_and_inputs(ir)
-        baseline = baseline_fn(ir, inputs)
-        nvfuser = nvfuser_fn(ir, inputs)
-        improvement = (baseline / nvfuser - 1) * 100
-        print(f"    Graph {i}; baseline: {baseline:.2f} ms; nvfuser: {nvfuser:.2f} ms; improvement: {improvement:.2f}%")
+        try:
+            baseline = baseline_fn(ir, inputs)
+            nvfuser = nvfuser_fn(ir, inputs)
+            improvement = (baseline / nvfuser - 1) * 100
+            print(f"    Graph {i}; baseline: {baseline:.2f} ms; nvfuser: {nvfuser:.2f} ms; improvement: {improvement:.2f}%")
+        except RuntimeError:
+            print(f"    Graph {i} failed:", traceback.format_exc())
 
 
 def run():
diff --git a/torch/csrc/jit/codegen/cuda/graph_fuser.cpp b/torch/csrc/jit/codegen/cuda/graph_fuser.cpp
index dee3fa50fb4..f3e50ce06dc 100644
--- a/torch/csrc/jit/codegen/cuda/graph_fuser.cpp
+++ b/torch/csrc/jit/codegen/cuda/graph_fuser.cpp
@@ -1710,11 +1710,15 @@ void guardFusionGroups(
     //         c. restore conditional constant to non-constant for fallback
     guardFusionGroup(fusion, fusion_value_to_runtime_size);
   }
+}
 
-  if (GRAPH_DEBUG_ENABLED) {
-    GRAPH_DEBUG("Exporting all NVFuser fusions:");
-    for (Node* fusion : fusions) {
-      GRAPH_EXPORT("", fusion->g(attr::Subgraph));
+void dumpFusionGroups(std::shared_ptr<Graph>& g) {
+  DepthFirstGraphNodeIterator it(g);
+  Node* n = nullptr;
+  GRAPH_DEBUG("Exporting all NVFuser fusions:");
+  while ((n = it.next()) != nullptr) {
+    if (n->kind() == prim::FallbackGraph) {
+      GRAPH_EXPORT("", n->g(attr::Subgraph));
     }
   }
 }
@@ -2305,6 +2309,8 @@ void CudaFuseGraph(std::shared_ptr<Graph>& graph) {
   revertAliasCopyOps(graph, graph->block());
   GRAPH_DEBUG("revert alias_copy ops by nvfuser: ", *graph);
 
+  dumpFusionGroups(graph);
+
   // After FuseGraph some common subexpressions may come back
   EliminateCommonSubexpression(graph);
   // We might have emitted a fair amount of useless shape propagating code, so
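
For readers trying the updated tool, here is a minimal sketch of how one of
the dumped fallback graphs can be loaded and executed, mirroring what
load_graph_and_inputs() does in the script above. The IR string is a
hypothetical stand-in for what GRAPH_EXPORT emits; torch._C.parse_ir and
torch._C._create_function_from_graph are the same entry points the script
itself uses.

    import torch

    # Hypothetical fallback graph of the kind GRAPH_EXPORT now dumps;
    # real graphs are extracted from the PYTORCH_JIT_LOG_LEVEL log file.
    ir = """
    graph(%a : Tensor, %b : Tensor):
      %one : int = prim::Constant[value=1]()
      %out : Tensor = aten::add(%a, %b, %one)
      return (%out)
    """

    graph = torch._C.parse_ir(ir)
    # Same entry point the script uses to turn parsed IR into a callable.
    func = torch._C._create_function_from_graph("forward", graph)

    a, b = torch.randn(4, 4), torch.randn(4, 4)
    print(func(a, b))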