mirror of
https://github.com/zebrajr/pytorch.git
synced 2025-12-06 12:20:52 +01:00
https://github.com/pytorch/pytorch/issues/148222 Goal: At the moment autograd saved tensors hooks are run in eager after compiled forward. They are executed at the same time for all saved tensors. Hooks can be used to reduce amout of memory used for saved tensors, doing quantization or offloading to cpu. This is suboptimal for optimization of peak memory. Better solution will be to put the hooks in the graph, as close as possible to the last usage of the tensor. To get user specified autograd saved tensors hooks in the graph. Logic: UX: If user specifies with torch.autograd.graph.saved_tensors_hooks(pack_gm, unpack_gm). Where pack_gm and unpack_gm are torch.fx.GraphModule. Then AotAutograd will retrace those graph modules, doing decompositions and functionalization in aot_autograd, inlining the result graphs in forward epilogue and backward prologue. User may want to use control logic in the hooks, for example applying quantization only for specific dtypes and sizes. This is also possible, user can put it into torch.fx.wrap function and use symbolic trace to make a GraphModule. In that case AotAutograd cahing will work only in case when user explicitly set to the torch.fx.wrap call_function node "user_cache_hash" metadata. If this metadata set - then aot_autograd cache can use saved cache artifact. If metadata is not set - then cache is bypassed. Dynamo: Dynamo traces pack and unpack hooks and installs them as subgraph and explicitly adds to the output_graph. (As those subgraphs are not used and will not be copied in the result by default). The complexity here is that at this moment we do not have example of inputs for the hooks. We trace pack_hook with some Tensor from the inputs. The result subgraphs are added to the hashing of AotAutograd Cache. In AotAutograd we retrace the graph with the true saved tensors coming from partitioner. 
Backwards Compatibility: As current hooks are executed in eager mode and not all of them will be traceable - we only try to put in the graph hooks, explicitly marked by user with annotation (@_inlineable_saved_tensors_hooks). For other hooks or if compiled autograd is enabled - keep the same logic. Recompilations: Hooks are guarded with lambda guard matching function id to cause recompilation if user reruns compiled function. Aot_autograd: After partitioner prepared forward and backward module - we trace prepared at Dynamo graphs for pack and unpack hooks and inline them in epilogue of forward and prologue of backward. Forward outputs and backward inputs are changed, transparently for user. We do not try to put it close the last usage etc., relying on inductor to do this optimization. ``` INFO: TRACED GRAPH ===== Forward graph pre saved_tensors_hooks inlining 3 ===== /data/users/ivankobzarev/a/pytorch/torch/fx/_lazy_graph_module.py class GraphModule(torch.nn.Module): def forward(self, primals_1: "Sym(s0)", primals_2: "Sym(s1)", primals_3: "f32[s0, s1][s1, 1]cuda:0"): # File: /data/users/ivankobzarev/a/pytorch/test/functorch/test_aotdispatch.py:6660 in simple_fn, code: x = x + 1 add: "f32[s0, s1][s1, 1]cuda:0" = torch.ops.aten.add.Tensor(primals_3, 1); primals_3 = None # File: /data/users/ivankobzarev/a/pytorch/test/functorch/test_aotdispatch.py:6661 in simple_fn, code: x = SAF.apply(x) view: "f32[s0, s1][s1, 1]cuda:0" = torch.ops.aten.view.default(add, [primals_1, primals_2]) return (view, add, primals_1, primals_2) INFO: TRACED GRAPH ===== Backward graph pre saved_tensors_hooks inlining 3 ===== /data/users/ivankobzarev/a/pytorch/torch/fx/_lazy_graph_module.py class GraphModule(torch.nn.Module): def forward(self, primals_1: "Sym(s0)", primals_2: "Sym(s1)", primals_3: "f32[s0, s1][s1, 1]cuda:0"): # File: /data/users/ivankobzarev/a/pytorch/test/functorch/test_aotdispatch.py:6660 in simple_fn, code: x = x + 1 add: "f32[s0, s1][s1, 1]cuda:0" = 
torch.ops.aten.add.Tensor(primals_3, 1); primals_3 = None # File: /data/users/ivankobzarev/a/pytorch/test/functorch/test_aotdispatch.py:6661 in simple_fn, code: x = SAF.apply(x) view: "f32[s0, s1][s1, 1]cuda:0" = torch.ops.aten.view.default(add, [primals_1, primals_2]) return (view, add, primals_1, primals_2) INFO: TRACED GRAPH ===== saved_tensors_pack_hook add 3 ===== /data/users/ivankobzarev/a/pytorch/torch/fx/_lazy_graph_module.py class pack_float8(torch.nn.Module): def forward(self, x_1: "f32[s0, s1][s1, 1]cuda:0"): # No stacktrace found for following nodes _to_copy: "f8e4m3fn[s0, s1][s1, 1]cuda:0" = torch.ops.aten._to_copy.default(x_1, dtype = torch.float8_e4m3fn); x_1 = None return (torch.float32, _to_copy) INFO: TRACED GRAPH ===== saved_tensors_unpack_hook add 3 ===== <eval_with_key>.22 from /data/users/ivankobzarev/a/pytorch/torch/fx/experimental/proxy_tensor.py:1225 in wrapped class pack_float8(torch.nn.Module): def forward(self, x_1: "f32[s0, s1][s1, 1]cuda:0"): # No stacktrace found for following nodes _to_copy: "f8e4m3fn[s0, s1][s1, 1]cuda:0" = torch.ops.aten._to_copy.default(x_1, dtype = torch.float8_e4m3fn); x_1 = None return (torch.float32, _to_copy) INFO: TRACED GRAPH ===== Forward graph 3 ===== /data/users/ivankobzarev/a/pytorch/torch/fx/_lazy_graph_module.py class GraphModule(torch.nn.Module): def forward(self, primals_1: "Sym(s0)", primals_2: "Sym(s1)", primals_3: "f32[s0, s1][s1, 1]cuda:0"): # File: /data/users/ivankobzarev/a/pytorch/test/functorch/test_aotdispatch.py:6660 in simple_fn, code: x = x + 1 add: "f32[s0, s1][s1, 1]cuda:0" = torch.ops.aten.add.Tensor(primals_3, 1); primals_3 = None # No stacktrace found for following nodes _to_copy: "f8e4m3fn[s0, s1][s1, 1]cuda:0" = torch.ops.aten._to_copy.default(add, dtype = torch.float8_e4m3fn) # File: /data/users/ivankobzarev/a/pytorch/test/functorch/test_aotdispatch.py:6661 in simple_fn, code: x = SAF.apply(x) view: "f32[s0, s1][s1, 1]cuda:0" = torch.ops.aten.view.default(add, [primals_1, 
primals_2]); add = None return (view, _to_copy, primals_1, primals_2) INFO: TRACED GRAPH ===== Backward graph 3 ===== <eval_with_key>.21 class GraphModule(torch.nn.Module): def forward(self, primals_1: "Sym(s0)", primals_2: "Sym(s1)", add_packed_2: "f8e4m3fn[s0, s1][s1, 1]cuda:0", tangents_1: "f32[s0, s1][s1, 1]cuda:0"): # No stacktrace found for following nodes _to_copy: "f32[s0, s1][s1, 1]cuda:0" = torch.ops.aten._to_copy.default(add_packed_2, dtype = torch.float32); add_packed_2 = None # File: /data/users/ivankobzarev/a/pytorch/test/functorch/test_aotdispatch.py:6661 in simple_fn, code: x = SAF.apply(x) add_7: "f32[s0, s1][s1, 1]cuda:0" = torch.ops.aten.add.Tensor(tangents_1, _to_copy); tangents_1 = _to_copy = None return (None, None, add_7) ``` Differential Revision: [D72187044](https://our.internmc.facebook.com/intern/diff/D72187044) Pull Request resolved: https://github.com/pytorch/pytorch/pull/150032 Approved by: https://github.com/bdhirsh
688 lines
24 KiB
Python
688 lines
24 KiB
Python
# Owner(s): ["module: dynamo"]
|
|
import dataclasses
|
|
import pprint
|
|
import sys
|
|
from unittest import mock
|
|
|
|
import torch
|
|
import torch._dynamo.config as dynamo_config
|
|
import torch._inductor.config as inductor_config
|
|
from torch._dynamo import utils
|
|
from torch._inductor.test_case import TestCase
|
|
|
|
|
|
class TestUtils(TestCase):
    """Tests for helpers in torch._dynamo.utils: tensor comparison via
    `utils.same` and graph-break counting in CompilationMetrics."""

    def test_nan(self):
        # `utils.same` with equal_nan=True must treat two NaN tensors as
        # equal, even though the fp64 reference holds a non-NaN value.
        a = torch.Tensor([float("nan")])
        b = torch.Tensor([float("nan")])
        fp64_ref = torch.DoubleTensor([5.0])
        res = utils.same(a, b, fp64_ref=fp64_ref, equal_nan=True)
        self.assertTrue(res)

    def test_larger_multiplier_for_smaller_tensor(self):
        """
        Tensor numel between (10, 500]
        """
        N = 100
        fp64_ref = torch.full([N], 0.0, dtype=torch.double)
        a = torch.full([N], 1.0)
        tol = 4 * 1e-2
        # At the base tolerance, 2x passes but 4x fails.
        self.assertTrue(utils.same(a, a * 2, fp64_ref=fp64_ref, tol=tol))
        self.assertFalse(utils.same(a, a * 4, fp64_ref=fp64_ref, tol=tol))
        # With the larger multiplier enabled for small tensors, 4x is now
        # accepted while 9x is still rejected.
        self.assertTrue(
            utils.same(
                a,
                a * 4,
                fp64_ref=fp64_ref,
                use_larger_multiplier_for_smaller_tensor=True,
                tol=tol,
            )
        )
        self.assertFalse(
            utils.same(
                a,
                a * 9,
                fp64_ref=fp64_ref,
                use_larger_multiplier_for_smaller_tensor=True,
                tol=tol,
            )
        )

    def test_larger_multiplier_for_even_smaller_tensor(self):
        """
        Tensor numel <= 10
        """
        fp64_ref = torch.DoubleTensor([0.0])
        a = torch.Tensor([1.0])
        tol = 4 * 1e-2
        # At the base tolerance, 2x passes but 7x fails.
        self.assertTrue(utils.same(a, a * 2, fp64_ref=fp64_ref, tol=tol))
        self.assertFalse(utils.same(a, a * 7, fp64_ref=fp64_ref, tol=tol))
        # For numel <= 10 the small-tensor multiplier is even more
        # permissive: 7x passes, but 20x still fails.
        self.assertTrue(
            utils.same(
                a,
                a * 7,
                fp64_ref=fp64_ref,
                use_larger_multiplier_for_smaller_tensor=True,
                tol=tol,
            )
        )
        self.assertFalse(
            utils.same(
                a,
                a * 20,
                fp64_ref=fp64_ref,
                use_larger_multiplier_for_smaller_tensor=True,
                tol=tol,
            )
        )

    @dynamo_config.patch(
        {
            "log_compilation_metrics": True,
            "inline_inbuilt_nn_modules": False,
        }
    )
    def test_graph_break_counting(self):
        """
        Run a compilation that includes a graph break and validate that the
        graph break counter is incremented.
        """

        def run_forward_backward():
            # A clean compile with no graph break (forward + backward).
            model = torch.compile(TestModel())
            x = torch.rand([3], requires_grad=True)
            output = model(x)
            loss_fn = torch.nn.MSELoss()
            target = torch.tensor([1.0])
            loss = loss_fn(output, target)
            loss.backward()

        @torch.compile
        def add(x, y):
            return x + y

        # One data-dependent branch on `.item()` -> expect one graph break.
        @torch.compile
        def break_it(x):
            y = x.sum()
            if y > 0:
                return x + y.item()
            return x - y.item()

        # Two nested data-dependent branches -> expect two graph breaks.
        @torch.compile
        def break_it2(x):
            y = x.sum()
            if y > 0:
                if y > 1:
                    return x * y.item()
                return x + y.item()
            return x - y.item()

        # Warm up process-global state so frame counters start from a
        # known point before we begin capturing events.
        add(torch.rand([10]), torch.rand([10]))
        utils.reset_frame_count()

        compilation_events = []
        with mock.patch("torch._dynamo.utils.log_compilation_event") as log_event:
            run_forward_backward()
            compilation_events = [arg[0][0] for arg in log_event.call_args_list]
            self.assertEqual(compilation_events[-1].num_graph_breaks, 0)

            # We should fallback to normal mode and increment the graph break counter
            torch.compile(break_it, backend="inductor")(torch.ones(3, 3))
            compilation_events = [arg[0][0] for arg in log_event.call_args_list]
            self.assertEqual(compilation_events[-1].num_graph_breaks, 1)

            # Graph break counter should be incremented by 1 (after a reset), not 2
            torch.compile(break_it, backend="inductor")(torch.ones(3, 3))
            compilation_events = [arg[0][0] for arg in log_event.call_args_list]
            self.assertEqual(compilation_events[-1].num_graph_breaks, 1)

            # Graph break counter should be incremented by 2
            torch.compile(break_it2, backend="inductor")(torch.ones(3, 3))
            compilation_events = [arg[0][0] for arg in log_event.call_args_list]
            self.assertEqual(compilation_events[-1].num_graph_breaks, 2)
|
|
|
class TestModel(torch.nn.Module):
    """Minimal single-layer model used by the compilation-metrics tests."""

    def __init__(self) -> None:
        super().__init__()
        # A single 3 -> 1 affine layer; kept tiny so compiles stay cheap.
        self.linear = torch.nn.Linear(3, 1)

    def forward(self, x):
        # One call only, so the traced graph is trivial.
        return self.linear(x)
|
|
|
|
class TestDynamoTimed(TestCase):
    """
    Test utilities surrounding dynamo_timed.
    """

    def run_forward_backward(self):
        # Compile a tiny model and run one forward + backward pass so that
        # both a forward and a backward compilation event are produced.
        model = torch.compile(TestModel())
        x = torch.rand([3], requires_grad=True)
        output = model(x)
        loss_fn = torch.nn.MSELoss()
        target = torch.tensor([1.0])
        loss = loss_fn(output, target)
        loss.backward()

    def warmup(self):
        # Helper to make sure any process-global lru_caches (e.g., torch_key())
        # have already executed. Just compile something.
        @torch.compile
        def add(x, y):
            return x + y

        add(torch.rand([10]), torch.rand([10]))
        utils.reset_frame_count()
        torch._logging._internal.structured_logging_overhead.clear()

    @dynamo_config.patch(
        {
            "log_compilation_metrics": True,
            "inline_inbuilt_nn_modules": False,
        }
    )
    @inductor_config.patch(
        {
            "bundle_triton_into_fx_graph_cache": False,
            "bundled_autotune_remote_cache": False,
        }
    )
    # We can't easily test that timing is actually accurate. Mock time to always
    # return the same value; all durations will be zero.
    @mock.patch("time.time", return_value=0.001)
    @mock.patch("time.time_ns", return_value=100000)
    @dynamo_config.patch(specialize_float=False)
    def test_dynamo_timed(self, mock_time, mock_time_ns):
        """
        Run a compilation that includes a forward and a backward and validate
        various recorded metrics. This test could be broken into several, but the
        compilation is somewhat expensive. Instead of resetting and compiling the
        same thing multiple times, we may as well compile once and just check all
        the things that are affected by dynamo_timed.
        """
        self.warmup()

        # The logging function is different for OSS vs. internal. Let's just mock
        # and capture all the CompilationMetric objects logged.
        compilation_events = []
        with mock.patch("torch._dynamo.utils.log_compilation_event") as log_event:
            self.run_forward_backward()
            compilation_events = [arg[0][0] for arg in log_event.call_args_list]

        # Validate utils.compile_times(). Unfortunately, we can't test the output
        # reliably because it depends on whether 'tabulate' is installed. So we'll
        # directly inspect the dict it prints instead:
        self.assertExpectedInline(
            pprint.pformat(utils.compilation_time_metrics),
            """\
{'GraphLowering.codegen': [0.0, 0.0],
 'GraphLowering.compile_to_fn': [0.0, 0.0],
 'GraphLowering.compile_to_module': [0.0, 0.0],
 'GraphLowering.run': [0.0, 0.0],
 'OutputGraph.call_user_compiler': [0.0],
 'PyCodeCache.load_by_key_path': [0.0, 0.0],
 'PythonWrapperCodegen.generate': [0.0, 0.0],
 'Scheduler.__init__': [0.0, 0.0],
 'Scheduler.codegen': [0.0, 0.0],
 'Scheduler.fused_nodes': [0.0, 0.0],
 '_compile.compile_inner': [0.0],
 '_recursive_joint_graph_passes': [0.0],
 '_recursive_post_grad_passes': [0.0, 0.0],
 '_recursive_pre_grad_passes': [0.0],
 'additional_fake_tensor_prop': [0.0, 0.0],
 'aot_collect_metadata': [0.0],
 'aot_trace_joint_graph': [0.0],
 'async_compile.wait': [0.0, 0.0],
 'backward._backward_impl': [0.0],
 'build_guards': [0.0],
 'bytecode_tracing': [0.0],
 'compile_attempt_0': [0.0],
 'compile_file': [0.0, 0.0],
 'compile_fx.<locals>.bw_compiler': [0.0],
 'compile_fx.<locals>.fw_compiler_base': [0.0],
 'compile_fx_inner': [0.0, 0.0],
 'create_aot_dispatcher_function': [0.0],
 'fx_codegen_and_compile': [0.0, 0.0],
 'gc': [0.0],
 'min_cut_rematerialization_partition': [0.0]}""",  # noqa: B950
        )

        # Now validate utils.calculate_time_spent(). Formatting the return
        # value makes reading diffs much easier.
        time_spent = utils.calculate_time_spent()
        self.assertExpectedInline(
            pprint.pformat(time_spent),
            """\
{'_recursive_joint_graph_passes': 0.0,
 '_recursive_post_grad_passes': 0.0,
 '_recursive_pre_grad_passes': 0.0,
 'async_compile.wait': 0.0,
 'backend_compile': 0.0,
 'code_gen': 0.0,
 'entire_backward_compile': 0.0,
 'entire_frame_compile': 0.0,
 'gc': 0.0,
 'inductor_compile': 0.0,
 'total_wall_time': 0.0}""",  # noqa: B950
        )

        # Now validate the CompilationMetrics logs. We expect a log for the
        # forward and a log for the backward.
        self.assertTrue(len(compilation_events) == 2)
        self.assertTrue(
            all(isinstance(e, utils.CompilationMetrics) for e in compilation_events)
        )

        # Remove a few fields that aren't helpful for test stability.
        for e in compilation_events:
            e.dynamo_config = None
            e.co_filename = None
            e.co_firstlineno = None
            e.inductor_config = None
            e.cuda_version = None
            e.triton_version = None
            e.python_version = None

        # First event is for the forward. Formatting makes reading diffs
        # much easier.
        raw = dataclasses.asdict(compilation_events[0])
        del raw["feature_usage"]
        del raw["ir_count"]
        del raw["param_numel"]
        del raw["param_bytes"]
        del raw["param_count"]
        # guard_latency_us is not deterministic
        del raw["guard_latency_us"]
        self.assertExpectedInline(
            pprint.pformat(raw),
            """\
{'accumulated_cache_size': 0,
 'aot_autograd_cumulative_compile_time_us': 0,
 'backend_compile_time_s': 0.0,
 'backward_cumulative_compile_time_us': None,
 'cache_size': 0,
 'co_filename': None,
 'co_firstlineno': None,
 'co_name': 'forward',
 'code_gen_time_s': 0.0,
 'compile_id': '1/0',
 'compile_time_autotune_time_us': None,
 'compliant_custom_ops': set(),
 'config_inline_inbuilt_nn_modules': False,
 'config_suppress_errors': False,
 'cuda_version': None,
 'cudagraph_skip_reason': None,
 'distributed_ephemeral_timeout_us': None,
 'duration_us': 0,
 'dynamo_compile_time_before_restart_us': 0,
 'dynamo_config': None,
 'dynamo_cumulative_compile_time_us': 0,
 'dynamo_time_before_restart_s': 0.0,
 'end_time_us': 100,
 'entire_frame_compile_time_s': 0.0,
 'fail_reason': None,
 'fail_type': None,
 'fail_user_frame_filename': None,
 'fail_user_frame_lineno': None,
 'frame_key': '1',
 'gc_time_us': 0,
 'graph_input_count': 1,
 'graph_node_count': 3,
 'graph_op_count': 1,
 'guard_count': 9,
 'has_guarded_code': True,
 'inductor_code_gen_cumulative_compile_time_us': 0,
 'inductor_compile_time_s': 0.0,
 'inductor_config': None,
 'inductor_cumulative_compile_time_us': 0,
 'inductor_fx_remote_cache_backend_type': None,
 'inductor_fx_remote_cache_hit_count': None,
 'inductor_fx_remote_cache_hit_keys': None,
 'inductor_fx_remote_cache_miss_count': None,
 'inductor_fx_remote_cache_miss_keys': None,
 'is_forward': True,
 'is_runtime': False,
 'joint_graph_pass_time_us': 0,
 'log_format_version': 3,
 'non_compliant_ops': set(),
 'num_graph_breaks': 0,
 'num_triton_bundles': None,
 'pgo_get_remote_code_state_time_us': None,
 'pgo_put_remote_code_state_time_us': None,
 'post_grad_pass_time_us': 0,
 'pre_grad_pass_time_us': 0,
 'python_version': None,
 'recompile_reason': None,
 'remote_cache_time_saved_s': None,
 'remote_cache_version': None,
 'remote_fx_graph_cache_get_time_ms': None,
 'remote_fx_graph_cache_get_time_us': None,
 'remote_fx_graph_cache_put_time_ms': None,
 'remote_fx_graph_cache_put_time_us': None,
 'restart_reasons': set(),
 'runtime_cudagraphify_time_us': None,
 'runtime_triton_autotune_time_us': None,
 'shape_env_guard_count': 0,
 'specialize_float': False,
 'start_time': 0.0001,
 'start_time_us': 100,
 'structured_logging_overhead_s': 0.0,
 'structured_logging_overhead_us': 0,
 'tensorify_float_attempt': None,
 'tensorify_float_failure': None,
 'tensorify_float_success': None,
 'triton_compile_time_us': 0,
 'triton_kernel_compile_times_us': None,
 'triton_version': None}""",  # noqa: B950
        )

        # Second event is for the backward
        raw = dataclasses.asdict(compilation_events[1])
        del raw["feature_usage"]
        del raw["ir_count"]
        del raw["guard_latency_us"]
        del raw["param_numel"]
        del raw["param_bytes"]
        del raw["param_count"]
        self.assertExpectedInline(
            pprint.pformat(raw),
            """\
{'accumulated_cache_size': None,
 'aot_autograd_cumulative_compile_time_us': None,
 'backend_compile_time_s': None,
 'backward_cumulative_compile_time_us': 0,
 'cache_size': None,
 'co_filename': None,
 'co_firstlineno': None,
 'co_name': None,
 'code_gen_time_s': 0.0,
 'compile_id': '1/0',
 'compile_time_autotune_time_us': None,
 'compliant_custom_ops': None,
 'config_inline_inbuilt_nn_modules': None,
 'config_suppress_errors': None,
 'cuda_version': None,
 'cudagraph_skip_reason': None,
 'distributed_ephemeral_timeout_us': None,
 'duration_us': 0,
 'dynamo_compile_time_before_restart_us': None,
 'dynamo_config': None,
 'dynamo_cumulative_compile_time_us': None,
 'dynamo_time_before_restart_s': None,
 'end_time_us': 100,
 'entire_frame_compile_time_s': None,
 'fail_reason': None,
 'fail_type': None,
 'fail_user_frame_filename': None,
 'fail_user_frame_lineno': None,
 'frame_key': None,
 'gc_time_us': None,
 'graph_input_count': None,
 'graph_node_count': None,
 'graph_op_count': None,
 'guard_count': None,
 'has_guarded_code': None,
 'inductor_code_gen_cumulative_compile_time_us': 0,
 'inductor_compile_time_s': 0.0,
 'inductor_config': None,
 'inductor_cumulative_compile_time_us': 0,
 'inductor_fx_remote_cache_backend_type': None,
 'inductor_fx_remote_cache_hit_count': None,
 'inductor_fx_remote_cache_hit_keys': None,
 'inductor_fx_remote_cache_miss_count': None,
 'inductor_fx_remote_cache_miss_keys': None,
 'is_forward': False,
 'is_runtime': False,
 'joint_graph_pass_time_us': None,
 'log_format_version': 3,
 'non_compliant_ops': None,
 'num_graph_breaks': 0,
 'num_triton_bundles': None,
 'pgo_get_remote_code_state_time_us': None,
 'pgo_put_remote_code_state_time_us': None,
 'post_grad_pass_time_us': 0,
 'pre_grad_pass_time_us': None,
 'python_version': None,
 'recompile_reason': None,
 'remote_cache_time_saved_s': None,
 'remote_cache_version': None,
 'remote_fx_graph_cache_get_time_ms': None,
 'remote_fx_graph_cache_get_time_us': None,
 'remote_fx_graph_cache_put_time_ms': None,
 'remote_fx_graph_cache_put_time_us': None,
 'restart_reasons': None,
 'runtime_cudagraphify_time_us': None,
 'runtime_triton_autotune_time_us': None,
 'shape_env_guard_count': None,
 'specialize_float': None,
 'start_time': 0.0001,
 'start_time_us': 100,
 'structured_logging_overhead_s': 0.0,
 'structured_logging_overhead_us': 0,
 'tensorify_float_attempt': None,
 'tensorify_float_failure': None,
 'tensorify_float_success': None,
 'triton_compile_time_us': 0,
 'triton_kernel_compile_times_us': None,
 'triton_version': None}""",  # noqa: B950
        )

    @dynamo_config.patch(
        {
            "log_compilation_metrics": True,
        }
    )
    def test_ir_count(self):
        # Different python versions have different potential IR counts.
        version = (sys.version_info[0], sys.version_info[1])
        self.assertIn(version, ((3, 9), (3, 10), (3, 11), (3, 12), (3, 13)))
        first, second = {
            (3, 9): (10, 6),
            (3, 10): (10, 6),
            (3, 11): (10, 6),
            (3, 12): (10, 6),
            (3, 13): (11, 7),
        }[version]

        def test1(x):
            y = x + x
            z = y * y
            return z

        compilation_events = []
        with mock.patch("torch._dynamo.utils.log_compilation_event") as log_event:
            torch.compile(test1)(torch.randn(10, 10))
            compilation_events = [arg[0][0] for arg in log_event.call_args_list]
            self.assertEqual(compilation_events[0].ir_count, first)

        # A smaller function should produce the smaller expected IR count.
        def test2(x):
            y = x + x
            return y

        compilation_events = []
        with mock.patch("torch._dynamo.utils.log_compilation_event") as log_event:
            torch.compile(test2)(torch.randn(10, 10))
            compilation_events = [arg[0][0] for arg in log_event.call_args_list]
            self.assertEqual(compilation_events[0].ir_count, second)

    @dynamo_config.patch({"log_compilation_metrics": True})
    @inductor_config.patch({"force_disable_caches": True})
    def test_dynamic_shape_feature_use(self):
        # Recompiling with a different input size triggers automatic dynamic
        # shapes; the second compile event should record the feature flag.
        compilation_events = []
        with mock.patch("torch._dynamo.utils.log_compilation_event") as log_event:

            @torch.compile()
            def f(x):
                return x * x

            f(torch.randn(4))
            f(torch.randn(3))
            compilation_events = [
                arg[0][0].feature_usage for arg in log_event.call_args_list
            ]
            self.assertIn(
                ("dynamo.automatic_dynamic_shapes", True), compilation_events[1].items()
            )

        # With the config disabled, the feature must be recorded as False.
        compilation_events = []
        with dynamo_config.patch({"automatic_dynamic_shapes": False}), mock.patch(
            "torch._dynamo.utils.log_compilation_event"
        ) as log_event:

            @torch.compile()
            def f(x):
                return x * x

            f(torch.randn(4))
            f(torch.randn(3))
            compilation_events = [
                arg[0][0].feature_usage for arg in log_event.call_args_list
            ]
            self.assertIn(
                ("dynamo.automatic_dynamic_shapes", False), compilation_events[1].items()
            )

    @dynamo_config.patch({"log_compilation_metrics": True})
    def test_num_params(self):
        import torch.nn as nn
        import torch.nn.functional as F

        class ModelSimple(nn.Module):
            def __init__(self) -> None:
                super().__init__()
                # Conv2d(1, 20, 5): weight numel 500, bias numel 20.
                self.conv1 = nn.Conv2d(1, 20, 5)

            def forward(self, x):
                return F.relu(self.conv1(x))

        self.assertEqual([x.numel() for x in ModelSimple().parameters()], [500, 20])

        compilation_events = []
        with mock.patch("torch._dynamo.utils.log_compilation_event") as log_event:
            m = ModelSimple()
            torch.compile(m)(torch.randn(1, 10, 10))
            compilation_events = [arg[0][0] for arg in log_event.call_args_list]
            self.assertEqual(compilation_events[0].param_numel, 520)
            self.assertEqual(compilation_events[0].param_bytes, 4 * 520)
            self.assertEqual(compilation_events[0].param_count, 2)

        class ModelWrapped(nn.Module):
            def __init__(self) -> None:
                super().__init__()
                self.m1 = ModelSimple()
                self.m2 = ModelSimple()

            def forward(self, x):
                return self.m1(x) + self.m2(x)

        compilation_events = []
        with mock.patch("torch._dynamo.utils.log_compilation_event") as log_event:
            m = ModelWrapped()
            torch.compile(m)(torch.randn(1, 10, 10))
            compilation_events = [arg[0][0] for arg in log_event.call_args_list]
            self.assertEqual(compilation_events[0].param_numel, 1040)
            self.assertEqual(compilation_events[0].param_bytes, 4 * 1040)
            self.assertEqual(compilation_events[0].param_count, 4)

        # Test a tied module
        l1 = nn.Linear(4, 4)
        l2 = nn.Linear(4, 4)
        m = nn.Sequential(l1, nn.Sequential(l1, l2))
        self.assertEqual([x.numel() for x in m.parameters()], [16, 4, 16, 4])
        with mock.patch("torch._dynamo.utils.log_compilation_event") as log_event:
            torch.compile(m)(torch.randn(4, 4))
            compilation_events = [arg[0][0] for arg in log_event.call_args_list]
            # Reusing the same submodule must not double-count its params.
            self.assertEqual(compilation_events[0].param_numel, 40)
            self.assertEqual(compilation_events[0].param_bytes, 4 * 40)
            self.assertEqual(compilation_events[0].param_count, 4)

        # Test tied weights
        l1 = nn.Linear(4, 4)
        l2 = nn.Linear(4, 4)
        l1.weight = l2.weight
        m = nn.Sequential(l1, nn.Sequential(l2))
        self.assertEqual([x.numel() for x in m.parameters()], [16, 4, 4])
        with mock.patch("torch._dynamo.utils.log_compilation_event") as log_event:
            torch.compile(m)(torch.randn(4, 4))
            compilation_events = [arg[0][0] for arg in log_event.call_args_list]
            # The shared weight tensor is counted once (16 + 4 + 4 = 24).
            self.assertEqual(compilation_events[0].param_numel, 24)
            self.assertEqual(compilation_events[0].param_bytes, 4 * 24)
            self.assertEqual(compilation_events[0].param_count, 3)
|
|
|
class TestInductorConfigParsingForLogging(TestCase):
|
|
"""
|
|
Test for parsing inductor config for logging in CompilationMetrics.
|
|
"""
|
|
|
|
class TestObject:
|
|
def __init__(self, a, b):
|
|
self.a = a
|
|
self.b = b
|
|
|
|
def test_inductor_config_jsonify(self):
|
|
"""
|
|
Sanity check if the actual inductor config is parsed correctly
|
|
"""
|
|
|
|
inductor_config_json = utils._scrubbed_inductor_config_for_logging()
|
|
self.assertTrue(isinstance(inductor_config_json, str))
|
|
self.assertIn('trace"', inductor_config_json)
|
|
|
|
@mock.patch("torch._dynamo.utils.torch._inductor.config")
|
|
def test_inductor_config_parsing_non_conforming_items(self, mocked_inductor_config):
|
|
"""
|
|
Test if the inductor config is parsed correctly when the config is
|
|
- None
|
|
- not a dict
|
|
- not json serializable
|
|
- complex unserializable objects
|
|
"""
|
|
obj = TestCase
|
|
test_mock_config = {
|
|
"some": {"name": obj, "some": True},
|
|
"data": {"name": obj, "some": True},
|
|
"list": [
|
|
{"name": obj, "some": True},
|
|
{"name": obj, "some": True},
|
|
],
|
|
"object": {
|
|
"name": obj,
|
|
"some": True,
|
|
"data": {"name": obj, "some": True},
|
|
},
|
|
}
|
|
expected = (
|
|
"""{"data": {"name": "Value is not JSON serializable", "some": true}, """
|
|
""""list": [{"name": "Value is not JSON serializable", "some": true}, """
|
|
"""{"name": "Value is not JSON serializable", "some": true}], """
|
|
""""object": {"data": {"name": "Value is not JSON serializable", "some": true}, """
|
|
""""name": "Value is not JSON serializable", "some": true}, """
|
|
""""some": {"name": "Value is not JSON serializable", "some": true}}"""
|
|
)
|
|
mocked_inductor_config.get_config_copy.return_value = test_mock_config
|
|
inductor_config_json = utils._scrubbed_inductor_config_for_logging()
|
|
self.assertEqual(inductor_config_json, expected)
|
|
|
|
expected = "{}"
|
|
mocked_inductor_config.get_config_copy.return_value = {obj: obj}
|
|
inductor_config_json = utils._scrubbed_inductor_config_for_logging()
|
|
self.assertEqual(inductor_config_json, expected)
|
|
|
|
expected = "Inductor Config is not JSON serializable"
|
|
mocked_inductor_config.get_config_copy.return_value = obj
|
|
inductor_config_json = utils._scrubbed_inductor_config_for_logging()
|
|
self.assertEqual(inductor_config_json, expected)
|
|
|
|
expected = None
|
|
mocked_inductor_config.get_config_copy.return_value = None
|
|
inductor_config_json = utils._scrubbed_inductor_config_for_logging()
|
|
self.assertEqual(inductor_config_json, expected)
|
|
|
|
|
|
if __name__ == "__main__":
    # Run via dynamo's test runner (not unittest.main) so that these tests
    # get torch._dynamo's test-case setup.
    from torch._dynamo.test_case import run_tests

    run_tests()
|