pytorch

mirror of https://github.com/zebrajr/pytorch.git synced 2025-12-06 12:20:52 +01:00

History

Shangdi Yu d2eff5d454 Add python stack trace to AOTI generated code (#160539 ) Summary: We add a thread_local KernelContext object so Strobelight (and other potential profilers) can read the stack trace information of the running kernel. This will bring extra overhead, so we guard this behind the `cpp.enable_kernel_profile` flag. Example output code: ```cpp #include <torch/csrc/inductor/aoti_runtime/kernel_context_tls.h> namespace torch::aot_inductor { thread_local KernelContext* tls_kernel_context = nullptr; } // Other code ..... void AOTInductorModel::run_impl( AtenTensorHandle* input_handles, // array of input AtenTensorHandle; handles // are stolen; the array itself is borrowed AtenTensorHandle* output_handles, // array for writing output AtenTensorHandle; handles // will be stolen by the caller; the array itself is // borrowed DeviceStreamType stream, AOTIProxyExecutorHandle proxy_executor ) { __check_inputs_outputs(input_handles, output_handles); auto inputs = steal_from_raw_handles_to_raii_handles(input_handles, 4); auto arg2_1 = std::move(inputs[0]); auto arg3_1 = std::move(inputs[1]); auto arg4_1 = std::move(inputs[2]); auto arg5_1 = std::move(inputs[3]); [[maybe_unused]] auto& fc1_weight = constants_->at(0); [[maybe_unused]] auto& fc1_bias = constants_->at(1); inputs.clear(); [[maybe_unused]] auto& kernels = static_cast<AOTInductorModelKernels&>(this->kernels_.get()); static constexpr int64_t int_array_0[] = {8L, 16L}; static constexpr int64_t int_array_1[] = {16L, 1L}; AtenTensorHandle buf0_handle; AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(2, int_array_0, int_array_1, cached_torch_dtype_float32, cached_torch_device_type_cpu, this->device_idx_, &buf0_handle)); RAIIAtenTensorHandle buf0(buf0_handle); // Topologically Sorted Source Nodes: [linear], Original ATen: [aten.t, aten.addmm] // [Provenance debug handles] aoti_torch_cpu_addmm_out:1 static constexpr int64_t int_array_2[] = {10L, 16L}; static constexpr int64_t int_array_3[] = {1L, 10L}; { 
KernelContextGuard _ctx("aoti_torch_cpu_addmm_out", R"( File "/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/cba6f4fb5faa5f79/caffe2/test/inductor/__provenance_tracing__/provenance_tracing#link-tree/caffe2/test/inductor/test_provenance_tracing.py", line 829, in forward x = self.fc1(x) File "/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/cba6f4fb5faa5f79/caffe2/test/inductor/__provenance_tracing__/provenance_tracing#link-tree/torch/nn/modules/linear.py", line 134, in forward return F.linear(input, self.weight, self.bias) )"); RAIIAtenRecordFunctionHandle record_aoti_torch_cpu_addmm_out_("aoti_torch_cpu_addmm_out", nullptr); AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cpu_addmm_out(buf0, fc1_bias, arg2_1, wrap_with_raii_handle_if_needed(reinterpret_tensor_wrapper(fc1_weight, 2, int_array_2, int_array_3, 0L)), 1L, 1L)); } arg2_1.reset(); auto buf1 = std::move(buf0); // reuse static constexpr int64_t int_array_4[] = {10L, 20L}; static constexpr int64_t int_array_5[] = {20L, 1L}; AtenTensorHandle buf2_handle; AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(2, int_array_4, int_array_5, cached_torch_dtype_float32, cached_torch_device_type_cpu, this->device_idx_, &buf2_handle)); RAIIAtenTensorHandle buf2(buf2_handle); // [Provenance debug handles] cpp_fused_mul_relu_sigmoid_0:2 { KernelContextGuard _ctx("cpp_fused_mul_relu_sigmoid_0", R"( File "/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/cba6f4fb5faa5f79/caffe2/test/inductor/__provenance_tracing__/provenance_tracing#link-tree/caffe2/test/inductor/test_provenance_tracing.py", line 831, in forward x = self.sigmoid(x) File "/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/cba6f4fb5faa5f79/caffe2/test/inductor/__provenance_tracing__/provenance_tracing#link-tree/torch/nn/modules/activation.py", line 359, in forward return torch.sigmoid(input) File 
"/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/cba6f4fb5faa5f79/caffe2/test/inductor/__provenance_tracing__/provenance_tracing#link-tree/caffe2/test/inductor/test_provenance_tracing.py", line 830, in forward x = self.relu(x) File "/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/cba6f4fb5faa5f79/caffe2/test/inductor/__provenance_tracing__/provenance_tracing#link-tree/torch/nn/modules/activation.py", line 144, in forward return F.relu(input, inplace=self.inplace) File "/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/cba6f4fb5faa5f79/caffe2/test/inductor/__provenance_tracing__/provenance_tracing#link-tree/caffe2/test/inductor/test_provenance_tracing.py", line 832, in forward d = a * 3.14 )"); cpp_fused_mul_relu_sigmoid_0((float*)(buf1.data_ptr()), (const float*)(arg3_1.data_ptr()), (float*)(buf2.data_ptr())); } arg3_1.reset(); static constexpr int64_t int_array_6[] = {10L, 30L}; static constexpr int64_t int_array_7[] = {30L, 1L}; AtenTensorHandle buf3_handle; AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(2, int_array_6, int_array_7, cached_torch_dtype_float32, cached_torch_device_type_cpu, this->device_idx_, &buf3_handle)); RAIIAtenTensorHandle buf3(buf3_handle); // Topologically Sorted Source Nodes: [mul, addmm], Original ATen: [aten.mul, aten.addmm] // [Provenance debug handles] aoti_torch_cpu_addmm_out:3 { KernelContextGuard _ctx("aoti_torch_cpu_addmm_out", R"( File "/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/cba6f4fb5faa5f79/caffe2/test/inductor/__provenance_tracing__/provenance_tracing#link-tree/caffe2/test/inductor/test_provenance_tracing.py", line 833, in forward y = torch.addmm(c, d, b) )"); RAIIAtenRecordFunctionHandle record_aoti_torch_cpu_addmm_out_("aoti_torch_cpu_addmm_out", nullptr); AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cpu_addmm_out(buf3, arg5_1, buf2, arg4_1, 1L, 1L)); } arg4_1.reset(); arg5_1.reset(); buf2.reset(); auto buf4 = std::move(buf3); // reuse // [Provenance debug handles] cpp_fused_gelu_1:4 { KernelContextGuard 
_ctx("cpp_fused_gelu_1", R"( File "/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/cba6f4fb5faa5f79/caffe2/test/inductor/__provenance_tracing__/provenance_tracing#link-tree/caffe2/test/inductor/test_provenance_tracing.py", line 834, in forward z = torch.nn.functional.gelu(y) )"); cpp_fused_gelu_1((float*)(buf4.data_ptr())); } output_handles[0] = buf1.release(); output_handles[1] = buf4.release(); } // AOTInductorModel::run_impl ``` Test Plan: ``` buck run mode/dev-nosan fbcode//caffe2/test/inductor:provenance_tracing -- -r stack_traces ``` Rollback Plan: Differential Revision: D78436007 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160539 Approved by: https://github.com/yiming0416		2025-10-29 22:47:52 +00:00
..
_awaits
_C	[ROCm][CUDA] add unit test utility busy_wait_for_flag (#166218 )	2025-10-29 22:40:23 +00:00
_C_flatbuffer
_custom_op	Fix flake8 B028 warnings (#166224 )	2025-10-26 06:18:55 +00:00
_decomp	Fix pyrefly ignore syntax (#166438 )	2025-10-29 00:02:21 +00:00
_dispatch	Fix pyrefly ignores 1/n (#166239 )	2025-10-26 00:44:10 +00:00
_dynamo	Revert "[pytree] add `treespec_{leaf,tuple,dict}` functions for args_spec modification (#160843 )"	2025-10-29 22:46:48 +00:00
_export	Revert "[pytree] add `treespec_{leaf,tuple,dict}` functions for args_spec modification (#160843 )"	2025-10-29 22:46:48 +00:00
_functorch	Revert "[pytree] add `treespec_{leaf,tuple,dict}` functions for args_spec modification (#160843 )"	2025-10-29 22:46:48 +00:00
_higher_order_ops	Revert "[1/N] Remove unused loop variables (#166258 )"	2025-10-29 11:10:37 +00:00
_inductor	Add python stack trace to AOTI generated code (#160539 )	2025-10-29 22:47:52 +00:00
_lazy	Fix pyrefly ignores 1/n (#166239 )	2025-10-26 00:44:10 +00:00
_library	Fix pyrefly ignore syntax (#166438 )	2025-10-29 00:02:21 +00:00
_logging	Clean up unused Pyrefly suppressions (#166178 )	2025-10-25 05:32:21 +00:00
_numpy	Enable PLW0127 in ruff (#165851 )	2025-10-21 03:30:57 +00:00
_prims	Fix pyrefly ignore syntax (#166438 )	2025-10-29 00:02:21 +00:00
_prims_common	Fix pyrefly ignore syntax (#166438 )	2025-10-29 00:02:21 +00:00
_refs	Fix pyrefly error syntax (2/n) (#166448 )	2025-10-29 00:36:40 +00:00
_strobelight	Fix pyrefly error syntax (2/n) (#166448 )	2025-10-29 00:36:40 +00:00
_subclasses	Revert "[1/N] Remove unused loop variables (#166258 )"	2025-10-29 11:10:37 +00:00
_vendor
accelerator	Fix pyrefly ignores 1/n (#166239 )	2025-10-26 00:44:10 +00:00
amp	Fix pyrefly error syntax (2/n) (#166448 )	2025-10-29 00:36:40 +00:00
ao	Revert "[pytree] add `treespec_{leaf,tuple,dict}` functions for args_spec modification (#160843 )"	2025-10-29 22:46:48 +00:00
autograd	Revert "[1/N] Remove unused loop variables (#166258 )"	2025-10-29 11:10:37 +00:00
backends	Fix pyrefly error syntax (2/n) (#166448 )	2025-10-29 00:36:40 +00:00
compiler	Fix pyrefly error syntax (2/n) (#166448 )	2025-10-29 00:36:40 +00:00
contrib
cpu	Fix pyrefly error syntax (2/n) (#166448 )	2025-10-29 00:36:40 +00:00
csrc	Add python stack trace to AOTI generated code (#160539 )	2025-10-29 22:47:52 +00:00
cuda	[ROCm][CUDA] add unit test utility busy_wait_for_flag (#166218 )	2025-10-29 22:40:23 +00:00
distributed	[DTensor] Fix torch.all() using incorrect reduction operator (#165924 )	2025-10-29 20:58:35 +00:00
distributions	Fix pyrelfy ignore syntax in distributions and ao (#166248 )	2025-10-26 22:13:48 +00:00
export	Revert "[pytree] add `treespec_{leaf,tuple,dict}` functions for args_spec modification (#160843 )"	2025-10-29 22:46:48 +00:00
fft
func
futures
fx	Revert "[pytree] add `treespec_{leaf,tuple,dict}` functions for args_spec modification (#160843 )"	2025-10-29 22:46:48 +00:00
headeronly	Add TORCH_TARGET_VERSION for stable ABI (#164356 )	2025-10-29 15:41:28 +00:00
jit	Revert "[1/N] Remove unused loop variables (#166258 )"	2025-10-29 11:10:37 +00:00
legacy
lib
linalg
masked	Fix syntax for pyrefly errors (#166496 )	2025-10-29 20:00:25 +00:00
monitor
mps
mtia	Fix pyrefly ignores 1/n (#166239 )	2025-10-26 00:44:10 +00:00
multiprocessing	Fix syntax for pyrefly errors (#166496 )	2025-10-29 20:00:25 +00:00
nativert	[triton][nativert] Add num_cpu_threads for triton-cpu (#166255 )	2025-10-28 08:40:04 +00:00
nested	Fix syntax for pyrefly errors (#166496 )	2025-10-29 20:00:25 +00:00
nn	Revert "bwd pass (#164504 )"	2025-10-29 15:10:40 +00:00
numa	Fix syntax for pyrefly errors (#166496 )	2025-10-29 20:00:25 +00:00
onnx	[ONNX] Change stacklevel in warning message for export (#166558 )	2025-10-29 20:45:25 +00:00
optim	Fix syntax for pyrefly errors (#166496 )	2025-10-29 20:00:25 +00:00
package	Fix syntax for pyrefly errors (#166496 )	2025-10-29 20:00:25 +00:00
profiler	Fix syntax for pyrefly errors (#166496 )	2025-10-29 20:00:25 +00:00
quantization	Fix syntax for pyrefly errors (#166496 )	2025-10-29 20:00:25 +00:00
signal	Fix syntax for pyrefly errors (#166496 )	2025-10-29 20:00:25 +00:00
sparse	Fix syntax for pyrefly errors (#166496 )	2025-10-29 20:00:25 +00:00
special
testing	Fix syntax for pyrefly errors (#166496 )	2025-10-29 20:00:25 +00:00
utils	Revert "[pytree] add `treespec_{leaf,tuple,dict}` functions for args_spec modification (#160843 )"	2025-10-29 22:46:48 +00:00
xpu	Introduce a new API torch.xpu.set_per_process_memory_fraction (#165510 )	2025-10-29 03:24:52 +00:00
__config__.py
__future__.py
__init__.py	Fix pyrefly ignore syntax (#166438 )	2025-10-29 00:02:21 +00:00
_appdirs.py
_classes.py
_compile.py
_custom_ops.py
_environment.py
_guards.py	Fix pyrefly ignores 1/n (#166239 )	2025-10-26 00:44:10 +00:00
_jit_internal.py	Fix pyrefly ignore syntax (#166438 )	2025-10-29 00:02:21 +00:00
_linalg_utils.py
_lobpcg.py	Fix pyrefly ignore syntax (#166438 )	2025-10-29 00:02:21 +00:00
_lowrank.py
_meta_registrations.py	[Inductor][Triton][FP8] Support deepseek-style scaling in Inductor (#164404 )	2025-10-28 03:38:54 +00:00
_namedtensor_internals.py
_ops.py	Fix pyrefly ignore syntax (#166438 )	2025-10-29 00:02:21 +00:00
_python_dispatcher.py
_size_docs.py
_sources.py
_storage_docs.py
_streambase.py
_tensor_docs.py
_tensor_str.py	Fix pyrefly error syntax (2/n) (#166448 )	2025-10-29 00:36:40 +00:00
_tensor.py	Fix flake8 B028 warnings (#166224 )	2025-10-26 06:18:55 +00:00
_thread_safe_fork.py
_torch_docs.py	Clarrifying input output angle unit in the docs for trigonometric fun… (#161248 )	2025-10-18 11:53:48 +00:00
_utils_internal.py	Fix pyrefly error syntax (2/n) (#166448 )	2025-10-29 00:36:40 +00:00
_utils.py	Fix flake8 B028 warnings (#166224 )	2025-10-26 06:18:55 +00:00
_VF.py
_vmap_internals.py
_weights_only_unpickler.py	Fix flake8 B028 warnings (#166224 )	2025-10-26 06:18:55 +00:00
CMakeLists.txt	[ROCm] Use a ROCm version string without hash. (#166336 )	2025-10-28 03:53:55 +00:00
custom_class_detail.h
custom_class.h
extension.h
functional.py	Fix pyrefly ignores 1/n (#166239 )	2025-10-26 00:44:10 +00:00
header_only_apis.txt	Move toUnderlying to headeronly (#165694 )	2025-10-22 05:31:16 +00:00
hub.py	Fix flake8 B028 warnings (#166224 )	2025-10-26 06:18:55 +00:00
library.h
library.py	Fix syntax for pyrefly errors (#166496 )	2025-10-29 20:00:25 +00:00
overrides.py	Fix flake8 B028 warnings (#166224 )	2025-10-26 06:18:55 +00:00
py.typed
quasirandom.py
random.py	Fix flake8 B028 warnings (#166224 )	2025-10-26 06:18:55 +00:00
return_types.py
script.h
serialization.py	Fix syntax for pyrefly errors (#166496 )	2025-10-29 20:00:25 +00:00
storage.py	Fix pyrefly ignores 1/n (#166239 )	2025-10-26 00:44:10 +00:00
torch_version.py
types.py	Enable PLC0414 on ruff (#165828 )	2025-10-22 04:56:52 +00:00
version.py.tpl