Shangdi Yu
d2eff5d454
Add python stack trace to AOTI generated code ( #160539 )
...
Summary:
We add a thread_local KernelContext object so that Strobelight (and other potential profilers) can read the stack-trace information of the currently running kernel.
This adds extra overhead, so we guard it behind the `cpp.enable_kernel_profile` flag.
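A minimal sketch of turning this on through the standard AOTI flow (the config flag is real; the toy model and shapes are illustrative):
```python
import torch

class M(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = torch.nn.Linear(16, 16)

    def forward(self, x):
        return self.fc1(x)

# Emit KernelContextGuard scopes (and their overhead) in the generated C++.
torch._inductor.config.cpp.enable_kernel_profile = True

ep = torch.export.export(M(), (torch.randn(8, 16),))
pkg_path = torch._inductor.aoti_compile_and_package(ep)
```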
Example output code:
```cpp
#include <torch/csrc/inductor/aoti_runtime/kernel_context_tls.h>
namespace torch::aot_inductor {
thread_local KernelContext* tls_kernel_context = nullptr;
}
// Other code .....
void AOTInductorModel::run_impl(
AtenTensorHandle*
input_handles, // array of input AtenTensorHandle; handles
// are stolen; the array itself is borrowed
AtenTensorHandle*
output_handles, // array for writing output AtenTensorHandle; handles
// will be stolen by the caller; the array itself is
// borrowed
DeviceStreamType stream,
AOTIProxyExecutorHandle proxy_executor
) {
__check_inputs_outputs(input_handles, output_handles);
auto inputs = steal_from_raw_handles_to_raii_handles(input_handles, 4);
auto arg2_1 = std::move(inputs[0]);
auto arg3_1 = std::move(inputs[1]);
auto arg4_1 = std::move(inputs[2]);
auto arg5_1 = std::move(inputs[3]);
[[maybe_unused]] auto& fc1_weight = constants_->at(0);
[[maybe_unused]] auto& fc1_bias = constants_->at(1);
inputs.clear();
[[maybe_unused]] auto& kernels = static_cast<AOTInductorModelKernels&>(*this->kernels_.get());
static constexpr int64_t int_array_0[] = {8L, 16L};
static constexpr int64_t int_array_1[] = {16L, 1L};
AtenTensorHandle buf0_handle;
AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(2, int_array_0, int_array_1, cached_torch_dtype_float32, cached_torch_device_type_cpu, this->device_idx_, &buf0_handle));
RAIIAtenTensorHandle buf0(buf0_handle);
// Topologically Sorted Source Nodes: [linear], Original ATen: [aten.t, aten.addmm]
// [Provenance debug handles] aoti_torch_cpu_addmm_out:1
static constexpr int64_t int_array_2[] = {10L, 16L};
static constexpr int64_t int_array_3[] = {1L, 10L};
{
KernelContextGuard _ctx("aoti_torch_cpu_addmm_out", R"(
File "/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/cba6f4fb5faa5f79/caffe2/test/inductor/__provenance_tracing__/provenance_tracing#link-tree/caffe2/test/inductor/test_provenance_tracing.py", line 829, in forward
x = self.fc1(x)
File "/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/cba6f4fb5faa5f79/caffe2/test/inductor/__provenance_tracing__/provenance_tracing#link-tree/torch/nn/modules/linear.py", line 134, in forward
return F.linear(input, self.weight, self.bias)
)");
RAIIAtenRecordFunctionHandle record_aoti_torch_cpu_addmm_out_("aoti_torch_cpu_addmm_out", nullptr);
AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cpu_addmm_out(buf0, fc1_bias, arg2_1, wrap_with_raii_handle_if_needed(reinterpret_tensor_wrapper(fc1_weight, 2, int_array_2, int_array_3, 0L)), 1L, 1L));
}
arg2_1.reset();
auto buf1 = std::move(buf0); // reuse
static constexpr int64_t int_array_4[] = {10L, 20L};
static constexpr int64_t int_array_5[] = {20L, 1L};
AtenTensorHandle buf2_handle;
AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(2, int_array_4, int_array_5, cached_torch_dtype_float32, cached_torch_device_type_cpu, this->device_idx_, &buf2_handle));
RAIIAtenTensorHandle buf2(buf2_handle);
// [Provenance debug handles] cpp_fused_mul_relu_sigmoid_0:2
{
KernelContextGuard _ctx("cpp_fused_mul_relu_sigmoid_0", R"(
File "/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/cba6f4fb5faa5f79/caffe2/test/inductor/__provenance_tracing__/provenance_tracing#link-tree/caffe2/test/inductor/test_provenance_tracing.py", line 831, in forward
x = self.sigmoid(x)
File "/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/cba6f4fb5faa5f79/caffe2/test/inductor/__provenance_tracing__/provenance_tracing#link-tree/torch/nn/modules/activation.py", line 359, in forward
return torch.sigmoid(input)
File "/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/cba6f4fb5faa5f79/caffe2/test/inductor/__provenance_tracing__/provenance_tracing#link-tree/caffe2/test/inductor/test_provenance_tracing.py", line 830, in forward
x = self.relu(x)
File "/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/cba6f4fb5faa5f79/caffe2/test/inductor/__provenance_tracing__/provenance_tracing#link-tree/torch/nn/modules/activation.py", line 144, in forward
return F.relu(input, inplace=self.inplace)
File "/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/cba6f4fb5faa5f79/caffe2/test/inductor/__provenance_tracing__/provenance_tracing#link-tree/caffe2/test/inductor/test_provenance_tracing.py", line 832, in forward
d = a * 3.14
)");
cpp_fused_mul_relu_sigmoid_0((float*)(buf1.data_ptr()), (const float*)(arg3_1.data_ptr()), (float*)(buf2.data_ptr()));
}
arg3_1.reset();
static constexpr int64_t int_array_6[] = {10L, 30L};
static constexpr int64_t int_array_7[] = {30L, 1L};
AtenTensorHandle buf3_handle;
AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(2, int_array_6, int_array_7, cached_torch_dtype_float32, cached_torch_device_type_cpu, this->device_idx_, &buf3_handle));
RAIIAtenTensorHandle buf3(buf3_handle);
// Topologically Sorted Source Nodes: [mul, addmm], Original ATen: [aten.mul, aten.addmm]
// [Provenance debug handles] aoti_torch_cpu_addmm_out:3
{
KernelContextGuard _ctx("aoti_torch_cpu_addmm_out", R"(
File "/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/cba6f4fb5faa5f79/caffe2/test/inductor/__provenance_tracing__/provenance_tracing#link-tree/caffe2/test/inductor/test_provenance_tracing.py", line 833, in forward
y = torch.addmm(c, d, b)
)");
RAIIAtenRecordFunctionHandle record_aoti_torch_cpu_addmm_out_("aoti_torch_cpu_addmm_out", nullptr);
AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cpu_addmm_out(buf3, arg5_1, buf2, arg4_1, 1L, 1L));
}
arg4_1.reset();
arg5_1.reset();
buf2.reset();
auto buf4 = std::move(buf3); // reuse
// [Provenance debug handles] cpp_fused_gelu_1:4
{
KernelContextGuard _ctx("cpp_fused_gelu_1", R"(
File "/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/cba6f4fb5faa5f79/caffe2/test/inductor/__provenance_tracing__/provenance_tracing#link-tree/caffe2/test/inductor/test_provenance_tracing.py", line 834, in forward
z = torch.nn.functional.gelu(y)
)");
cpp_fused_gelu_1((float*)(buf4.data_ptr()));
}
output_handles[0] = buf1.release();
output_handles[1] = buf4.release();
} // AOTInductorModel::run_impl
```
Test Plan:
```bash
buck run mode/dev-nosan fbcode//caffe2/test/inductor:provenance_tracing -- -r stack_traces
```
Differential Revision: D78436007
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160539
Approved by: https://github.com/yiming0416
2025-10-29 22:47:52 +00:00
Camyll Harajli
59ddfb69a7
[cpu/gpu split] ( #165696 )
...
Summary: CPU/GPU split. CUDA is the default due to some downstream target configurations.
Test Plan: test in CI
Differential Revision: D80712802
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165696
Approved by: https://github.com/jeffdaily , https://github.com/malfet , https://github.com/atalman
2025-10-29 21:44:52 +00:00
PyTorch MergeBot
1dd6b76914
Revert "[1/N] Remove unused loop variables ( #166258 )"
...
This reverts commit 76b2c37045.
Reverted https://github.com/pytorch/pytorch/pull/166258 on behalf of https://github.com/atalman due to breaks test/distributed/test_serialization.py::TestSerialization::test_weights_only [GH job link](https://github.com/pytorch/pytorch/actions/runs/18894311802/job/53929321703 ) [HUD commit link](76b2c37045 ) ([comment](https://github.com/pytorch/pytorch/pull/166258#issuecomment-3460964612 ))
2025-10-29 11:10:37 +00:00
Yuanyuan Chen
8b188647cf
[2/N] Fix unused loop variables ( #166500 )
...
This PR removes unused loop variables.
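An illustrative instance of the pattern (not code from the PR):
```python
# Before: the key was bound but never used, e.g. `for k, v in cfg.items()`.
# After: the unused loop variable is replaced with `_` (or dropped via .values()).
cfg = {"lr": 0.1, "wd": 1e-4}
for _, v in cfg.items():
    print(v)
```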
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166500
Approved by: https://github.com/mlazos
2025-10-29 08:30:35 +00:00
Sun, Jiayi
20be077085
[Inductor] support masked vectorization for the tail_loop for float64 datatype ( #163316 )
...
**Summary:**
Support masked vectorization for the tail_loop for float64 datatype.
**Example:**
```python
import torch
def fn(x):
return x * x
x = torch.randn((22, 22), dtype=torch.double)
with torch.no_grad():
compiled_fn = torch.compile(fn)
compiled_fn(x)
```
**Generated code:**
- Before
```python
cpp_fused_mul_0 = async_compile.cpp_pybinding(['const double*', 'double*'], r'''
#include <torch/csrc/inductor/cpp_prefix.h>
extern "C" void kernel(const double* in_ptr0,
double* out_ptr0)
{
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(484L); x0+=static_cast<int64_t>(16L))
{
{
if(C10_LIKELY(x0 >= static_cast<int64_t>(0) && x0 < static_cast<int64_t>(480L)))
{
auto tmp0 = at::vec::VectorizedN<double,2>::loadu(in_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(16));
auto tmp1 = tmp0 * tmp0;
tmp1.store(out_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(16));
}
if(C10_UNLIKELY(x0 >= static_cast<int64_t>(480L) && x0 < static_cast<int64_t>(484L)))
{
for (int64_t x0_tail = static_cast<int64_t>(480L);x0_tail < static_cast<int64_t>(484L); x0_tail++)
{
auto tmp0 = in_ptr0[static_cast<int64_t>(x0_tail)];
auto tmp1 = double(tmp0 * tmp0);
out_ptr0[static_cast<int64_t>(x0_tail)] = tmp1;
}
}
}
}
}
}
''')
async_compile.wait(globals())
del async_compile
class Runner:
def __init__(self, partitions):
self.partitions = partitions
def recursively_apply_fns(self, fns):
new_callables = []
for fn, c in zip(fns, self.partitions):
new_callables.append(fn(c))
self.partitions = new_callables
def call(self, args):
arg0_1, = args
args.clear()
assert_size_stride(arg0_1, (22, 22), (22, 1))
buf0 = empty_strided_cpu((22, 22), (22, 1), torch.float64)
# [Provenance debug handles] cpp_fused_mul_0:1
cpp_fused_mul_0(arg0_1, buf0)
del arg0_1
return (buf0, )
```
- After
```python
cpp_fused_mul_0 = async_compile.cpp_pybinding(['const double*', 'double*'], r'''
#include <torch/csrc/inductor/cpp_prefix.h>
extern "C" void kernel(const double* in_ptr0,
double* out_ptr0)
{
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(484L); x0+=static_cast<int64_t>(16L))
{
{
if(C10_LIKELY(x0 >= static_cast<int64_t>(0) && x0 < static_cast<int64_t>(480L)))
{
auto tmp0 = at::vec::VectorizedN<double,2>::loadu(in_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(16));
auto tmp1 = tmp0 * tmp0;
tmp1.store(out_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(16));
}
if(C10_UNLIKELY(x0 >= static_cast<int64_t>(480L) && x0 < static_cast<int64_t>(484L)))
{
auto tmp0 = at::vec::VectorizedN<double,2>::loadu(in_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(4L));
auto tmp1 = tmp0 * tmp0;
tmp1.store(out_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(4L));
}
}
}
}
}
''')
async_compile.wait(globals())
del async_compile
class Runner:
def __init__(self, partitions):
self.partitions = partitions
def recursively_apply_fns(self, fns):
new_callables = []
for fn, c in zip(fns, self.partitions):
new_callables.append(fn(c))
self.partitions = new_callables
def call(self, args):
arg0_1, = args
args.clear()
assert_size_stride(arg0_1, (22, 22), (22, 1))
buf0 = empty_strided_cpu((22, 22), (22, 1), torch.float64)
# [Provenance debug handles] cpp_fused_mul_0:1
cpp_fused_mul_0(arg0_1, buf0)
del arg0_1
return (buf0, )
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163316
Approved by: https://github.com/mingfeima , https://github.com/jansel
2025-10-29 03:30:38 +00:00
Maggie Moss
4fada51ada
Fix existing Pyrefly errors ( #166439 )
...
Trying to keep main as clean of type errors as possible until we are able to switch to just one checker.
This adds suppressions for existing type errors on main.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166439
Approved by: https://github.com/Skylion007
2025-10-29 02:08:02 +00:00
Yuanyuan Chen
76b2c37045
[1/N] Remove unused loop variables ( #166258 )
...
This PR removes unused loop variables.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166258
Approved by: https://github.com/Lucaskabela , https://github.com/mlazos
2025-10-29 01:34:15 +00:00
Laith Sakka
adedf26e21
Support python slicing with tensor inputs. ( #165074 )
...
When the slice is a tensor, we decompose it into an .item() call and pass the resulting unbacked symbol to the slice to avoid a data-dependent error (DDE).
The diff also fixes an existing bug in codegen_dynamic_slice_size in the cpp wrapper: a +1 should be a -1, making it match the Python codegen.
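A sketch of the user-facing pattern this enables (assuming scalar-output capture; the function and inputs are illustrative):
```python
import torch

torch._dynamo.config.capture_scalar_outputs = True

@torch.compile(fullgraph=True)
def f(x, n):
    # n is a 0-dim integer tensor; the bound is decomposed into an
    # n.item() call so an unbacked symbol feeds the slice without a DDE.
    return x[:n]

print(f(torch.arange(10), torch.tensor(4)))  # tensor([0, 1, 2, 3])
```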
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165074
Approved by: https://github.com/Lucaskabela
2025-10-29 01:18:45 +00:00
Shunting Zhang
3895ce093f
[inductor] add in-kernel nan-check ( #166008 )
...
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166008
Approved by: https://github.com/eellison
2025-10-28 20:19:10 +00:00
PyTorch MergeBot
7379972cc0
Revert "[Inductor] Naive foreach autotune support ( #162053 )"
...
This reverts commit cdb60e44eb.
Reverted https://github.com/pytorch/pytorch/pull/162053 on behalf of https://github.com/xmfan due to Compile time regression ([comment](https://github.com/pytorch/pytorch/pull/162053#issuecomment-3458252331 ))
2025-10-28 20:01:54 +00:00
Shunting Zhang
5d0b3e28dc
[inductor] generate fused rms/layer norm bwd ( #165370 )
...
RMS/layer norm backward generates two kinds of reductions:
- the reduction computing dx, which reduces across the hidden dimension (in the context of a transformer)
- the reduction computing dw/db, which reduces across the BxT (batch size x sequence length) dimension.
These two sets of reductions share input buffers, but Inductor cannot fuse them because of their different loop orders.
There are multiple sources of custom kernels that implement fused versions of such kernels (Liger-Kernel, quack, Paul Zhang's internal post). This PR enables Inductor to generate such kernels automatically.
The generated kernel is very similar to 33924d20b6/src/liger_kernel/ops/rms_norm.py (L114).
To keep the implementation simple and performant, we enable this fusion only if the inner reduction (computing dx) is a persistent reduction. This should hold for representative inputs. The persistent reduction is critical for performance here: it ensures a loaded tensor does not need to be reloaded.
To make the inner reduction (computing dx) and the outer reductions (computing dw/db) fusible, the PR does the following:
1. Convert the outer reductions to pointwise by replacing the 'reduction' and 'store_reduction' nodes with a new node type, 'partial_accumulate'. The new node records the reduction type, buffer name, input of the reduction, etc., which is essential for proper codegen.
2. Do loop reordering (relying on the earlier loop-ordering-after-fusion work) to reorder the loops of the converted pointwise so it can be fused with the inner reduction.
3. Append any epilogues needed at the end, e.g. the outer reduction may be followed by a division for a mean, or by a downcast if dw/db is in low precision (fp16/bf16).
Early benchmarking on H100 shows about a 2x speedup for both RMSNorm and LayerNorm backward for the shape (1152 * 500, 384) used in some internal models. Note that I manually disabled split reduction in this benchmarking, since otherwise the fusion would currently be skipped. The next PR will make the mix-order reduction compose better with split reduction.
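For reference, a minimal sketch of the kind of graph that exercises this fusion (shape taken from the benchmark above; CUDA assumed):
```python
import torch
import torch.nn.functional as F

x = torch.randn(1152 * 500, 384, device="cuda", requires_grad=True)
w = torch.randn(384, device="cuda", requires_grad=True)

# The backward pass produces the dx (inner) and dw (outer) reductions
# discussed above, which this PR can now fuse into one kernel.
fn = torch.compile(lambda x, w: F.rms_norm(x, (384,), w))
fn(x, w).sum().backward()
```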
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165370
Approved by: https://github.com/jansel
ghstack dependencies: #166204
2025-10-28 05:53:52 +00:00
karthickai
1425b40f29
[inductor] Fix argmin/argmax returning incorrect indices for non-contiguous tensor ( #165983 )
...
Fixes #163929
Fixes argmin/argmax to return correct logical indices instead of physical memory offsets when applied to transposed/permuted tensors. When `argmin()` or `argmax()` was called on a transposed tensor, Inductor returned physical memory indices rather than logical row-major indices, producing incorrect results that do not match eager-mode behavior.
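A minimal repro sketch for the fixed behavior (inputs illustrative):
```python
import torch

# argmax on a transposed (non-contiguous) view must return the logical
# row-major index, matching eager.
x = torch.randn(4, 8).t()
compiled = torch.compile(torch.argmax)
assert compiled(x) == torch.argmax(x)
```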
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165983
Approved by: https://github.com/shunting314
2025-10-28 01:23:24 +00:00
Millie Chen
9a91486e45
[Inductor-FX] Don't flatten constant args ( #166144 )
...
Summary: Fallback kernels are created with flattened constant args plus an `unflatten` utility to restore them when needed. Apply it in the FXConverter to preserve the original structure.
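For intuition, the flatten/unflatten round-trip mirrors pytree handling; a conceptual sketch (the FXConverter's actual helpers may differ):
```python
import torch.utils._pytree as pytree

args = {"alpha": 1.0, "dims": (0, 1)}
flat, spec = pytree.tree_flatten(args)        # kernels carry the flat form
restored = pytree.tree_unflatten(flat, spec)  # converter restores the structure
assert restored == args
```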
Test Plan: added new CI tests
Differential Revision: D85347589
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166144
Approved by: https://github.com/blaine-rister
2025-10-27 22:33:37 +00:00
Shunting Zhang
a04edcb27a
[inductor] a few workspace api change ( #166204 )
...
A few workspace API changes:
1. Return the outer name when creating a workspace. Usually a use case does not care about the outer name, but for mix-order reduction (stacked PR) we need it to run the next layer of reduction on the workspace tensor.
2. Allow overriding the workspace tensor dtype.
3. Allow delaying the deallocation of workspace tensors in TritonKernel.call_kernel, since they may be used after the call. The lifetime of the workspace tensors is only extended slightly; they are deallocated once the next-layer reduction is done.
Tested with the stacked PR.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166204
Approved by: https://github.com/jansel
2025-10-27 18:10:23 +00:00
anwang
eb2bad5bb5
[Inductor] Make combo kernel MAX_NUM_ARGS configurable ( #166274 )
...
The MAX_NUM_ARGS of ComboKernel is currently a fixed number. We need to tune this number to avoid overly large fusions on MTIA, so this PR makes it configurable.
Differential Revision: [D85509352](https://our.internmc.facebook.com/intern/diff/D85509352/ )
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166274
Approved by: https://github.com/eellison
2025-10-27 18:06:38 +00:00
Oguz Ulgen
8d4e48831e
Remove JITFunction constexpr and some arg_names ( #166280 )
...
https://github.com/triton-lang/triton/pull/8536 breaks torch.compile integration. This PR attempts to fix it.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166280
Approved by: https://github.com/jansel
2025-10-27 09:29:03 +00:00
Maggie Moss
9940e894ea
Fix pyrefly ignore syntax in _inductor ( #166247 )
...
Ensures pyrefly ignores only ignore the intended error code.
pyrefly check
lintrunner
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166247
Approved by: https://github.com/oulgen
2025-10-27 02:48:42 +00:00
Jack Taylor
cdb60e44eb
[Inductor] Naive foreach autotune support ( #162053 )
...
Initial autotuning support for foreach kernels; 4x improvement for some kernels in an internal workload. More improvements can surely be made here in the future. This removes num_warps from the kernel definition to enable autotune support in the generated wrapper code.
Before:
triton_for_fused_18.kd 🔍 | 4.986 ms | 4.986 ms | 2.493 ms | 2 |
triton_for_fused_6.kd 🔍 | 0.098 ms | 0.098 ms | 0.049 ms | 2 |
triton_for_fused_7.kd 🔍 | 0.036 ms | 0.036 ms | 0.018 ms | 2 |
After:
triton_for_fused_18.kd 🔍 | 1.273 ms | 1.273 ms | 0.636 ms | 2 |
triton_for_fused_6.kd 🔍 | 0.044 ms | 0.044 ms | 0.022 ms | 2 |
triton_for_fused_7.kd 🔍 | 0.024 ms | 0.024 ms | 0.012 ms | 2 |
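For reference, a minimal case that produces foreach kernels (sketch; CUDA assumed):
```python
import torch

xs = [torch.randn(1024, device="cuda") for _ in range(8)]
# Compiled _foreach ops lower to Inductor's foreach kernels, which can
# now be autotuned instead of using a fixed num_warps.
ys = torch.compile(lambda ts: torch._foreach_mul(ts, 2.0))(xs)
```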
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162053
Approved by: https://github.com/mlazos , https://github.com/naromero77amd
Co-authored-by: Nichols A. Romero <nick.romero@amd.com>
2025-10-26 02:36:15 +00:00
Maggie Moss
c7eee49525
Fix pyrefly ignores 1/n ( #166239 )
...
First diff adjusting the syntax for `pyrefly: ignore` suppressions so they only hide one class of type error.
Test:
lintrunner
pyrefly check
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166239
Approved by: https://github.com/oulgen
2025-10-26 00:44:10 +00:00
Maggie Moss
eb83c3ca23
Clean up unused Pyrefly suppressions ( #166178 )
...
Cleaning up ignores that are no longer needed in the repo and adding select suppressions so the main branch is clean.
test plan:
`lintrunner -a`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166178
Approved by: https://github.com/oulgen
2025-10-25 05:32:21 +00:00
drisspg
cc20b7ad72
[FlexFlash] update names ( #166193 )
...
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166193
Approved by: https://github.com/BoyuanFeng
2025-10-25 00:07:11 +00:00
Blaine Burton Rister
0442125362
[Inductor] Restore original dtype for rank-0 CPU tensors ( #166118 )
...
# Problem
Inductor implicitly upcasts certain rank-0 kernel arguments from float16 to float32. Currently, this happens only on the `"cpu"` device, which appears to be related to float16 support in CPU Triton. However, it can also affect the behavior of GPU kernels, when a model contains tensors from multiple devices. Upcasting may be undesirable on some platforms, so users can typically disable it with the `config.triton.codegen_upcast_to_fp32` flag. However, this flag was not respected by the rank-0 kernel argument codepath.
Through an improbable series of events, float32 upcasting caused an internal model to fail compilation on MTIA. (Internal reviewers see T242444110.)
# Fix
If `config.triton.codegen_upcast_to_fp32` evaluates to `False`, cast the kernel argument to the original dtype.
# Test plan
Added a new CI test checking for the downcast iff the config flag is false. The test mixes GPU and CPU tensors to generate a GPU kernel with the implicit float32 upcast and explicit float16 downcast.
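A sketch of opting out of the upcast (the flag is the real config option; the tensors are illustrative):
```python
import torch

torch._inductor.config.triton.codegen_upcast_to_fp32 = False

s = torch.tensor(2.0, dtype=torch.float16)               # rank-0 CPU tensor
x = torch.randn(8, device="cuda", dtype=torch.float16)   # GPU tensor
# With the flag off, the scalar argument keeps float16 in the GPU kernel.
y = torch.compile(lambda x, s: x * s)(x, s)
```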
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166118
Approved by: https://github.com/jfix71 , https://github.com/jansel , https://github.com/kundaMwiza
2025-10-24 19:59:25 +00:00
Ahmad Sarvmeily
13cda9b89e
Allow BlockDescriptorOptions classes to be overridden In TritonKernel ( #165899 )
...
By allowing the options classes (`BlockPtrOptions`/`TensorDescriptorOptions`) to be overridden in `TritonKernel`, subclasses with custom behaviour can be used in place of them, which provides greater flexibility.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165899
Approved by: https://github.com/jansel
2025-10-24 18:59:59 +00:00
PyTorch MergeBot
6c4412f72b
Revert "[Inductor] support masked vectorization for the tail_loop for float64 datatype ( #163316 )"
...
This reverts commit e9d8973427.
Reverted https://github.com/pytorch/pytorch/pull/163316 on behalf of https://github.com/clee2000 due to seems to have broken some no_gpu tests? test/inductor/test_cpu_repro.py::CPUReproTests::test_double_reduction_vec [GH job link](https://github.com/pytorch/pytorch/actions/runs/18689033019/job/53290772740 ) [HUD commit link](e9d8973427 ) ([comment](https://github.com/pytorch/pytorch/pull/163316#issuecomment-3428210509 ))
2025-10-21 17:44:42 +00:00
PyTorch MergeBot
78bf6186f2
Revert "[Inductor] support masked vectorization for the tail_loop for fp8 datatype ( #163324 )"
...
This reverts commit e8cb34dd52.
Reverted https://github.com/pytorch/pytorch/pull/163324 on behalf of https://github.com/clee2000 due to seems to have broken some no_gpu tests? test/inductor/test_cpu_repro.py::CPUReproTests::test_double_reduction_vec [GH job link](https://github.com/pytorch/pytorch/actions/runs/18689033019/job/53290772740 ) [HUD commit link](e9d8973427 ) ([comment](https://github.com/pytorch/pytorch/pull/163316#issuecomment-3428210509 ))
2025-10-21 17:44:42 +00:00
Nichols A. Romero
9f9ab881b2
[ROCm][inductor] heuristic improvements for reduction kernels ( #161280 )
...
Improvements to reduction kernel heuristics for MI350.
Contributions from several members of the AMD Inductor and Triton teams: @jataylo @iupaikov-amd @AmdSampsa @xiaohuguo2023
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161280
Approved by: https://github.com/jansel , https://github.com/PaulZhang12 , https://github.com/eellison , https://github.com/jeffdaily
2025-10-21 07:48:54 +00:00
Blaine Burton Rister
f2bb22ff84
[Inductor-FX] Support Tensor.item ( #165599 )
...
# Feature
This PR supports compiling `Tensor.item` with Inductor's FX backend. This maps to a custom WrapperCodeGen method called `codegen_dynamic_scalar`.
# Implementation
The implementation is fairly mechanical, following the usual flow for these types of PRs.
1. Introduce a new Wrapper IR line for this, called `DynamicScalarLine`.
2. Split `PythonWrapperCodegen.codegen_dynamic_scalar` into 2 parts: a public method which generates the Wrapper IR line, and a private one generating Python from Wrapper IR.
3. Implement an FX codegen method for the wrapper IR line. This one calls `aten.where.Scalar` to handle code like `1 if x.item() else 0`, which is a bit tricky. It also calls `aten.item.default` to convert tensors to scalars.
# Test plan
Added CI tests mirroring the AOTI ones. They test float, int and bool types, the latter taking a distinct codegen path.
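For reference, the user-facing pattern that goes through this path (sketch; scalar-output capture assumed):
```python
import torch

torch._dynamo.config.capture_scalar_outputs = True

@torch.compile
def f(x):
    return x.item() * 2.0  # .item() lowers via codegen_dynamic_scalar

f(torch.tensor(3.0))
```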
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165599
Approved by: https://github.com/angelayi , https://github.com/jansel
2025-10-21 07:09:56 +00:00
PyTorch MergeBot
cf280ca1e8
Revert "[Inductor] Naive foreach autotune support ( #162053 )"
...
This reverts commit 779296a3fc.
Reverted https://github.com/pytorch/pytorch/pull/162053 on behalf of https://github.com/pytorch-auto-revert due to Reverted automatically by pytorch's autorevert, to avoid this behaviour add the tag autorevert: disable ([comment](https://github.com/pytorch/pytorch/pull/162053#issuecomment-3423808492 ))
2025-10-20 21:36:44 +00:00
Jack Taylor
779296a3fc
[Inductor] Naive foreach autotune support ( #162053 )
...
Initial autotuning support for foreach kernels; 4x improvement for some kernels in an internal workload. More improvements can surely be made here in the future. This removes num_warps from the kernel definition to enable autotune support in the generated wrapper code.
Before:
triton_for_fused_18.kd 🔍 | 4.986 ms | 4.986 ms | 2.493 ms | 2 |
triton_for_fused_6.kd 🔍 | 0.098 ms | 0.098 ms | 0.049 ms | 2 |
triton_for_fused_7.kd 🔍 | 0.036 ms | 0.036 ms | 0.018 ms | 2 |
After:
triton_for_fused_18.kd 🔍 | 1.273 ms | 1.273 ms | 0.636 ms | 2 |
triton_for_fused_6.kd 🔍 | 0.044 ms | 0.044 ms | 0.022 ms | 2 |
triton_for_fused_7.kd 🔍 | 0.024 ms | 0.024 ms | 0.012 ms | 2 |
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162053
Approved by: https://github.com/mlazos , https://github.com/naromero77amd
2025-10-20 20:39:04 +00:00
PyTorch MergeBot
240c13394e
Revert "[inductor] require shape in TritonCSEVariable ( #162275 )"
...
This reverts commit 3af2f0c12a.
Reverted https://github.com/pytorch/pytorch/pull/162275 on behalf of https://github.com/clee2000 due to still failing due to the above D84932446 ([comment](https://github.com/pytorch/pytorch/pull/162275#issuecomment-3423153819 ))
2025-10-20 17:55:54 +00:00
Sun, Jiayi
e8cb34dd52
[Inductor] support masked vectorization for the tail_loop for fp8 datatype ( #163324 )
...
**Summary:**
Support masked vectorization for the tail_loop for fp8 datatype.
**Example:**
```python
import torch
def fn(
x,
scale,
zero_point,
quant_min,
quant_max,
dtype,
):
x = torch.ops.quantized_decomposed.dequantize_per_tensor(
x,
scale,
zero_point,
quant_min,
quant_max,
dtype,
)
x = torch.relu(x)
x = torch.ops.quantized_decomposed.quantize_per_tensor(
x, scale, zero_point, quant_min, quant_max, dtype
)
return x
quant_min = -128
quant_max = 127
dtype = torch.float8_e4m3fn
x = torch.clamp(torch.randn((1, 7, 7, 9), dtype=torch.float32) * 100, quant_min, quant_max).to(dtype)
zero_point = 100
scale = 0.01
with torch.no_grad():
compiled_fn = torch.compile(fn)
compiled_fn(x, scale, zero_point, quant_min, quant_max, dtype)
```
**Generated code:**
- Before
```python
cpp_fused_dequantize_per_tensor_quantize_per_tensor_relu_0 = async_compile.cpp_pybinding(['const at::Float8_e4m3fn*', 'at::Float8_e4m3fn*'], r'''
#include <torch/csrc/inductor/cpp_prefix.h>
extern "C" void kernel(const at::Float8_e4m3fn* in_ptr0,
at::Float8_e4m3fn* out_ptr0)
{
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(441L); x0+=static_cast<int64_t>(16L))
{
{
if(C10_LIKELY(x0 >= static_cast<int64_t>(0) && x0 < static_cast<int64_t>(432L)))
{
auto tmp0 = at::vec::Vectorized<at::Float8_e4m3fn>::loadu(in_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(16));
auto tmp1 = at::vec::convert<float>(tmp0);
auto tmp2 = static_cast<float>(100.0);
auto tmp3 = at::vec::Vectorized<float>(tmp2);
auto tmp4 = tmp1 - tmp3;
auto tmp5 = static_cast<float>(0.01);
auto tmp6 = at::vec::Vectorized<float>(tmp5);
auto tmp7 = tmp4 * tmp6;
auto tmp8 = (tmp7);
auto tmp9 = at::vec::clamp_min(tmp8, decltype(tmp8)(0));
auto tmp10 = tmp9 * tmp3;
auto tmp11 = tmp10.round();
auto tmp12 = tmp11 + tmp3;
auto tmp13 = static_cast<float>(-128.0);
auto tmp14 = at::vec::Vectorized<float>(tmp13);
auto tmp15 = at::vec::maximum(tmp12, tmp14);
auto tmp16 = static_cast<float>(127.0);
auto tmp17 = at::vec::Vectorized<float>(tmp16);
auto tmp18 = at::vec::minimum(tmp15, tmp17);
auto tmp19 = at::vec::convert<at::Float8_e4m3fn>(tmp18);
tmp19.store(out_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(16));
}
if(C10_UNLIKELY(x0 >= static_cast<int64_t>(432L) && x0 < static_cast<int64_t>(441L)))
{
for (int64_t x0_tail = static_cast<int64_t>(432L);x0_tail < static_cast<int64_t>(441L); x0_tail++)
{
auto tmp0 = in_ptr0[static_cast<int64_t>(x0_tail)];
auto tmp1 = c10::convert<float>(tmp0);
auto tmp2 = static_cast<float>(100.0);
auto tmp3 = float(tmp1 - tmp2);
auto tmp4 = static_cast<float>(0.01);
auto tmp5 = float(tmp3 * tmp4);
auto tmp6 = c10::convert<float>(tmp5);
auto tmp7 = std::max(tmp6, decltype(tmp6)(0));
auto tmp8 = float(tmp7 * tmp2);
auto tmp9 = std::nearbyint(tmp8);
auto tmp10 = float(tmp9 + tmp2);
auto tmp11 = static_cast<float>(-128.0);
auto tmp12 = max_propagate_nan(tmp10, tmp11);
auto tmp13 = static_cast<float>(127.0);
auto tmp14 = min_propagate_nan(tmp12, tmp13);
auto tmp15 = c10::convert<at::Float8_e4m3fn>(tmp14);
out_ptr0[static_cast<int64_t>(x0_tail)] = tmp15;
}
}
}
}
}
}
''')
async_compile.wait(globals())
del async_compile
class Runner:
def __init__(self, partitions):
self.partitions = partitions
def recursively_apply_fns(self, fns):
new_callables = []
for fn, c in zip(fns, self.partitions):
new_callables.append(fn(c))
self.partitions = new_callables
def call(self, args):
arg0_1, = args
args.clear()
assert_size_stride(arg0_1, (1, 7, 7, 9), (441, 63, 9, 1))
buf0 = empty_strided_cpu((1, 7, 7, 9), (441, 63, 9, 1), torch.float8_e4m3fn)
# [Provenance debug handles] cpp_fused_dequantize_per_tensor_quantize_per_tensor_relu_0:1
cpp_fused_dequantize_per_tensor_quantize_per_tensor_relu_0(arg0_1, buf0)
del arg0_1
return (buf0, )
```
- After
```python
cpp_fused_dequantize_per_tensor_quantize_per_tensor_relu_0 = async_compile.cpp_pybinding(['const at::Float8_e4m3fn*', 'at::Float8_e4m3fn*'], r'''
#include <torch/csrc/inductor/cpp_prefix.h>
extern "C" void kernel(const at::Float8_e4m3fn* in_ptr0,
at::Float8_e4m3fn* out_ptr0)
{
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(441L); x0+=static_cast<int64_t>(16L))
{
{
if(C10_LIKELY(x0 >= static_cast<int64_t>(0) && x0 < static_cast<int64_t>(432L)))
{
auto tmp0 = at::vec::Vectorized<at::Float8_e4m3fn>::loadu(in_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(16));
auto tmp1 = at::vec::convert<float>(tmp0);
auto tmp2 = static_cast<float>(100.0);
auto tmp3 = at::vec::Vectorized<float>(tmp2);
auto tmp4 = tmp1 - tmp3;
auto tmp5 = static_cast<float>(0.01);
auto tmp6 = at::vec::Vectorized<float>(tmp5);
auto tmp7 = tmp4 * tmp6;
auto tmp8 = (tmp7);
auto tmp9 = at::vec::clamp_min(tmp8, decltype(tmp8)(0));
auto tmp10 = tmp9 * tmp3;
auto tmp11 = tmp10.round();
auto tmp12 = tmp11 + tmp3;
auto tmp13 = static_cast<float>(-128.0);
auto tmp14 = at::vec::Vectorized<float>(tmp13);
auto tmp15 = at::vec::maximum(tmp12, tmp14);
auto tmp16 = static_cast<float>(127.0);
auto tmp17 = at::vec::Vectorized<float>(tmp16);
auto tmp18 = at::vec::minimum(tmp15, tmp17);
auto tmp19 = at::vec::convert<at::Float8_e4m3fn>(tmp18);
tmp19.store(out_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(16));
}
if(C10_UNLIKELY(x0 >= static_cast<int64_t>(432L) && x0 < static_cast<int64_t>(441L)))
{
auto tmp0 = at::vec::Vectorized<at::Float8_e4m3fn>::loadu(in_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(9L));
auto tmp1 = at::vec::convert<float>(tmp0);
auto tmp2 = static_cast<float>(100.0);
auto tmp3 = at::vec::Vectorized<float>(tmp2);
auto tmp4 = tmp1 - tmp3;
auto tmp5 = static_cast<float>(0.01);
auto tmp6 = at::vec::Vectorized<float>(tmp5);
auto tmp7 = tmp4 * tmp6;
auto tmp8 = (tmp7);
auto tmp9 = at::vec::clamp_min(tmp8, decltype(tmp8)(0));
auto tmp10 = tmp9 * tmp3;
auto tmp11 = tmp10.round();
auto tmp12 = tmp11 + tmp3;
auto tmp13 = static_cast<float>(-128.0);
auto tmp14 = at::vec::Vectorized<float>(tmp13);
auto tmp15 = at::vec::maximum(tmp12, tmp14);
auto tmp16 = static_cast<float>(127.0);
auto tmp17 = at::vec::Vectorized<float>(tmp16);
auto tmp18 = at::vec::minimum(tmp15, tmp17);
auto tmp19 = at::vec::convert<at::Float8_e4m3fn>(tmp18);
tmp19.store(out_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(9L));
}
}
}
}
}
''')
async_compile.wait(globals())
del async_compile
class Runner:
def __init__(self, partitions):
self.partitions = partitions
def recursively_apply_fns(self, fns):
new_callables = []
for fn, c in zip(fns, self.partitions):
new_callables.append(fn(c))
self.partitions = new_callables
def call(self, args):
arg0_1, = args
args.clear()
assert_size_stride(arg0_1, (1, 7, 7, 9), (441, 63, 9, 1))
buf0 = empty_strided_cpu((1, 7, 7, 9), (441, 63, 9, 1), torch.float8_e4m3fn)
# [Provenance debug handles] cpp_fused_dequantize_per_tensor_quantize_per_tensor_relu_0:1
cpp_fused_dequantize_per_tensor_quantize_per_tensor_relu_0(arg0_1, buf0)
del arg0_1
return (buf0, )
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163324
Approved by: https://github.com/Xia-Weiwen , https://github.com/mingfeima , https://github.com/jansel
ghstack dependencies: #163316
2025-10-20 01:56:00 +00:00
Sun, Jiayi
e9d8973427
[Inductor] support masked vectorization for the tail_loop for float64 datatype ( #163316 )
...
**Summary:**
Support masked vectorization for the tail_loop for float64 datatype.
**Example:**
```python
import torch
def fn(x):
return x * x
x = torch.randn((22, 22), dtype=torch.double)
with torch.no_grad():
compiled_fn = torch.compile(fn)
compiled_fn(x)
```
**Generated code:**
- Before
```python
cpp_fused_mul_0 = async_compile.cpp_pybinding(['const double*', 'double*'], r'''
#include <torch/csrc/inductor/cpp_prefix.h>
extern "C" void kernel(const double* in_ptr0,
double* out_ptr0)
{
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(484L); x0+=static_cast<int64_t>(16L))
{
{
if(C10_LIKELY(x0 >= static_cast<int64_t>(0) && x0 < static_cast<int64_t>(480L)))
{
auto tmp0 = at::vec::VectorizedN<double,2>::loadu(in_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(16));
auto tmp1 = tmp0 * tmp0;
tmp1.store(out_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(16));
}
if(C10_UNLIKELY(x0 >= static_cast<int64_t>(480L) && x0 < static_cast<int64_t>(484L)))
{
for (int64_t x0_tail = static_cast<int64_t>(480L);x0_tail < static_cast<int64_t>(484L); x0_tail++)
{
auto tmp0 = in_ptr0[static_cast<int64_t>(x0_tail)];
auto tmp1 = double(tmp0 * tmp0);
out_ptr0[static_cast<int64_t>(x0_tail)] = tmp1;
}
}
}
}
}
}
''')
async_compile.wait(globals())
del async_compile
class Runner:
def __init__(self, partitions):
self.partitions = partitions
def recursively_apply_fns(self, fns):
new_callables = []
for fn, c in zip(fns, self.partitions):
new_callables.append(fn(c))
self.partitions = new_callables
def call(self, args):
arg0_1, = args
args.clear()
assert_size_stride(arg0_1, (22, 22), (22, 1))
buf0 = empty_strided_cpu((22, 22), (22, 1), torch.float64)
# [Provenance debug handles] cpp_fused_mul_0:1
cpp_fused_mul_0(arg0_1, buf0)
del arg0_1
return (buf0, )
```
- After
```python
cpp_fused_mul_0 = async_compile.cpp_pybinding(['const double*', 'double*'], r'''
#include <torch/csrc/inductor/cpp_prefix.h>
extern "C" void kernel(const double* in_ptr0,
double* out_ptr0)
{
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(484L); x0+=static_cast<int64_t>(16L))
{
{
if(C10_LIKELY(x0 >= static_cast<int64_t>(0) && x0 < static_cast<int64_t>(480L)))
{
auto tmp0 = at::vec::VectorizedN<double,2>::loadu(in_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(16));
auto tmp1 = tmp0 * tmp0;
tmp1.store(out_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(16));
}
if(C10_UNLIKELY(x0 >= static_cast<int64_t>(480L) && x0 < static_cast<int64_t>(484L)))
{
auto tmp0 = at::vec::VectorizedN<double,2>::loadu(in_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(4L));
auto tmp1 = tmp0 * tmp0;
tmp1.store(out_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(4L));
}
}
}
}
}
''')
async_compile.wait(globals())
del async_compile
class Runner:
def __init__(self, partitions):
self.partitions = partitions
def recursively_apply_fns(self, fns):
new_callables = []
for fn, c in zip(fns, self.partitions):
new_callables.append(fn(c))
self.partitions = new_callables
def call(self, args):
arg0_1, = args
args.clear()
assert_size_stride(arg0_1, (22, 22), (22, 1))
buf0 = empty_strided_cpu((22, 22), (22, 1), torch.float64)
# [Provenance debug handles] cpp_fused_mul_0:1
cpp_fused_mul_0(arg0_1, buf0)
del arg0_1
return (buf0, )
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163316
Approved by: https://github.com/mingfeima , https://github.com/jansel
2025-10-20 01:41:38 +00:00
Yuanyuan Chen
3255e7872b
Enable all flake8-logging-format rules ( #164655 )
...
These rules are enabled by removing existing suppressions.
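For context, the G rules flag eager string interpolation in logging calls; an illustrative before/after (not code from the PR):
```python
import logging

logger = logging.getLogger(__name__)
n = 3
# Flagged (G004): logger.info(f"loaded {n} items")
logger.info("loaded %d items", n)  # lazy %-style arguments instead
```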
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164655
Approved by: https://github.com/janeyx99 , https://github.com/mlazos
2025-10-19 00:59:28 +00:00
Maggie Moss
f02e3947f6
Expand type checking to mypy strict files ( #165697 )
...
Expands Pyrefly type checking to check the files outlined in the mypy-strict.ini configuration file.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165697
Approved by: https://github.com/ezyang
2025-10-18 04:34:45 +00:00
drisspg
de3da77cf7
Thread deterministic config vars to subproc compilation ( #165729 )
...
# Summary
TIL (AFTER WAYYYY TOO MUCH INSANITY) that we do not serialize the full set of configs for the subproc compilation.
I found this while working on Flex-attention determinism: https://github.com/meta-pytorch/attention-gym/pull/168
Might be good to audit whether we need to thread through any more.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165729
Approved by: https://github.com/shunting314 , https://github.com/eellison
2025-10-18 01:25:50 +00:00
PyTorch MergeBot
2928c5c572
Revert "Pyrefly suppressions 2 ( #165692 )"
...
This reverts commit 43d78423ac.
Reverted https://github.com/pytorch/pytorch/pull/165692 on behalf of https://github.com/seemethere due to This is causing merge conflicts when attempting to land internally, see D84890919 for more details ([comment](https://github.com/pytorch/pytorch/pull/165692#issuecomment-3416397240 ))
2025-10-17 17:13:04 +00:00
Isuru Fernando
3af2f0c12a
[inductor] require shape in TritonCSEVariable ( #162275 )
...
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162275
Approved by: https://github.com/mlazos
ghstack dependencies: #164158
2025-10-17 14:47:45 +00:00
Yuanyuan Chen
e925dfcc6b
Enable all SIM rules except disabled ones ( #164645 )
...
`SIM` rules are useful for simplifying boolean expressions and enhance code readability.
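An illustrative SIM-style simplification (not code from the PR):
```python
# Before (flagged by SIM103):
#   if x > 0:
#       return True
#   return False
def is_positive(x: int) -> bool:
    return x > 0
```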
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164645
Approved by: https://github.com/ezyang , https://github.com/mlazos
2025-10-17 07:27:11 +00:00
Maggie Moss
43d78423ac
Pyrefly suppressions 2 ( #165692 )
...
This is the last directory to opt in for the regular mypy.ini file. Will put up a diff to remove unused ignores before making sure we're also type checking all the files in the mypy strict configurations.
Test plan:
dmypy restart && python3 scripts/lintrunner.py -a
pyrefly check
step 1: delete lines in the pyrefly.toml file from the project-excludes field
step 2: run pyrefly check
step 3: add suppressions, clean up unused suppressions
before: https://gist.github.com/maggiemoss/4b3bf2037014e116bc00706a16aef199
after:
INFO 0 errors (6,884 ignored)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165692
Approved by: https://github.com/oulgen
2025-10-17 04:15:25 +00:00
eellison
861cdb887b
use statically_known_leq & *=2 instead of bound_sympy in persistent rblock ( #165657 )
...
While these should be equivalent, we've found instances where they are not, and an error was caused. Update until we figure out the underlying issue.
Differential Revision: [D84835898](https://our.internmc.facebook.com/intern/diff/D84835898 )
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165657
Approved by: https://github.com/bobrenjc93
2025-10-17 02:48:03 +00:00
Mu-Chu Lee
9fccbdd4f0
Fix incorrect function signature in template ( #165567 )
...
Summary:
In https://github.com/pytorch/pytorch/pull/148305 we refactored the grid argument out, but this was not reflected in our template.
Test Plan:
Included in commit.
python test/inductor/test_aot_inductor.py
AOTInductorTestABICompatibleGpu.test_cond_symint_input_disable_one_pass_cuda
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165567
Approved by: https://github.com/desertfire
2025-10-17 02:40:56 +00:00
Maggie Moss
5641de7b6b
Add suppressions for _inductor/codegen ( #165659 )
...
Adds suppressions so pyrefly will typecheck clean: https://github.com/pytorch/pytorch/issues/163283
Test plan:
dmypy restart && python3 scripts/lintrunner.py -a
pyrefly check
step 1: delete lines in the pyrefly.toml file from the project-excludes field
step 2: run pyrefly check
step 3: add suppressions, clean up unused suppressions
before: https://gist.github.com/maggiemoss/4b3bf2037014e116bc00706a16aef199
after:
INFO 0 errors (6,884 ignored)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165659
Approved by: https://github.com/oulgen
2025-10-16 21:37:37 +00:00
PyTorch MergeBot
fb06e49ce8
Revert "[inductor] print 0.0 as 0 for triton ( #164291 )"
...
This reverts commit 99b32a6750.
Reverted https://github.com/pytorch/pytorch/pull/164291 on behalf of https://github.com/malfet due to Broke slow job, see aba8c43594/1 ([comment](https://github.com/pytorch/pytorch/pull/164291#issuecomment-3412768915 ))
2025-10-16 20:44:29 +00:00
Isuru Fernando
99b32a6750
[inductor] print 0.0 as 0 for triton ( #164291 )
...
Fixes https://github.com/pytorch/pytorch/issues/164157
Fixes https://github.com/pytorch/pytorch/issues/164086
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164291
Approved by: https://github.com/bobrenjc93
2025-10-16 16:37:50 +00:00
Nan Zhang
00afa06800
Add cse for make_block_ptr in Triton codegen ( #163399 )
...
Summary: per title
Test Plan: added test cases
Differential Revision: D82648215
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163399
Approved by: https://github.com/jansel , https://github.com/njriasan
2025-10-16 05:29:48 +00:00
xinan.lin
e5a9c247bc
[Fix XPU CI] [Inductor UT] Fix test cases broken by community. ( #165406 )
...
Fixes #163159 , Fixes #164098 , Fixes #164097 , Fixes #164099 , Fixes #165025
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165406
Approved by: https://github.com/EikanWang , https://github.com/jansel
2025-10-16 00:53:32 +00:00
PaulZhang12
901bbcba12
Gate division bitwise numerics under a flag ( #165566 )
...
https://github.com/pytorch/pytorch/pull/164144 ensures that division under compile is bitwise equivalent with eager. However, in https://github.com/pytorch/pytorch/issues/164301, the kernel performance regressed.
On B200:
- With standard Triton `/`: 6511 GB/s
- With Triton `div_rn`: 4692 GB/s
Further investigation of the generated PTX is required to see why there is such a large slowdown. For now, bitwise-equivalent results are enabled under `TORCHINDUCTOR_EMULATE_DIVISION_ROUNDING`, similar to emulate_precision_cast.
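Opting in, per the variable named above (sketch; the workload is illustrative, and the variable must be set before torch is imported):
```python
import os

os.environ["TORCHINDUCTOR_EMULATE_DIVISION_ROUNDING"] = "1"

import torch

a, b = torch.randn(8), torch.randn(8)
y = torch.compile(lambda a, b: a / b)(a, b)  # division matches eager bitwise
```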
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165566
Approved by: https://github.com/ngimel , https://github.com/eellison
2025-10-15 23:41:01 +00:00
Boyuan Feng
f071f17911
[Graph Partition] fix partition x memory plan issue ( #165514 )
...
For `test_graph_partition_with_memory_plan_reuse`, before this PR, when using graph partition, it would error ([P1992728479](https://www.internalfb.com/phabricator/paste/view/P1992728479 )):
```python
def partition_0(args):
...
del buf0
return (buf3, buf4, buf5, buf2, primals_4, )
...
File "/tmp/torchinductor_boyuan/ww/cwwc7ukfqscg2vy6ankby2fizdb377tvgyx3fwdgddrxe3g47jg6.py", line 132, in partition_0
return (buf3, buf4, buf5, buf2, primals_4, )
^^^^
NameError: name 'buf2' is not defined. Did you mean: 'buf0'?
```
When not using graph partition, it would work and give the following code ([P1992997521](https://www.internalfb.com/phabricator/paste/view/P1992997521 )):
```python
def call(self, args):
...
buf2 = buf0; del buf0 # reuse
...
```
Note that the issue is that buf0 is not reused for buf2 when using graph partition.
Why? Because the codegen runs `run_wrapper_ir_passes` and `memory_plan_reuse`, which pop trailing `MemoryPlanningLine`s unless they are in the graph output, as determined by `V.graph.get_output_names()`. However, for graph partition, we should check the outputs of the current partition instead of those of the pre-partition graph.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165514
Approved by: https://github.com/ProExpertProg , https://github.com/eellison
2025-10-15 21:52:16 +00:00
PyTorch MergeBot
84d141e910
Revert "[inductor] Expand use of generic benchmark function ( #164938 )"
...
This reverts commit 5c583e2573.
Reverted https://github.com/pytorch/pytorch/pull/164938 on behalf of https://github.com/clee2000 due to I think this broke test/inductor/test_cuda_repro.py::CudaReproTests::test_epilogue_fusion_with_view? [GH job link](https://github.com/pytorch/pytorch/actions/runs/18529735968/job/52813191763 ) [HUD commit link](f58f301313 ) on both rocm and the slow grad check for linux. It did run successfully on cuda workflow on trunk, I wonder if this a gpu capability thing? no clue though ([comment](https://github.com/pytorch/pytorch/pull/164938#issuecomment-3407600224 ))
2025-10-15 17:48:38 +00:00
Mwiza Kunda
5c583e2573
[inductor] Expand use of generic benchmark function ( #164938 )
...
Use the more generic `Benchmarker.benchmark` function to allow benchmarking other devices that support the required functionality, for example prologue and epilogue fusion can be benchmarked for triton CPU.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164938
Approved by: https://github.com/nmacchioni , https://github.com/eellison
2025-10-15 09:18:24 +00:00