Mirror of https://github.com/zebrajr/pytorch.git, synced 2025-12-06 12:20:52 +01:00
Latest commit 6c4412f72b · 314 Commits
| Author | SHA1 | Message | Date | |
|---|---|---|---|---|
|
|
6c4412f72b |
Revert "[Inductor] support masked vectorization for the tail_loop for float64 datatype (#163316)"
This reverts commit |
||
|
|
78bf6186f2 |
Revert "[Inductor] support masked vectorization for the tail_loop for fp8 datatype (#163324)"
This reverts commit |
||
|
|
e8cb34dd52 |
[Inductor] support masked vectorization for the tail_loop for fp8 datatype (#163324)
**Summary:**
Support masked vectorization for the tail_loop for fp8 datatype.
**Example:**
```
import torch
def fn(
x,
scale,
zero_point,
quant_min,
quant_max,
dtype,
):
x = torch.ops.quantized_decomposed.dequantize_per_tensor(
x,
scale,
zero_point,
quant_min,
quant_max,
dtype,
)
x = torch.relu(x)
x = torch.ops.quantized_decomposed.quantize_per_tensor(
x, scale, zero_point, quant_min, quant_max, dtype
)
return x
quant_min = -128
quant_max = 127
dtype = torch.float8_e4m3fn
x = torch.clamp(torch.randn((1, 7, 7, 9), dtype=torch.float32) * 100, quant_min, quant_max).to(dtype)
zero_point = 100
scale = 0.01
with torch.no_grad():
compiled_fn = torch.compile(fn)
compiled_fn(x, scale, zero_point, quant_min, quant_max, dtype)
```
**Generated code:**
- Before
```
cpp_fused_dequantize_per_tensor_quantize_per_tensor_relu_0 = async_compile.cpp_pybinding(['const at::Float8_e4m3fn*', 'at::Float8_e4m3fn*'], r'''
#include <torch/csrc/inductor/cpp_prefix.h>
extern "C" void kernel(const at::Float8_e4m3fn* in_ptr0,
at::Float8_e4m3fn* out_ptr0)
{
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(441L); x0+=static_cast<int64_t>(16L))
{
{
if(C10_LIKELY(x0 >= static_cast<int64_t>(0) && x0 < static_cast<int64_t>(432L)))
{
auto tmp0 = at::vec::Vectorized<at::Float8_e4m3fn>::loadu(in_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(16));
auto tmp1 = at::vec::convert<float>(tmp0);
auto tmp2 = static_cast<float>(100.0);
auto tmp3 = at::vec::Vectorized<float>(tmp2);
auto tmp4 = tmp1 - tmp3;
auto tmp5 = static_cast<float>(0.01);
auto tmp6 = at::vec::Vectorized<float>(tmp5);
auto tmp7 = tmp4 * tmp6;
auto tmp8 = (tmp7);
auto tmp9 = at::vec::clamp_min(tmp8, decltype(tmp8)(0));
auto tmp10 = tmp9 * tmp3;
auto tmp11 = tmp10.round();
auto tmp12 = tmp11 + tmp3;
auto tmp13 = static_cast<float>(-128.0);
auto tmp14 = at::vec::Vectorized<float>(tmp13);
auto tmp15 = at::vec::maximum(tmp12, tmp14);
auto tmp16 = static_cast<float>(127.0);
auto tmp17 = at::vec::Vectorized<float>(tmp16);
auto tmp18 = at::vec::minimum(tmp15, tmp17);
auto tmp19 = at::vec::convert<at::Float8_e4m3fn>(tmp18);
tmp19.store(out_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(16));
}
if(C10_UNLIKELY(x0 >= static_cast<int64_t>(432L) && x0 < static_cast<int64_t>(441L)))
{
for (int64_t x0_tail = static_cast<int64_t>(432L);x0_tail < static_cast<int64_t>(441L); x0_tail++)
{
auto tmp0 = in_ptr0[static_cast<int64_t>(x0_tail)];
auto tmp1 = c10::convert<float>(tmp0);
auto tmp2 = static_cast<float>(100.0);
auto tmp3 = float(tmp1 - tmp2);
auto tmp4 = static_cast<float>(0.01);
auto tmp5 = float(tmp3 * tmp4);
auto tmp6 = c10::convert<float>(tmp5);
auto tmp7 = std::max(tmp6, decltype(tmp6)(0));
auto tmp8 = float(tmp7 * tmp2);
auto tmp9 = std::nearbyint(tmp8);
auto tmp10 = float(tmp9 + tmp2);
auto tmp11 = static_cast<float>(-128.0);
auto tmp12 = max_propagate_nan(tmp10, tmp11);
auto tmp13 = static_cast<float>(127.0);
auto tmp14 = min_propagate_nan(tmp12, tmp13);
auto tmp15 = c10::convert<at::Float8_e4m3fn>(tmp14);
out_ptr0[static_cast<int64_t>(x0_tail)] = tmp15;
}
}
}
}
}
}
''')
async_compile.wait(globals())
del async_compile
class Runner:
def __init__(self, partitions):
self.partitions = partitions
def recursively_apply_fns(self, fns):
new_callables = []
for fn, c in zip(fns, self.partitions):
new_callables.append(fn(c))
self.partitions = new_callables
def call(self, args):
arg0_1, = args
args.clear()
assert_size_stride(arg0_1, (1, 7, 7, 9), (441, 63, 9, 1))
buf0 = empty_strided_cpu((1, 7, 7, 9), (441, 63, 9, 1), torch.float8_e4m3fn)
# [Provenance debug handles] cpp_fused_dequantize_per_tensor_quantize_per_tensor_relu_0:1
cpp_fused_dequantize_per_tensor_quantize_per_tensor_relu_0(arg0_1, buf0)
del arg0_1
return (buf0, )
```
- After
```
cpp_fused_dequantize_per_tensor_quantize_per_tensor_relu_0 = async_compile.cpp_pybinding(['const at::Float8_e4m3fn*', 'at::Float8_e4m3fn*'], r'''
#include <torch/csrc/inductor/cpp_prefix.h>
extern "C" void kernel(const at::Float8_e4m3fn* in_ptr0,
at::Float8_e4m3fn* out_ptr0)
{
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(441L); x0+=static_cast<int64_t>(16L))
{
{
if(C10_LIKELY(x0 >= static_cast<int64_t>(0) && x0 < static_cast<int64_t>(432L)))
{
auto tmp0 = at::vec::Vectorized<at::Float8_e4m3fn>::loadu(in_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(16));
auto tmp1 = at::vec::convert<float>(tmp0);
auto tmp2 = static_cast<float>(100.0);
auto tmp3 = at::vec::Vectorized<float>(tmp2);
auto tmp4 = tmp1 - tmp3;
auto tmp5 = static_cast<float>(0.01);
auto tmp6 = at::vec::Vectorized<float>(tmp5);
auto tmp7 = tmp4 * tmp6;
auto tmp8 = (tmp7);
auto tmp9 = at::vec::clamp_min(tmp8, decltype(tmp8)(0));
auto tmp10 = tmp9 * tmp3;
auto tmp11 = tmp10.round();
auto tmp12 = tmp11 + tmp3;
auto tmp13 = static_cast<float>(-128.0);
auto tmp14 = at::vec::Vectorized<float>(tmp13);
auto tmp15 = at::vec::maximum(tmp12, tmp14);
auto tmp16 = static_cast<float>(127.0);
auto tmp17 = at::vec::Vectorized<float>(tmp16);
auto tmp18 = at::vec::minimum(tmp15, tmp17);
auto tmp19 = at::vec::convert<at::Float8_e4m3fn>(tmp18);
tmp19.store(out_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(16));
}
if(C10_UNLIKELY(x0 >= static_cast<int64_t>(432L) && x0 < static_cast<int64_t>(441L)))
{
auto tmp0 = at::vec::Vectorized<at::Float8_e4m3fn>::loadu(in_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(9L));
auto tmp1 = at::vec::convert<float>(tmp0);
auto tmp2 = static_cast<float>(100.0);
auto tmp3 = at::vec::Vectorized<float>(tmp2);
auto tmp4 = tmp1 - tmp3;
auto tmp5 = static_cast<float>(0.01);
auto tmp6 = at::vec::Vectorized<float>(tmp5);
auto tmp7 = tmp4 * tmp6;
auto tmp8 = (tmp7);
auto tmp9 = at::vec::clamp_min(tmp8, decltype(tmp8)(0));
auto tmp10 = tmp9 * tmp3;
auto tmp11 = tmp10.round();
auto tmp12 = tmp11 + tmp3;
auto tmp13 = static_cast<float>(-128.0);
auto tmp14 = at::vec::Vectorized<float>(tmp13);
auto tmp15 = at::vec::maximum(tmp12, tmp14);
auto tmp16 = static_cast<float>(127.0);
auto tmp17 = at::vec::Vectorized<float>(tmp16);
auto tmp18 = at::vec::minimum(tmp15, tmp17);
auto tmp19 = at::vec::convert<at::Float8_e4m3fn>(tmp18);
tmp19.store(out_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(9L));
}
}
}
}
}
''')
async_compile.wait(globals())
del async_compile
class Runner:
def __init__(self, partitions):
self.partitions = partitions
def recursively_apply_fns(self, fns):
new_callables = []
for fn, c in zip(fns, self.partitions):
new_callables.append(fn(c))
self.partitions = new_callables
def call(self, args):
arg0_1, = args
args.clear()
assert_size_stride(arg0_1, (1, 7, 7, 9), (441, 63, 9, 1))
buf0 = empty_strided_cpu((1, 7, 7, 9), (441, 63, 9, 1), torch.float8_e4m3fn)
# [Provenance debug handles] cpp_fused_dequantize_per_tensor_quantize_per_tensor_relu_0:1
cpp_fused_dequantize_per_tensor_quantize_per_tensor_relu_0(arg0_1, buf0)
del arg0_1
return (buf0, )
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163324
Approved by: https://github.com/Xia-Weiwen, https://github.com/mingfeima, https://github.com/jansel
ghstack dependencies: #163316
|
||
|
|
e9d8973427 |
[Inductor] support masked vectorization for the tail_loop for float64 datatype (#163316)
**Summary:**
Support masked vectorization for the tail_loop for float64 datatype.
**Example:**
```
import torch
def fn(x):
return x * x
x = torch.randn((22, 22), dtype=torch.double)
with torch.no_grad():
compiled_fn = torch.compile(fn)
compiled_fn(x)
```
**Generated code:**
- Before
```
cpp_fused_mul_0 = async_compile.cpp_pybinding(['const double*', 'double*'], r'''
#include <torch/csrc/inductor/cpp_prefix.h>
extern "C" void kernel(const double* in_ptr0,
double* out_ptr0)
{
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(484L); x0+=static_cast<int64_t>(16L))
{
{
if(C10_LIKELY(x0 >= static_cast<int64_t>(0) && x0 < static_cast<int64_t>(480L)))
{
auto tmp0 = at::vec::VectorizedN<double,2>::loadu(in_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(16));
auto tmp1 = tmp0 * tmp0;
tmp1.store(out_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(16));
}
if(C10_UNLIKELY(x0 >= static_cast<int64_t>(480L) && x0 < static_cast<int64_t>(484L)))
{
for (int64_t x0_tail = static_cast<int64_t>(480L);x0_tail < static_cast<int64_t>(484L); x0_tail++)
{
auto tmp0 = in_ptr0[static_cast<int64_t>(x0_tail)];
auto tmp1 = double(tmp0 * tmp0);
out_ptr0[static_cast<int64_t>(x0_tail)] = tmp1;
}
}
}
}
}
}
''')
async_compile.wait(globals())
del async_compile
class Runner:
def __init__(self, partitions):
self.partitions = partitions
def recursively_apply_fns(self, fns):
new_callables = []
for fn, c in zip(fns, self.partitions):
new_callables.append(fn(c))
self.partitions = new_callables
def call(self, args):
arg0_1, = args
args.clear()
assert_size_stride(arg0_1, (22, 22), (22, 1))
buf0 = empty_strided_cpu((22, 22), (22, 1), torch.float64)
# [Provenance debug handles] cpp_fused_mul_0:1
cpp_fused_mul_0(arg0_1, buf0)
del arg0_1
return (buf0, )
```
- After
```
cpp_fused_mul_0 = async_compile.cpp_pybinding(['const double*', 'double*'], r'''
#include <torch/csrc/inductor/cpp_prefix.h>
extern "C" void kernel(const double* in_ptr0,
double* out_ptr0)
{
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(484L); x0+=static_cast<int64_t>(16L))
{
{
if(C10_LIKELY(x0 >= static_cast<int64_t>(0) && x0 < static_cast<int64_t>(480L)))
{
auto tmp0 = at::vec::VectorizedN<double,2>::loadu(in_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(16));
auto tmp1 = tmp0 * tmp0;
tmp1.store(out_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(16));
}
if(C10_UNLIKELY(x0 >= static_cast<int64_t>(480L) && x0 < static_cast<int64_t>(484L)))
{
auto tmp0 = at::vec::VectorizedN<double,2>::loadu(in_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(4L));
auto tmp1 = tmp0 * tmp0;
tmp1.store(out_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(4L));
}
}
}
}
}
''')
async_compile.wait(globals())
del async_compile
class Runner:
def __init__(self, partitions):
self.partitions = partitions
def recursively_apply_fns(self, fns):
new_callables = []
for fn, c in zip(fns, self.partitions):
new_callables.append(fn(c))
self.partitions = new_callables
def call(self, args):
arg0_1, = args
args.clear()
assert_size_stride(arg0_1, (22, 22), (22, 1))
buf0 = empty_strided_cpu((22, 22), (22, 1), torch.float64)
# [Provenance debug handles] cpp_fused_mul_0:1
cpp_fused_mul_0(arg0_1, buf0)
del arg0_1
return (buf0, )
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163316
Approved by: https://github.com/mingfeima, https://github.com/jansel
|
||
|
|
a8c528c105 |
[1/N] Apply UP035 rule in tests (#163947)
Apply the UP035 `ruff` rule in tests; some tests for `fx` and `dynamo` are excluded because the old typing constructs are themselves the test target. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163947 Approved by: https://github.com/ezyang |
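For context, a minimal illustration of the kind of rewrite UP035 (deprecated typing imports) asks for; the snippet below is a hypothetical example and is not taken from the PR:
```python
# Before: UP035 flags imports of deprecated typing aliases.
from typing import Dict, List

def bucket(names: List[str]) -> Dict[str, int]:
    return {n: len(n) for n in names}

# After: use the built-in generics available since Python 3.9.
def bucket_new(names: list[str]) -> dict[str, int]:
    return {n: len(n) for n in names}
```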
||
|
|
c106ee8515 |
[FakeTensor] Supplement the relevant logic for converting conv1d to conv2d in meta_conv (#160408)
## Fixes https://github.com/pytorch/pytorch/issues/159462; also fixes #163569 and #163604. ## Summary The issue is caused by the wrong stride of conv1d's result generated by meta_conv: |
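A hedged, hypothetical repro sketch (not from the PR; it assumes the mismatch shows up when comparing the eager conv1d output stride with the one reported by the meta kernel):
```python
import torch

# Hypothetical check: compare the output stride of an eager conv1d with the
# stride reported by the meta implementation (meta_conv) for the same shapes.
conv = torch.nn.Conv1d(4, 8, kernel_size=3)
x = torch.randn(2, 4, 16)
eager_stride = conv(x).stride()

conv_meta = torch.nn.Conv1d(4, 8, kernel_size=3, device="meta")
meta_stride = conv_meta(torch.randn(2, 4, 16, device="meta")).stride()

print(eager_stride, meta_stride)  # these should agree after the fix
```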
||
|
|
3e1b1a30f2 |
Revert "[inductor] Fix issue with scalar arg handling" (#163737)
This reverts commit a8cd437183142e17ba6fc8d7b5e9dcee462d7904. See https://github.com/pytorch/pytorch/pull/163481#issuecomment-3326310774 This PR might also cause issues with cudagraphs. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163737 Approved by: https://github.com/ezyang ghstack dependencies: #163386, #163398, #163387, #163414, #163415, #163419, #163434, #163393, #163412, #163422, #163481, #163520, #163482 |
||
|
|
d746b987d8 |
[inductor] Fix divmod error in decomp (#163482)
Fixes #163457 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163482 Approved by: https://github.com/eellison ghstack dependencies: #163386, #163398, #163387, #163414, #163415, #163419, #163434, #163393, #163412, #163422, #163481, #163520 |
||
|
|
ca512af3e7 |
[inductor] Fix issue with scalar arg handling (#163481)
Fixes #163420 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163481 Approved by: https://github.com/eellison ghstack dependencies: #163386, #163398, #163387, #163414, #163415, #163419, #163434, #163393, #163412, #163422 |
||
|
|
a1bd9248eb |
[inductor] Fallback on strided complex add (#163387)
Fixes #163243 Fixes #162561 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163387 Approved by: https://github.com/eellison ghstack dependencies: #163386, #163398 |
||
|
|
8225a26835 |
[dynamo] Fix issue with namedtuple slicing (#163351)
Fixes #163253 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163351 Approved by: https://github.com/williamwen42, https://github.com/mlazos |
||
|
|
23b033452f |
[Inductor][CPP] Fix layout for local buf in outer loop fusion (#160857)
Fixes #159154 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160857 Approved by: https://github.com/leslie-fang-intel, https://github.com/jansel |
||
|
|
de744ca4b1 |
[Inductor] modify convert_to_reinterpret_view (#158914)
**Summary:** Fix https://github.com/pytorch/pytorch/issues/159121, Modify the rules for freezing the layout of `x.unwrap_view()` in `convert_to_reinterpret_view`: relax the condition of `isinstance(x_unwrap_view, (ReinterpretView, Buffer))` to `isinstance(x_unwrap_view, (ReinterpretView, Buffer, MutableBox))`. Prefer channels last format according to how the format of `x_unwrap_view_fx_node` is set from eager. **Example:** ``` import torch import torch.nn as nn class M(nn.Module): def __init__(self): super(M, self).__init__() self.relu = torch.nn.ReLU() def forward(self, x): n, c, h, w = x.shape return self.relu(x).permute(0, 2, 3, 1).reshape( n, h * w, c ) model = M().eval() x = torch.randn(2, 32, 4, 4).to(memory_format=torch.channels_last) compiled_model = torch.compile(model) with torch.no_grad(): compiled_model(x) ``` **Generated code:** - before ``` cpp_fused_permute_relu_view_0 = async_compile.cpp_pybinding(['const float*', 'float*', 'float*'], ''' #include <torch/csrc/inductor/cpp_prefix.h> extern "C" void kernel(const float* in_ptr0, float* out_ptr0, float* out_ptr1) { { #pragma GCC ivdep for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(2L); x0+=static_cast<int64_t>(1L)) { for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(32L); x1+=static_cast<int64_t>(16L)) { for(int64_t x2=static_cast<int64_t>(0L); x2<static_cast<int64_t>(16L); x2+=static_cast<int64_t>(16L)) { { if(C10_LIKELY(x1 >= static_cast<int64_t>(0) && x1 < static_cast<int64_t>(32L) && x2 >= static_cast<int64_t>(0) && x2 < static_cast<int64_t>(16L))) { alignas(std::max(std::size_t(16), alignof(float))) float tmp0[16*16]; transpose_mxn<float,static_cast<int64_t>(16),static_cast<int64_t>(16),false>(in_ptr0 + static_cast<int64_t>(x1 + 32L*x2 + 512L*x0), static_cast<int64_t>(32L), tmp0, static_cast<int64_t>(16)); for (long x1_inner = 0; x1_inner < static_cast<int64_t>(16); x1_inner++) { auto tmp1 = at::vec::Vectorized<float>::loadu(tmp0 + static_cast<int64_t>(16L*x1_inner), static_cast<int64_t>(16)); auto tmp2 = at::vec::clamp_min(tmp1, decltype(tmp1)(0)); tmp2.store(out_ptr0 + static_cast<int64_t>(x2 + 16L*x1 + 16L*x1_inner + 512L*x0)); } } } } } } } { #pragma GCC ivdep for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(2L); x0+=static_cast<int64_t>(1L)) { for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(16L); x1+=static_cast<int64_t>(16L)) { for(int64_t x2=static_cast<int64_t>(0L); x2<static_cast<int64_t>(32L); x2+=static_cast<int64_t>(16L)) { { if(C10_LIKELY(x1 >= static_cast<int64_t>(0) && x1 < static_cast<int64_t>(16L) && x2 >= static_cast<int64_t>(0) && x2 < static_cast<int64_t>(32L))) { alignas(std::max(std::size_t(16), alignof(float))) float tmp0[16*16]; transpose_mxn<float,static_cast<int64_t>(16),static_cast<int64_t>(16),false>(out_ptr0 + static_cast<int64_t>(x1 + 16L*x2 + 512L*x0), static_cast<int64_t>(16L), tmp0, static_cast<int64_t>(16)); for (long x1_inner = 0; x1_inner < static_cast<int64_t>(16); x1_inner++) { auto tmp1 = at::vec::Vectorized<float>::loadu(tmp0 + static_cast<int64_t>(16L*x1_inner), static_cast<int64_t>(16)); tmp1.store(out_ptr1 + static_cast<int64_t>(x2 + 32L*x1 + 32L*x1_inner + 512L*x0)); } } } } } } } } ''') async_compile.wait(globals()) del async_compile def call(args): arg0_1, = args args.clear() assert_size_stride(arg0_1, (2, 32, 4, 4), (512, 1, 128, 32)) buf0 = empty_strided_cpu((2, 32, 4, 4), (512, 16, 4, 1), torch.float32) buf1 = empty_strided_cpu((2, 16, 32), (512, 32, 1), torch.float32) cpp_fused_permute_relu_view_0(arg0_1, 
buf0, buf1) del arg0_1 return (buf1, ) ``` - After ``` cpp_fused_relu_0 = async_compile.cpp_pybinding(['const float*', 'float*'], ''' #include <torch/csrc/inductor/cpp_prefix.h> extern "C" void kernel(const float* in_ptr0, float* out_ptr0) { { for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(1024L); x0+=static_cast<int64_t>(16L)) { { if(C10_LIKELY(x0 >= static_cast<int64_t>(0) && x0 < static_cast<int64_t>(1024L))) { auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(16)); auto tmp1 = at::vec::clamp_min(tmp0, decltype(tmp0)(0)); tmp1.store(out_ptr0 + static_cast<int64_t>(x0)); } } } } } ''') async_compile.wait(globals()) del async_compile def call(args): arg0_1, = args args.clear() assert_size_stride(arg0_1, (2, 32, 4, 4), (512, 1, 128, 32)) buf0 = empty_strided_cpu((2, 32, 4, 4), (512, 1, 128, 32), torch.float32) cpp_fused_relu_0(arg0_1, buf0) del arg0_1 return (reinterpret_tensor(buf0, (2, 16, 32), (512, 32, 1), 0), ) ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/158914 Approved by: https://github.com/CaoE, https://github.com/jansel |
||
|
|
24f43d0da7 |
[inductor] [cpu] fix the dype hardcoded to int64 in store_reduction (#157904)
## Fixes https://github.com/pytorch/pytorch/issues/157683
## mini repro
* Just copy the code from the issue to reproduce it.
```python
import torch
device = "cpu"
# Input tensors
v2_0 = torch.randn(16, 24, 59, dtype=torch.complex64, device=device)
v3_0 = torch.randn(16, 24, 59, dtype=torch.complex64, device=device)
def my_model(v2_0, v3_0):
v6_0 = -v3_0
v4_0 = v2_0 * v3_0
v1_0 = v4_0.unsqueeze(-1).unsqueeze(-1).unsqueeze(-1).unsqueeze(-1)
v0_0 = v2_0.to(torch.int32)
v5_0 = v0_0.amax(dim=0)
return v6_0, v4_0, v1_0, v0_0, v5_0
v6_0, v4_0, v1_0, v0_0, v5_0 = my_model(v2_0, v3_0)
print("v6_0", v6_0.shape)
print("v4_0", v4_0.shape)
compiled_model = torch.compile(my_model, backend="inductor")
v6_0, v4_0, v1_0, v0_0, v5_0 = compiled_model(v2_0, v3_0)
print("v6_0", v6_0.shape)
print("v4_0", v4_0.shape)
print("v1_0", v1_0.shape)
print("v0_0", v0_0.shape)
print("v5_0", v5_0.shape)
```
Error stack:
```
/home/admin/pytorch/pytorch/torch/include/ATen/cpu/vec/vec_convert.h:41:1: note: candidate: ‘template<class dst_t, class src_t> std::enable_if_t<(! is_same_v<dst_t, src_t>), at::vec::CPU_CAPABILITY::Vectorized<T> > at::vec::CPU_CAPABILITY::convert(const at::vec::CPU_CAPABILITY::Vectorized<T>&)’
41 | convert(const Vectorized<src_t>& src) {
| ^~~~~~~
/home/admin/pytorch/pytorch/torch/include/ATen/cpu/vec/vec_convert.h:41:1: note: template argument deduction/substitution failed:
/tmp/torchinductor_admin/6k/c6kr65o43rlmp2cmkpn5ezewhe5bla4w72hpcrg5biyelrs4skyw.main.cpp:37:99: error: wrong number of template arguments (4, should be 2)
37 | auto int32_t_tmp_acc0_vec = at::vec::convert<int32_t,1,int64_t,2>(tmp_acc0_vec);
```
## summary
**The C++ kernel generated by Inductor had the wrong data type for the output variable; it should be int32_t instead of int64_t. This incorrect data type led to an incompatible data type conversion, which caused the g++ compilation to fail.**
The original code that caused the problem:
```
def my_model(v2_0, v3_0):
v6_0 = -v3_0
v4_0 = v2_0 * v3_0
v1_0 = v4_0.unsqueeze(-1).unsqueeze(-1).unsqueeze(-1).unsqueeze(-1)
v0_0 = v2_0.to(torch.int32)
    # The line that causes the problem.
v5_0 = v0_0.amax(dim=0)
```
## proof procedure
The C++ kernel generated by Inductor:
```c++
#include <torch/csrc/inductor/cpp_prefix.h>
extern "C" void kernel(const int32_t* in_ptr0,
int32_t* out_ptr0)
{
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(1416L); x0+=static_cast<int64_t>(16L))
{
{
int32_t tmp_acc0_arr[16];
for (int i = 0; i < 16; i++)
{
tmp_acc0_arr[i] = std::numeric_limits<int32_t>::min();
}
int32_t tmp_acc0 = std::numeric_limits<int32_t>::min();
at::vec::Vectorized<int32_t> tmp_acc0_vec = at::vec::Vectorized<int32_t>(std::numeric_limits<int32_t>::min());
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(16L); x1+=static_cast<int64_t>(1L))
{
{
if(C10_LIKELY(x0 >= static_cast<int64_t>(0) && x0 < static_cast<int64_t>(1408L)))
{
auto tmp0 = at::vec::Vectorized<int32_t>::loadu(in_ptr0 + static_cast<int64_t>(x0 + 1416L*x1), static_cast<int64_t>(16));
tmp_acc0_vec = at::vec::maximum(tmp_acc0_vec, tmp0);
}
if(C10_UNLIKELY(x0 >= static_cast<int64_t>(1408L) && x0 < static_cast<int64_t>(1416L)))
{
for (int64_t x0_tail = static_cast<int64_t>(1408L);x0_tail < static_cast<int64_t>(1416L); x0_tail++)
{
auto tmp0 = in_ptr0[static_cast<int64_t>(x0_tail + 1416L*x1)];
tmp_acc0_arr[x0_tail - static_cast<int64_t>(1408L)] = max_propagate_nan(tmp_acc0_arr[x0_tail - static_cast<int64_t>(1408L)], tmp0);
}
}
}
}
if(C10_LIKELY(x0 >= static_cast<int64_t>(0) && x0 < static_cast<int64_t>(1408L)))
{
                // impossible data type conversion, which causes the g++ compilation to fail.
auto int32_t_tmp_acc0_vec = at::vec::convert<int32_t,1,int64_t,2>(tmp_acc0_vec);
int32_t_tmp_acc0_vec.store(out_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(16));
}
if(C10_UNLIKELY(x0 >= static_cast<int64_t>(1408L) && x0 < static_cast<int64_t>(1416L)))
{
for (int64_t x0_tail = static_cast<int64_t>(1408L);x0_tail < static_cast<int64_t>(1416L); x0_tail++)
{
out_ptr0[static_cast<int64_t>(x0_tail)] = tmp_acc0_arr[x0_tail - static_cast<int64_t>(1408L)];
}
}
}
}
}
}
```
The compiler complains:
```text
/home/admin/pytorch/pytorch/torch/include/ATen/cpu/vec/vec_convert.h:41:1: note: candidate: ‘template<class dst_t, class src_t> std::enable_if_t<(! is_same_v<dst_t, src_t>), at::vec::CPU_CAPABILITY::Vectorized<T> > at::vec::CPU_CAPABILITY::convert(const at::vec::CPU_CAPABILITY::Vectorized<T>&)’
41 | convert(const Vectorized<src_t>& src) {
| ^~~~~~~
/home/admin/pytorch/pytorch/torch/include/ATen/cpu/vec/vec_convert.h:41:1: note: template argument deduction/substitution failed:
/tmp/torchinductor_admin/6k/c6kr65o43rlmp2cmkpn5ezewhe5bla4w72hpcrg5biyelrs4skyw.main.cpp:37:99: error: wrong number of template arguments (4, should be 2)
37 | auto int32_t_tmp_acc0_vec = at::vec::convert<int32_t,1,int64_t,2>(tmp_acc0_vec);
```
So the following line is problematic:
```c++
// this line expects tmp_acc0_vec to be Vectorized<int64_t> and converts it to Vectorized<int32_t>.
auto int32_t_tmp_acc0_vec = at::vec::convert<int32_t,1,int64_t,2>(tmp_acc0_vec);
```
The issue is that tmp_acc0_vec is of type Vectorized<int32_t>, but the template parameters expect it to be Vectorized<int64_t> and convert it to Vectorized<int32_t>. This is a conflict: the conversion should not exist, since tmp_acc0_vec is already Vectorized<int32_t>. The following line hardcodes the output variable type to int64, which causes unnecessary and incorrect type conversions.
|
||
|
|
efc4b460b3 |
Add cascade sum support for Inductor CPP backend (#156296)
Fixes #154703 Add cascade summation support for Inductor CPP backend to improve precision for large size summation. Currently, Inductor CPP directly do reduction for sum. As shown in #154703, when the size of the sum is large and the number of parallel is small, direct reduction will cause an intolerable precision loss: ``` extern "C" void kernel(float* in_out_ptr0, const float* in_ptr0) { auto out_ptr0 = in_out_ptr0; { { float tmp_acc0 = 0; at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(0); for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(3000000000L); x0+=static_cast<int64_t>(16L)) { { if(C10_LIKELY(x0 >= static_cast<int64_t>(0) && x0 < static_cast<int64_t>(3000000000L))) { auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(16)); tmp_acc0_vec = tmp_acc0_vec + tmp0; } } } tmp_acc0 = tmp_acc0 + at::vec::vec_reduce_all<float, 1>([](at::vec::Vectorized<float>& x, at::vec::Vectorized<float>& y) { return x + y; }, tmp_acc0_vec); out_ptr0[static_cast<int64_t>(0L)] = static_cast<float>(tmp_acc0); } } { { { auto tmp0 = out_ptr0[static_cast<int64_t>(0L)]; auto tmp1 = static_cast<float>(3000000000.0); auto tmp2 = tmp0 / tmp1; in_out_ptr0[static_cast<int64_t>(0L)] = tmp2; } } } } ``` After adding cascade sum support: ``` extern "C" void kernel(float* in_out_ptr0, const float* in_ptr0) { auto out_ptr0 = in_out_ptr0; { { float tmp_acc0 = 0; at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(0); at::vec::Vectorized<float> masked_tmp_acc0_vec = at::vec::Vectorized<float>(0); CascadeSumHelper<float, 65536> scalar_cascade_helper0(static_cast<int64_t>(3000000000L)); CascadeSumHelper<at::vec::Vectorized<float>, 65536> cascade_helper0(static_cast<int64_t>(187500000L)); CascadeSumHelper<at::vec::Vectorized<float>, 65536> masked_cascade_helper0(static_cast<int64_t>(0L)); for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(3000000000L); x0+=static_cast<int64_t>(16L)) { { if(C10_LIKELY(x0 >= static_cast<int64_t>(0) && x0 < static_cast<int64_t>(3000000000L))) { auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(16)); tmp_acc0_vec = cascade_sum_combine(tmp0, &cascade_helper0); } } } tmp_acc0 = cascade_sum_final(&scalar_cascade_helper0); tmp_acc0_vec = cascade_sum_final(&cascade_helper0); masked_tmp_acc0_vec = cascade_sum_final(&masked_cascade_helper0); tmp_acc0 = tmp_acc0 + at::vec::vec_reduce_all<float, 1>([](at::vec::Vectorized<float>& x, at::vec::Vectorized<float>& y) { return x + y; }, tmp_acc0_vec + masked_tmp_acc0_vec); out_ptr0[static_cast<int64_t>(0L)] = static_cast<float>(tmp_acc0); } } { { { auto tmp0 = out_ptr0[static_cast<int64_t>(0L)]; auto tmp1 = static_cast<float>(3000000000.0); auto tmp2 = tmp0 / tmp1; in_out_ptr0[static_cast<int64_t>(0L)] = tmp2; } } } } ``` This will inevitably reduce performance when cascade sum is turned on. For the case shown in #154703: performance reduced by ~3%. Pull Request resolved: https://github.com/pytorch/pytorch/pull/156296 Approved by: https://github.com/leslie-fang-intel, https://github.com/jansel |
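As a numerical aside (a minimal Python sketch, not the Inductor codegen itself), the precision benefit of cascading partial sums over one long accumulation chain can be seen like this:
```python
import torch

# Adding many small values one by one into a large float32 accumulator loses
# them to rounding; summing them first and adding the partial sum does not.
big = torch.tensor(2.0**24, dtype=torch.float32)   # 16777216: ulp is 2 here
small = torch.ones(4096, dtype=torch.float32)

naive = big.clone()
for v in small:            # direct element-by-element accumulation
    naive += v             # each +1.0 rounds back to the accumulator value

cascade = big + small.sum()   # combine a partial sum instead

print(naive.item(), cascade.item())   # 16777216.0 vs 16781312.0
```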
||
|
|
6f23f53599 |
[inductor] fix tensor.to(uint8) error when tensor src type is float (#157267)
The CPU Inductor backend processes .to(torch.uint8) incorrectly, leading to numerical inconsistencies. The convert_float_to_int8 function may return incorrect results for negative inputs, such as -2.xx, when the data type is uint8_t, producing 0 instead of 255. This issue stems from the clamping logic; we should avoid converting min_val to uint8_t too early. Fixes https://github.com/pytorch/pytorch/issues/156788 @leslie-fang-intel Pull Request resolved: https://github.com/pytorch/pytorch/pull/157267 Approved by: https://github.com/leslie-fang-intel |
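A quick way to sanity-check the fix (a hedged sketch; the input values are made up) is to compare the eager and compiled casts for negative floats:
```python
import torch

# The compiled cast should now produce the same uint8 values as eager for
# negative inputs instead of clamping them to 0.
def cast(t):
    return t.to(torch.uint8)

x = torch.tensor([-2.5, -1.2, 0.7, 300.0])
compiled_cast = torch.compile(cast)
print(cast(x))
print(compiled_cast(x))
```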
||
|
|
17687eb792 |
[BE][4/6] fix typos in test/ (test/inductor/) (#157638)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157638 Approved by: https://github.com/yewentao256, https://github.com/jansel |
||
|
|
64f2ec77f8 |
[inductor] Fix fractional_max_pool2d 3D input causing assertion error (#156912)
Fixes #156682 Pull Request resolved: https://github.com/pytorch/pytorch/pull/156912 Approved by: https://github.com/angelayi |
||
|
|
1eea2c4fe3 |
[Inductor][CPP] Fix perf regression of functorch_maml_omniglot (#156526)
**Summary** Fix the performance regression of `functorch_maml_omniglot` in TorchBench. The issue reported in [#151523](https://github.com/pytorch/pytorch/issues/151523) occurs only when a parallel reduction is performed under the vectorized loop and a scalar kernel is used for the tail loop. Previously, we addressed this regression in [#151887](https://github.com/pytorch/pytorch/pull/151887) by disabling all cases where a parallel reduction occurs under the vectorized loop. However, for `functorch_maml_omniglot`, we found that a masked vector kernel is used in the tail loop instead of the scalar kernel in the job of `inductor_torchbench_cpu_smoketest_perf`. In this PR, we refine the fix by excluding the cases where a masked vector kernel is used in the tail loop, rather than disabling all such scenarios. Pull Request resolved: https://github.com/pytorch/pytorch/pull/156526 Approved by: https://github.com/CaoE |
||
|
|
f5e6e52f25 |
[BE][PYFMT] migrate PYFMT for test/inductor/ to ruff format (#148186)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/148186 Approved by: https://github.com/jansel |
||
|
|
159a39ad34 |
Add an option for cpp_wrapper to compile entry and kernel separately (#156050)
Fixes #156037. Compiling entry and kernel separately has a non-negligible impact on the performance. This PR is to add an option for cpp_wrapper to control whether to compile entry and kernel separately, and turn it off by default. Pull Request resolved: https://github.com/pytorch/pytorch/pull/156050 Approved by: https://github.com/leslie-fang-intel, https://github.com/benjaminglass1, https://github.com/jansel |
||
|
|
eb2af14f8e |
[PT2][partitioners] Add aten.split to view_ops list [relanding #155424] (#155943)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/155943 Approved by: https://github.com/ShatianWang |
||
|
|
7ba6fb69e6 |
[Inductor][CPP] Enable vectorized fp8 E5M2 quant dequant (#153365)
**Summary** This PR enables the vectorization codegen with Inductor CPP backend for `FP8_E5M2` `quant` from `float32` and `dequant` to `float32`. **Test Plan** ``` python test/inductor/test_cpu_repro.py -k test_dequant_quant_lowering_fp8_e5m2 ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/153365 Approved by: https://github.com/jansel, https://github.com/jgong5 ghstack dependencies: #152417, #152418, #153364 |
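For a concrete picture, the same quant/relu/dequant pattern as the fp8 E4M3 example earlier in this log applies here with `torch.float8_e5m2` (a sketch; that the decomposed quant ops accept float8_e5m2 directly is assumed from the test name, and the scale/zero-point values are reused from that example):
```python
import torch

# Same pattern as the E4M3 example above, but with the E5M2 dtype.
def fn(x, scale, zero_point, quant_min, quant_max, dtype):
    x = torch.ops.quantized_decomposed.dequantize_per_tensor(
        x, scale, zero_point, quant_min, quant_max, dtype
    )
    x = torch.relu(x)
    return torch.ops.quantized_decomposed.quantize_per_tensor(
        x, scale, zero_point, quant_min, quant_max, dtype
    )

quant_min, quant_max = -128, 127
dtype = torch.float8_e5m2
x = torch.clamp(torch.randn(1, 7, 7, 9) * 100, quant_min, quant_max).to(dtype)
with torch.no_grad():
    print(torch.compile(fn)(x, 0.01, 100, quant_min, quant_max, dtype))
```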
||
|
|
b77a6504fa |
[Inductor][CPP] Enable vectorized fp8 quant dequant (#152418)
**Summary** This PR enables the vectorization codegen with Inductor CPP backend for `FP8_E4M3` `quant` from `float32` and `dequant` to `float32`. **Test Plan** ``` python test/inductor/test_cpu_repro.py -k test_dequant_quant_lowering_fp8_e4m3 ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/152418 Approved by: https://github.com/jansel, https://github.com/jgong5, https://github.com/CaoE ghstack dependencies: #152417 |
||
|
|
3742b7fb3a |
Treat dim=[] same as dim=None (#153570)
Fixes https://github.com/pytorch/pytorch/issues/153568 Pull Request resolved: https://github.com/pytorch/pytorch/pull/153570 Approved by: https://github.com/ngimel |
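For reference, a small check of the behavior in question (a sketch, assuming an empty dim list is meant to reduce over all dimensions like dim=None):
```python
import torch

# With the fix, passing dim=[] to a reduction is treated the same as dim=None,
# i.e. the reduction runs over all dimensions.
x = torch.randn(3, 4)
fn = torch.compile(lambda t: t.sum(dim=[]))
print(fn(x), x.sum())
```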
||
|
|
39c0b01970 |
[ez] Disable failing test in periodic no gpu no avx (#152698)
Failing on periodic after it was added in #152542
Example:
inductor/test_cpu_repro.py::CPUReproTests::test_tanh_atan2_use_decompose_tanh [GH job link](https://github.com/pytorch/pytorch/actions/runs/14775755628/job/41485185829) [HUD commit link](
|
||
|
|
6f6acb4128 |
[AOTI][CPU] Introduce config.cpp.use_decompose_tanh (#152542)
Summary: Previously D70489427 changed the tanh impl to `.tanh()`, and this caused a perf regression in some Meta-internal workloads. This diff introduces a config so we can set it based on need. Differential Revision: D73909371 Pull Request resolved: https://github.com/pytorch/pytorch/pull/152542 Approved by: https://github.com/desertfire |
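A hypothetical usage sketch (the flag spelling is taken from the commit title; defaulting to the `.tanh()` lowering is assumed from the summary):
```python
import torch
import torch._inductor.config as inductor_config

# Opt back into the decomposed tanh codegen instead of the default .tanh() lowering.
inductor_config.cpp.use_decompose_tanh = True  # assumed spelling, per the title

fn = torch.compile(lambda x: torch.tanh(x))
print(fn(torch.randn(8)))
```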
||
|
|
68a7501dab |
[Inductor][CPP] Fix Codegen Issue when Parallel Reduction under the vectorization (#151887)
**Summary** Fixes [#151290](https://github.com/pytorch/pytorch/issues/151290) and [#151523](https://github.com/pytorch/pytorch/issues/151523), which are regressions introduced by [#144020](https://github.com/pytorch/pytorch/pull/144020). That PR enabled parallelization at the inner loop level. However, a currently unsupported case arises when parallel reduction occurs under the vectorization loop level, specifically in patterns like: ``` for vec_loop_level: do_parallel_reduction ``` In such cases, a temporary buffer `tmp_acc_array` is allocated for tail scalar kernels, and another temporary buffer `tmp_acc_array` is also defined for parallel reduction. This results in a conflict due to overlapping temporary buffers. This PR disables the problematic case to avoid the conflict until proper support is implemented. **Test Plan** ``` python test/inductor/test_flex_attention.py -k test_make_block_mask_cpu python test/inductor/test_cpu_repro.py -k test_parallel_reduction_vectorization ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/151887 Approved by: https://github.com/jansel |
||
|
|
9092dd2e82 |
[CI] Disable some tests that are failing in periodic (#150059)
Disabling some tests to restore the periodic nogpu avx512 job, which is timing out: |
||
|
|
c36ac16da1 |
[Inductor] optimize welford reduction (#145061)
Fix https://github.com/pytorch/pytorch/issues/141541. Fix https://github.com/pytorch/pytorch/issues/142839. Fix https://github.com/pytorch/pytorch/issues/143182. **Summary:** In order to fix the issue that the accuracy of welford reduction is not good enough, we refer to the eager implementation, combine Welford algorithm with cascade sum to improve numerical stability. Specifically: 1. Use Welford algorithm to compute mean and variance. 2. Use cascade summation when computing sum over input for both mean and variance. I tested Inductor benchmark with this PR on CPU, no performance gains or regressions were seen. **Example:** Take https://github.com/pytorch/pytorch/issues/141541 as an example: ``` import torch import torch.nn as nn torch.manual_seed(0) class Model(nn.Module): def __init__(self): super().__init__() self.gn = nn.GroupNorm(num_groups=32, num_channels=32) def forward(self, x): return self.gn(x) model = Model().eval() c_model = torch.compile(model) x = torch.randn(1, 32, 128, 128, 128) with torch.no_grad(): output = model(x) c_output = c_model(x) print(torch.max(torch.abs(output - c_output))) print(torch.allclose(output, c_output, 1.3e-6, 1e-5)) ``` **logs** - before ``` tensor(7.0095e-05) False ``` - After ``` tensor(9.5367e-07) True ``` - on CUDA ``` tensor(1.4305e-06, device='cuda:0', grad_fn=<MaxBackward1>) True ``` **Generated code:** - before ``` cpp_fused_native_group_norm_0 = async_compile.cpp_pybinding(['const float*', 'const float*', 'const float*', 'float*', 'float*', 'float*'], ''' #include "/tmp/torchinductor_jiayisun/pi/cpicxudqmdsjh5cm4klbtbrvy2cxwr7whxl3md2zzdjdf3orvfdf.h" extern "C" void kernel(const float* in_ptr0, const float* in_ptr1, const float* in_ptr2, float* out_ptr0, float* out_ptr1, float* out_ptr2) { { #pragma GCC ivdep for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(32L); x0+=static_cast<int64_t>(1L)) { { Welford<float> tmp_acc0 = Welford<float>(); Welford<at::vec::Vectorized<float>> tmp_acc0_vec = Welford<at::vec::Vectorized<float>>(); Welford<at::vec::Vectorized<float>> masked_tmp_acc0_vec = Welford<at::vec::Vectorized<float>>(); static WeightRecp<at::vec::Vectorized<float>> wrecps0(static_cast<int64_t>(131072L)); for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(2097152L); x1+=static_cast<int64_t>(16L)) { { if(C10_LIKELY(x1 >= static_cast<int64_t>(0) && x1 < static_cast<int64_t>(2097152L))) { auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x1 + 2097152L*x0), static_cast<int64_t>(16)); tmp_acc0_vec = welford_combine(tmp_acc0_vec, tmp0, &wrecps0); } } } tmp_acc0 = welford_combine(tmp_acc0, welford_vec_reduce_all(masked_tmp_acc0_vec)); tmp_acc0 = welford_combine(tmp_acc0, welford_vec_reduce_all(tmp_acc0_vec)); out_ptr0[static_cast<int64_t>(x0)] = static_cast<float>(tmp_acc0.mean); out_ptr1[static_cast<int64_t>(x0)] = static_cast<float>(tmp_acc0.m2); } } } { #pragma GCC ivdep for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(32L); x0+=static_cast<int64_t>(1L)) { for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(2097152L); x1+=static_cast<int64_t>(16L)) { { if(C10_LIKELY(x1 >= static_cast<int64_t>(0) && x1 < static_cast<int64_t>(2097152L))) { auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x1 + 2097152L*x0), static_cast<int64_t>(16)); auto tmp1 = out_ptr0[static_cast<int64_t>(x0)]; auto tmp4 = out_ptr1[static_cast<int64_t>(x0)]; auto tmp12 = in_ptr1[static_cast<int64_t>(x0)]; auto tmp15 = in_ptr2[static_cast<int64_t>(x0)]; auto tmp2 
= at::vec::Vectorized<float>(tmp1); auto tmp3 = tmp0 - tmp2; auto tmp5 = static_cast<float>(2097152.0); auto tmp6 = tmp4 / tmp5; auto tmp7 = static_cast<float>(1e-05); auto tmp8 = decltype(tmp6)(tmp6 + tmp7); auto tmp9 = 1 / std::sqrt(tmp8); auto tmp10 = at::vec::Vectorized<float>(tmp9); auto tmp11 = tmp3 * tmp10; auto tmp13 = at::vec::Vectorized<float>(tmp12); auto tmp14 = tmp11 * tmp13; auto tmp16 = at::vec::Vectorized<float>(tmp15); auto tmp17 = tmp14 + tmp16; tmp17.store(out_ptr2 + static_cast<int64_t>(x1 + 2097152L*x0)); } } } } } } ''') ``` - After ``` cpp_fused_native_group_norm_0 = async_compile.cpp_pybinding(['const float*', 'const float*', 'const float*', 'float*', 'float*', 'float*'], ''' #include "/tmp/torchinductor_jiayisun/ln/clnlak27xpvmq3klpqyj6xzyq2thf4ecrezve5ddy4f4xaz4sb7w.h" extern "C" void kernel(const float* in_ptr0, const float* in_ptr1, const float* in_ptr2, float* out_ptr0, float* out_ptr1, float* out_ptr2) { { #pragma GCC ivdep for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(32L); x0+=static_cast<int64_t>(1L)) { { Welford<float> tmp_acc0 = Welford<float>(); Welford<at::vec::Vectorized<float>> tmp_acc0_vec = Welford<at::vec::Vectorized<float>>(); Welford<at::vec::Vectorized<float>> masked_tmp_acc0_vec = Welford<at::vec::Vectorized<float>>(); WelfordHelper<at::vec::Vectorized<float>> welford_helper0(static_cast<int64_t>(131072L)); static WelfordHelper<at::vec::Vectorized<float>> masked_welford_helper0(static_cast<int64_t>(0L)); for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(2097152L); x1+=static_cast<int64_t>(16L)) { { if(C10_LIKELY(x1 >= static_cast<int64_t>(0) && x1 < static_cast<int64_t>(2097152L))) { auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x1 + 2097152L*x0), static_cast<int64_t>(16)); tmp_acc0_vec = welford_combine(tmp_acc0_vec, tmp0, &welford_helper0); } } } tmp_acc0_vec = welford_combine(tmp_acc0_vec, &welford_helper0); masked_tmp_acc0_vec = welford_combine(masked_tmp_acc0_vec, &masked_welford_helper0); tmp_acc0 = welford_combine(tmp_acc0, welford_vec_reduce_all(masked_tmp_acc0_vec)); tmp_acc0 = welford_combine(tmp_acc0, welford_vec_reduce_all(tmp_acc0_vec)); out_ptr0[static_cast<int64_t>(x0)] = static_cast<float>(tmp_acc0.mean); out_ptr1[static_cast<int64_t>(x0)] = static_cast<float>(tmp_acc0.m2); } } } { #pragma GCC ivdep for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(32L); x0+=static_cast<int64_t>(1L)) { for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(2097152L); x1+=static_cast<int64_t>(16L)) { { if(C10_LIKELY(x1 >= static_cast<int64_t>(0) && x1 < static_cast<int64_t>(2097152L))) { auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x1 + 2097152L*x0), static_cast<int64_t>(16)); auto tmp1 = out_ptr0[static_cast<int64_t>(x0)]; auto tmp4 = out_ptr1[static_cast<int64_t>(x0)]; auto tmp12 = in_ptr1[static_cast<int64_t>(x0)]; auto tmp15 = in_ptr2[static_cast<int64_t>(x0)]; auto tmp2 = at::vec::Vectorized<float>(tmp1); auto tmp3 = tmp0 - tmp2; auto tmp5 = static_cast<float>(2097152.0); auto tmp6 = tmp4 / tmp5; auto tmp7 = static_cast<float>(1e-05); auto tmp8 = decltype(tmp6)(tmp6 + tmp7); auto tmp9 = 1 / std::sqrt(tmp8); auto tmp10 = at::vec::Vectorized<float>(tmp9); auto tmp11 = tmp3 * tmp10; auto tmp13 = at::vec::Vectorized<float>(tmp12); auto tmp14 = tmp11 * tmp13; auto tmp16 = at::vec::Vectorized<float>(tmp15); auto tmp17 = tmp14 + tmp16; tmp17.store(out_ptr2 + static_cast<int64_t>(x1 + 2097152L*x0)); } } } } } } ''') ``` Pull 
Request resolved: https://github.com/pytorch/pytorch/pull/145061 Approved by: https://github.com/leslie-fang-intel, https://github.com/jgong5, https://github.com/jansel |
||
|
|
b2862f1435 |
optimize the decomposition of aten.native_group_norm (#144733)
Summary: Optimize the decomposition of aten.native_group_norm. Reduce unnecessary repeated operations by changing the order of operations for `mean`, `rstd`, `weight`, `bias `and `input`, which can improve performance when `flattened_inner_size `is large. The original decomposition: 1. compute `mean `and `rstd`, 2. out = (x - mean) * rstd, compute in the range [N, C, *], 3. out = out * weight + bias, compute in the range [N, C, *], The new decomposition: 1. compute `mean `and `rstd`, 2. new_weight = rstd * weight, new_bias = - mean * rstd * weight + bias, compute in the range [N, C], 3. out = out * new_weight + new_bias, compute in the range [N, C, *], I tested the Inductor performance benchmark with this PR on both CPU and A100. On CPU, two torchbench models(functorch_dp_cifar10 and opacus_cifar10) have about 25% performance improvement, and two diffusion models(Stable Diffusion and Latent Consistency Model(LCM)) have about 2% performance improvement. On A100, no performance gains or regressions were seen. Pull Request resolved: https://github.com/pytorch/pytorch/pull/144733 Approved by: https://github.com/leslie-fang-intel, https://github.com/jansel |
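The regrouping can be checked numerically with a small sketch (shapes chosen arbitrarily; this verifies only the affine rewrite, not the full group-norm decomposition):
```python
import torch

# (x - mean) * rstd * weight + bias  ==  x * new_weight + new_bias
# with new_weight = rstd * weight and new_bias = -mean * rstd * weight + bias.
x = torch.randn(2, 6, 8, 8)
mean = x.mean(dim=(2, 3), keepdim=True)
rstd = (x.var(dim=(2, 3), unbiased=False, keepdim=True) + 1e-5).rsqrt()
weight = torch.randn(1, 6, 1, 1)
bias = torch.randn(1, 6, 1, 1)

old = (x - mean) * rstd * weight + bias
new_weight = rstd * weight
new_bias = -mean * rstd * weight + bias
new = x * new_weight + new_bias
print(torch.allclose(old, new, atol=1e-5))
```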
||
|
|
f349304c08 |
[Inductor][CPP] Fix expr issue in loop split (#148882)
**Summary** Fix issue: https://github.com/pytorch/pytorch/issues/148058. In this case, there is an `indexing_expr` as an integer which doesn't have the method of `find`. **Test Plan** ``` python -u -m pytest -s -v test/inductor/test_cpu_repro.py -k test_issue_148058 ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/148882 Approved by: https://github.com/jgong5 |
||
|
|
d6d670ab4d |
[AOTI] build CPU CPP kernels at O3, and all other code at O1 (#148587)
In the future, we may also want to add LTO linking to further optimize the results (while still hopefully netting compile time benefits). Differential Revision: [D70641543](https://our.internmc.facebook.com/intern/diff/D70641543) Pull Request resolved: https://github.com/pytorch/pytorch/pull/148587 Approved by: https://github.com/desertfire |
||
|
|
165e33531c |
[Inductor][CPP] Fix the vec codegen for tanh (#148254)
**Summary** Fix https://github.com/pytorch/pytorch/issues/148241, The previous vectorized code generation for `tanh` used a decomposed implementation, leading to numerical differences that were further amplified by `atan2`. For example, in the given test case after `tanh`, the eager output at `[0,0,11,47]` was `-5.820766091346741e-10`, while the compiled output was `1.4319084584712982e-08`, resulting in different `atan2` outputs of `-2.3561` and `0.7853`. This issue is fixed by switching to the Sleef implementation. **Test Plan** ``` python -u -m pytest -s -v test/inductor/test_cpu_repro.py -k test_tanh_atan2 ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/148254 Approved by: https://github.com/malfet, https://github.com/jgong5 |
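To see why such a tiny difference blows up, note that atan2 is discontinuous around zero; a quick illustration with the quoted values (a sketch, assuming atan2 received the tanh output in both arguments):
```python
import math

# atan2(negative, negative) lands near -3*pi/4 and atan2(positive, positive)
# near pi/4, so flipping the sign of a value that is essentially zero flips
# the result between roughly -2.3561 and 0.7853.
print(math.atan2(-5.820766091346741e-10, -5.820766091346741e-10))
print(math.atan2(1.4319084584712982e-08, 1.4319084584712982e-08))
```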
||
|
|
d23051f29b |
[Inductor] Support parallel reduction for GroupNorm (#144020)
Summary:
Support parallel reduction for GroupNorm by optimizing the parallelization heuristics: When the range of the first inner loop is much larger than the range of all outer loops, change the starting depth of parallelization to the first inner loop.
I tested the Inductor benchmark with this PR on CPU. One torchbench model (pytorch_CycleGAN_and_pix2pix) achieved ~45% performance improvement, and two diffusion models (Stable Diffusion and Latent Consistency Model (LCM)) achieved ~2% performance improvement.
Example:
```
import torch
import torch.nn as nn
class GN(nn.Module):
def __init__(self, num_groups, num_channels):
super(GN, self).__init__()
self.gn = nn.GroupNorm(num_groups, num_channels)
def forward(self, x):
return self.gn(x)
x = torch.randn(2, 64, 168, 168).to(memory_format=torch.channels_last)
m = GN(2, 64).eval()
compiled_m = torch.compile(m)
with torch.no_grad():
out = compiled_m(x)
```
Generated code:
- Before:
```
cpp_fused_native_group_norm_0 = async_compile.cpp_pybinding(['const float*', 'const float*', 'const float*', 'float*', 'float*', 'float*', 'float*', 'float*'], '''
#include "/tmp/torchinductor_jiayisun/pi/cpicxudqmdsjh5cm4klbtbrvy2cxwr7whxl3md2zzdjdf3orvfdf.h"
extern "C" void kernel(const float* in_ptr0,
const float* in_ptr1,
const float* in_ptr2,
float* out_ptr0,
float* out_ptr1,
float* out_ptr2,
float* out_ptr3,
float* out_ptr4)
{
#pragma omp parallel num_threads(56)
{
int tid = omp_get_thread_num();
{
#pragma omp for collapse(2)
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(2L); x0+=static_cast<int64_t>(1L))
{
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(2L); x1+=static_cast<int64_t>(1L))
{
{
Welford<float> tmp_acc0 = Welford<float>();
Welford<at::vec::Vectorized<float>> tmp_acc0_vec = Welford<at::vec::Vectorized<float>>();
Welford<at::vec::Vectorized<float>> masked_tmp_acc0_vec = Welford<at::vec::Vectorized<float>>();
static WeightRecp<at::vec::Vectorized<float>> wrecps0(static_cast<int64_t>(56448L));
for(int64_t x2=static_cast<int64_t>(0L); x2<static_cast<int64_t>(28224L); x2+=static_cast<int64_t>(1L))
{
for(int64_t x3=static_cast<int64_t>(0L); x3<static_cast<int64_t>(32L); x3+=static_cast<int64_t>(16L))
{
{
if(C10_LIKELY(x3 >= static_cast<int64_t>(0) && x3 < static_cast<int64_t>(32L)))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x3 + 32L*x1 + 64L*x2 + 1806336L*x0), static_cast<int64_t>(16));
tmp_acc0_vec = welford_combine(tmp_acc0_vec, tmp0, &wrecps0);
}
}
}
}
tmp_acc0 = welford_combine(tmp_acc0, welford_vec_reduce_all(masked_tmp_acc0_vec));
tmp_acc0 = welford_combine(tmp_acc0, welford_vec_reduce_all(tmp_acc0_vec));
out_ptr0[static_cast<int64_t>(x1 + 2L*x0)] = static_cast<float>(tmp_acc0.mean);
out_ptr1[static_cast<int64_t>(x1 + 2L*x0)] = static_cast<float>(tmp_acc0.m2);
}
}
}
}
#pragma omp single
{
{
#pragma GCC ivdep
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(2L); x0+=static_cast<int64_t>(1L))
{
#pragma GCC ivdep
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(2L); x1+=static_cast<int64_t>(1L))
{
for(int64_t x2=static_cast<int64_t>(0L); x2<static_cast<int64_t>(32L); x2+=static_cast<int64_t>(16L))
{
{
if(C10_LIKELY(x2 >= static_cast<int64_t>(0) && x2 < static_cast<int64_t>(32L)))
{
auto tmp0 = out_ptr1[static_cast<int64_t>(x1 + 2L*x0)];
auto tmp6 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<int64_t>(x2 + 32L*x1), static_cast<int64_t>(16));
auto tmp9 = out_ptr0[static_cast<int64_t>(x1 + 2L*x0)];
auto tmp13 = at::vec::Vectorized<float>::loadu(in_ptr2 + static_cast<int64_t>(x2 + 32L*x1), static_cast<int64_t>(16));
auto tmp1 = static_cast<float>(903168.0);
auto tmp2 = tmp0 / tmp1;
auto tmp3 = static_cast<float>(1e-05);
auto tmp4 = decltype(tmp2)(tmp2 + tmp3);
auto tmp5 = 1 / std::sqrt(tmp4);
auto tmp7 = at::vec::Vectorized<float>(tmp5);
auto tmp8 = tmp7 * tmp6;
auto tmp10 = decltype(tmp9)(-tmp9);
auto tmp11 = at::vec::Vectorized<float>(tmp10);
auto tmp12 = tmp11 * tmp8;
auto tmp14 = tmp12 + tmp13;
tmp8.store(out_ptr2 + static_cast<int64_t>(x2 + 32L*x1 + 64L*x0));
tmp14.store(out_ptr3 + static_cast<int64_t>(x2 + 32L*x1 + 64L*x0));
}
}
}
}
}
}
}
{
#pragma omp for collapse(2)
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(2L); x0+=static_cast<int64_t>(1L))
{
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(28224L); x1+=static_cast<int64_t>(1L))
{
for(int64_t x2=static_cast<int64_t>(0L); x2<static_cast<int64_t>(64L); x2+=static_cast<int64_t>(16L))
{
{
if(C10_LIKELY(x2 >= static_cast<int64_t>(0) && x2 < static_cast<int64_t>(64L)))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x2 + 64L*x1 + 1806336L*x0), static_cast<int64_t>(16));
auto tmp1 = at::vec::Vectorized<float>::loadu(out_ptr2 + static_cast<int64_t>(x2 + 64L*x0), static_cast<int64_t>(16));
auto tmp3 = at::vec::Vectorized<float>::loadu(out_ptr3 + static_cast<int64_t>(x2 + 64L*x0), static_cast<int64_t>(16));
auto tmp2 = tmp0 * tmp1;
auto tmp4 = tmp2 + tmp3;
tmp4.store(out_ptr4 + static_cast<int64_t>(x2 + 64L*x1 + 1806336L*x0));
}
}
}
}
}
}
}
}
''')
```
- After:
```
cpp_fused_native_group_norm_0 = async_compile.cpp_pybinding(['const float*', 'const float*', 'const float*', 'float*', 'float*', 'float*', 'float*', 'float*'], '''
#include "/tmp/torchinductor_jiayisun/pi/cpicxudqmdsjh5cm4klbtbrvy2cxwr7whxl3md2zzdjdf3orvfdf.h"
extern "C" void kernel(const float* in_ptr0,
const float* in_ptr1,
const float* in_ptr2,
float* out_ptr0,
float* out_ptr1,
float* out_ptr2,
float* out_ptr3,
float* out_ptr4)
{
{
#pragma GCC ivdep
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(2L); x0+=static_cast<int64_t>(1L))
{
#pragma GCC ivdep
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(2L); x1+=static_cast<int64_t>(1L))
{
{
Welford<float> tmp_acc0 = Welford<float>();
Welford<at::vec::Vectorized<float>> tmp_acc0_vec = Welford<at::vec::Vectorized<float>>();
Welford<at::vec::Vectorized<float>> masked_tmp_acc0_vec = Welford<at::vec::Vectorized<float>>();
Welford<at::vec::Vectorized<float>> tmp_acc0_vec_arr[56];
for (int i = 0; i < 56; i++)
{
tmp_acc0_vec_arr[i] = Welford<at::vec::Vectorized<float>>();
}
Welford<float> tmp_acc0_arr[56];
for (int i = 0; i < 56; i++)
{
tmp_acc0_arr[i] = Welford<float>();
}
Welford<at::vec::Vectorized<float>> masked_tmp_acc0_vec_arr[56];
for (int i = 0; i < 56; i++)
{
masked_tmp_acc0_vec_arr[i] = Welford<at::vec::Vectorized<float>>();
}
#pragma omp parallel num_threads(56)
{
int tid = omp_get_thread_num();
static WeightRecp<at::vec::Vectorized<float>> wrecps0(static_cast<int64_t>(1008L));
Welford<at::vec::Vectorized<float>> tmp_acc0_vec_local = Welford<at::vec::Vectorized<float>>();
Welford<float> tmp_acc0_local = Welford<float>();
Welford<at::vec::Vectorized<float>> masked_tmp_acc0_vec_local = Welford<at::vec::Vectorized<float>>();
#pragma omp for
for(int64_t x2=static_cast<int64_t>(0L); x2<static_cast<int64_t>(28224L); x2+=static_cast<int64_t>(1L))
{
for(int64_t x3=static_cast<int64_t>(0L); x3<static_cast<int64_t>(32L); x3+=static_cast<int64_t>(16L))
{
{
if(C10_LIKELY(x3 >= static_cast<int64_t>(0) && x3 < static_cast<int64_t>(32L)))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x3 + 32L*x1 + 64L*x2 + 1806336L*x0), static_cast<int64_t>(16));
tmp_acc0_vec_local = welford_combine(tmp_acc0_vec_local, tmp0, &wrecps0);
}
}
}
}
tmp_acc0_vec_arr[tid] = tmp_acc0_vec_local;
tmp_acc0_arr[tid] = tmp_acc0_local;
masked_tmp_acc0_vec_arr[tid] = masked_tmp_acc0_vec_local;
}
for (int tid = 0; tid < 56; tid++)
{
tmp_acc0_vec = welford_combine(tmp_acc0_vec, tmp_acc0_vec_arr[tid]);
}
for (int tid = 0; tid < 56; tid++)
{
tmp_acc0 = welford_combine(tmp_acc0, tmp_acc0_arr[tid]);
}
for (int tid = 0; tid < 56; tid++)
{
masked_tmp_acc0_vec = welford_combine(masked_tmp_acc0_vec, masked_tmp_acc0_vec_arr[tid]);
}
tmp_acc0 = welford_combine(tmp_acc0, welford_vec_reduce_all(masked_tmp_acc0_vec));
tmp_acc0 = welford_combine(tmp_acc0, welford_vec_reduce_all(tmp_acc0_vec));
out_ptr0[static_cast<int64_t>(x1 + 2L*x0)] = static_cast<float>(tmp_acc0.mean);
out_ptr1[static_cast<int64_t>(x1 + 2L*x0)] = static_cast<float>(tmp_acc0.m2);
}
}
}
}
{
#pragma GCC ivdep
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(2L); x0+=static_cast<int64_t>(1L))
{
#pragma GCC ivdep
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(2L); x1+=static_cast<int64_t>(1L))
{
for(int64_t x2=static_cast<int64_t>(0L); x2<static_cast<int64_t>(32L); x2+=static_cast<int64_t>(16L))
{
{
if(C10_LIKELY(x2 >= static_cast<int64_t>(0) && x2 < static_cast<int64_t>(32L)))
{
auto tmp0 = out_ptr1[static_cast<int64_t>(x1 + 2L*x0)];
auto tmp6 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<int64_t>(x2 + 32L*x1), static_cast<int64_t>(16));
auto tmp9 = out_ptr0[static_cast<int64_t>(x1 + 2L*x0)];
auto tmp13 = at::vec::Vectorized<float>::loadu(in_ptr2 + static_cast<int64_t>(x2 + 32L*x1), static_cast<int64_t>(16));
auto tmp1 = static_cast<float>(903168.0);
auto tmp2 = tmp0 / tmp1;
auto tmp3 = static_cast<float>(1e-05);
auto tmp4 = decltype(tmp2)(tmp2 + tmp3);
auto tmp5 = 1 / std::sqrt(tmp4);
auto tmp7 = at::vec::Vectorized<float>(tmp5);
auto tmp8 = tmp7 * tmp6;
auto tmp10 = decltype(tmp9)(-tmp9);
auto tmp11 = at::vec::Vectorized<float>(tmp10);
auto tmp12 = tmp11 * tmp8;
auto tmp14 = tmp12 + tmp13;
tmp8.store(out_ptr2 + static_cast<int64_t>(x2 + 32L*x1 + 64L*x0));
tmp14.store(out_ptr3 + static_cast<int64_t>(x2 + 32L*x1 + 64L*x0));
}
}
}
}
}
}
#pragma omp parallel num_threads(56)
{
int tid = omp_get_thread_num();
{
#pragma omp for collapse(2)
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(2L); x0+=static_cast<int64_t>(1L))
{
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(28224L); x1+=static_cast<int64_t>(1L))
{
for(int64_t x2=static_cast<int64_t>(0L); x2<static_cast<int64_t>(64L); x2+=static_cast<int64_t>(16L))
{
{
if(C10_LIKELY(x2 >= static_cast<int64_t>(0) && x2 < static_cast<int64_t>(64L)))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x2 + 64L*x1 + 1806336L*x0), static_cast<int64_t>(16));
auto tmp1 = at::vec::Vectorized<float>::loadu(out_ptr2 + static_cast<int64_t>(x2 + 64L*x0), static_cast<int64_t>(16));
auto tmp3 = at::vec::Vectorized<float>::loadu(out_ptr3 + static_cast<int64_t>(x2 + 64L*x0), static_cast<int64_t>(16));
auto tmp2 = tmp0 * tmp1;
auto tmp4 = tmp2 + tmp3;
tmp4.store(out_ptr4 + static_cast<int64_t>(x2 + 64L*x1 + 1806336L*x0));
}
}
}
}
}
}
}
}
''')
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/144020
Approved by: https://github.com/leslie-fang-intel, https://github.com/jansel, https://github.com/jgong5
|
||
|
|
fe3b9e3764 |
[Inductor] optimize the heuristics of outer loop fusion (#147523)
Summary: Optimize the heuristics of outer loop fusion: When the range of the first inner loop is much larger than the range of all outer loops, do not fuse the outer loops and fallback to standard codegen. Pull Request resolved: https://github.com/pytorch/pytorch/pull/147523 Approved by: https://github.com/leslie-fang-intel, https://github.com/jansel, https://github.com/jgong5 |
||
|
|
644d84d594 |
Revert "optimize the decomposition of aten.native_group_norm (#144733)"
This reverts commit
|
||
|
|
be830c8b1c |
[Inductor][CPP] fix store mode atomic add (#147961)
**Summary** Fix issue: https://github.com/pytorch/pytorch/issues/147848 and https://github.com/pytorch/pytorch/issues/146390. While addressing these issues, 2 problems were encountered: - In `CppVecKernel`, when the number of threads is 1 and the mode is `atomic_add`, `store` did not `load/add` before storing. This has been fixed in this PR. - In `CppTile2DKernel`, `store` did not support `atomic_add` mode. Support for this has been added in this PR. **Test Plan** ``` python -u -m pytest -s -v test/inductor/test_cpu_repro.py -k test_nn_fold ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/147961 Approved by: https://github.com/malfet |
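The overlapping-write pattern that needs atomic-add stores can be exercised with nn.Fold, loosely following the test named in the test plan (the shapes here are made up):
```python
import torch

# nn.Fold sums overlapping patches back into a tensor, so the generated kernel
# must accumulate into the output (atomic_add store mode) rather than overwrite.
fold = torch.nn.Fold(output_size=(4, 4), kernel_size=(2, 2))
x = torch.randn(1, 3 * 2 * 2, 9)  # (N, C * prod(kernel_size), number_of_blocks)

compiled_fold = torch.compile(fold)
print(torch.allclose(compiled_fold(x), fold(x)))
```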
||
|
|
b533bb4b13 |
optimize the decomposition of aten.native_group_norm (#144733)
Summary: Optimize the decomposition of aten.native_group_norm. Reduce unnecessary repeated operations by changing the order of operations for `mean`, `rstd`, `weight`, `bias `and `input`, which can improve performance when `flattened_inner_size `is large. The original decomposition: 1. compute `mean `and `rstd`, 2. out = (x - mean) * rstd, compute in the range [N, C, *], 3. out = out * weight + bias, compute in the range [N, C, *], The new decomposition: 1. compute `mean `and `rstd`, 2. new_weight = rstd * weight, new_bias = - mean * rstd * weight + bias, compute in the range [N, C], 3. out = out * new_weight + new_bias, compute in the range [N, C, *], I tested the Inductor performance benchmark with this PR on both CPU and A100. On CPU, two torchbench models(functorch_dp_cifar10 and opacus_cifar10) have about 25% performance improvement, and two diffusion models(Stable Diffusion and Latent Consistency Model(LCM)) have about 2% performance improvement. On A100, no performance gains or regressions were seen. Pull Request resolved: https://github.com/pytorch/pytorch/pull/144733 Approved by: https://github.com/leslie-fang-intel, https://github.com/jansel |
||
|
|
424c1b82e0 |
[Inductor][CPP] Add the legalize low fp support for index expr (#147298)
**Summary**
Fix issue: https://github.com/pytorch/pytorch/issues/147279. The test case produced a low-precision floating-point value using `ops.index_expr`, but the CPP backend did not handle its legalization. This PR adds support for it.

**Test Plan**
```
python -u -m pytest -s -v test/inductor/test_cpu_repro.py -k test_low_fp_index_expr_issue_147279
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/147298 Approved by: https://github.com/jgong5 |
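A hypothetical repro in the spirit of the linked issue (not the actual test case): an index-derived value requested in a low-precision float dtype, so the CPP backend has to legalize the index expression.
```
import torch

def fn(x):
    # Index values cast to bfloat16; illustrative only.
    idx = torch.arange(x.shape[0], device=x.device).to(torch.bfloat16)
    return x + idx

x = torch.randn(17, dtype=torch.bfloat16)
torch.testing.assert_close(torch.compile(fn)(x), fn(x))
```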
||
|
|
c24038025d |
[ROCm] Unskip std:bad_alloc failures (#146407)
Flaky MI300 issue related to memory usage should now be resolved after https://github.com/pytorch/pytorch/actions/runs/13007160888?pr=145829. Pull Request resolved: https://github.com/pytorch/pytorch/pull/146407 Approved by: https://github.com/jeffdaily |
||
|
|
e2a029054d |
cpp_wrapper: enable all CPU repro tests (#145655)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/145655 Approved by: https://github.com/desertfire ghstack dependencies: #145095, #145654 |
||
|
|
9873319a42 |
cpp_wrapper: fix set_.source_Tensor lowering (#145654)
Adds a C-shim fallback for `set_.source_Tensor`, which is effectively required by `ir.SetSourceTensorKernel`. As a necessary prerequisite to using that IR node, this also updates `CppWrapperCpu` to handle in-place returns in C-shim ops (the arguments for those returns are silently dropped by `torchgen`). Pull Request resolved: https://github.com/pytorch/pytorch/pull/145654 Approved by: https://github.com/desertfire ghstack dependencies: #145095 |
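For context, a hedged illustration of the `set_.source_Tensor` semantics the shim has to cover (the op mutates its first argument in place and also returns it, which is the in-place return mentioned above); this is plain eager code, not the PR's test case:
```
import torch

dst = torch.empty(0)
src = torch.arange(6.0).view(2, 3)
ret = dst.set_(src)                        # dst now aliases src's storage
assert ret is dst and dst.shape == (2, 3)  # the "in-place return" is dst itself
assert dst.data_ptr() == src.data_ptr()
```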
||
|
|
9728e900dc |
[Inductor][CPP] fix torch logit decomposition (#145576)
**Summary**
Fix issue https://github.com/pytorch/pytorch/issues/145379: the current decomposition uses `self = torch.clamp(self, lo, hi)`, which gives a wrong result when `lo` is larger than `hi` compared to the eager implementation:
|
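A hedged aside on why the clamp-based form goes wrong (this only illustrates the documented `torch.clamp` behavior when `min > max`; it is not the elided comparison from the issue):
```
import torch

# Per the torch.clamp docs, when min > max every element is set to max, so a
# clamp-based decomposition collapses the input to a constant instead of
# matching how the eager kernel applies the two bounds.
x = torch.tensor([0.1, 0.5, 0.9])
print(torch.clamp(x, min=0.8, max=0.2))  # tensor([0.2000, 0.2000, 0.2000])
```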
||
|
|
40e27fbcf2 |
Refactor CPUReproTests to be more vector-length agnostic (#141245)
This changes the hardcoded assumption of a `256-bit` vector length to querying it from `cpu_vec_isa`, and changes the relevant tests to share that logic. It also refactors the `config.cpp.simdlen != 1` check into the assertion so we stop duplicating it throughout the test cases. Fixes issues on `128-bit` machines. Pull Request resolved: https://github.com/pytorch/pytorch/pull/141245 Approved by: https://github.com/desertfire, https://github.com/malfet |
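A minimal sketch of the idea, assuming the private helpers `cpu_vec_isa.pick_vec_isa()` and `VecISA.bit_width()` that the refactor queries (internal API, shown for illustration only):
```
from torch._inductor import cpu_vec_isa

# Derive the fp32 lane count from the detected ISA instead of hardcoding 256-bit.
isa = cpu_vec_isa.pick_vec_isa()
if isa:
    lanes_fp32 = isa.bit_width() // 32  # e.g. 8 on AVX2, 16 on AVX-512, 4 on NEON
    print(f"{isa}: {lanes_fp32} fp32 lanes per vector")
else:
    print("no supported vector ISA detected")
```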
||
|
|
d4871750d9 |
[ROCm] Enable post-merge trunk workflow on MI300 runners; skip and fix MI300 related failed tests (#143673)
This PR
* makes changes to the workflow files and scripts so we can run CI workflows on the MI300 runners
* skips and fixes several tests that failed on MI300, observed in https://github.com/pytorch/pytorch/pull/140989

Skipped due to unsupported Float8_e4m3fn data type on MI300 (need to update test code to use datatypes supported by MI300):
- distributed.tensor.parallel.test_micro_pipeline_tp.py::MicroPipelineTPTest::test_fuse_all_gather_scaled_matmul_A_dims_\*_gather_dim_\* (24 tests across inductor/distributed configs)
- distributed.tensor.parallel.test_micro_pipeline_tp.py::test_fuse_scaled_matmul_reduce_scatter_A_dims_\*_scatter_dim_\* (12 tests across inductor/distributed configs)
- inductor.test_loop_ordering::LoopOrderingTest::test_fp8_cast_and_t
- inductor.test_loop_ordering::LoopOrderingTest::test_fp8_pattern_2

Skipped due to AssertionError on MI300:
- inductor.test_mkldnn_pattern_matcher.py::test_qconv2d_int8_mixed_bf16
- distributed._tools.test_sac_ilp::TestSACILP::test_sac_ilp_case1

Skipped:
- test_cuda.py::TestCudaMallocAsync::test_clock_speed
- test_cuda.py::TestCudaMallocAsync::test_power_draw
- test_torch.py::TestTorchDeviceTypeCUDA::test_deterministic_cumsum_cuda

Skipped flaky tests on MI300:
- distributed.test_c10d_gloo.py::ProcessGroupGlooTest::test_gather_stress_cuda
- inductor.test_cpu_repro::CPUReproTests::test_lstm_packed_unbatched_False* (256 tests)

Fixed:
- test_matmul_cuda.py::TestFP8MatmulCudaCUDA::test_float8_basics_cuda

Features:
- inductor/test_fp8.py - declare a new function to convert FP8 datatypes to ROCm-supported FP8 datatypes. It keeps the test names the same for CUDA and ROCm and allows enabling Inductor FP8 tests on CPU

Pull Request resolved: https://github.com/pytorch/pytorch/pull/143673 Approved by: https://github.com/jeffdaily, https://github.com/malfet, https://github.com/pruthvistony Co-authored-by: saienduri <saimanas.enduri@amd.com> Co-authored-by: Jithun Nair <jithun.nair@amd.com> Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com> |
||
|
|
a5051a9521 |
Update torch.masked.mean to upcast dtype for bool tensors (#139999)
When calling `torch.masked.mean(...)` with a boolean tensor, the dtype is inferred to be bool. The mean is computed with the sum operator, and when sum is used with dtype=torch.bool the result is clamped to True (1), leading to an incorrect mean. The example below shows how the incorrect result occurs:
```
a = torch.tensor([True, True])
count = torch.sum(torch.ones(a.shape, dtype=torch.int64)) # 2
total = torch.sum(a, dtype=torch.bool) # True (1)
mean = total / count # 0.5
```
This PR upcasts the dtype used for the summation to int32 in the case of bool tensors, allowing the correct result to be computed. Pull Request resolved: https://github.com/pytorch/pytorch/pull/139999 Approved by: https://github.com/cpuhrsch |
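A short sketch of the fix's effect on the same example (this illustrates the upcast idea, not the patched kernel):
```
import torch

a = torch.tensor([True, True])
count = torch.sum(torch.ones(a.shape, dtype=torch.int64))  # 2
total = torch.sum(a, dtype=torch.int32)                    # 2 instead of saturating at True (1)
mean = total / count                                       # 1.0, as expected
print(mean)
```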
||
|
|
73a6a40346 |
[Inductor][CPP] Fix outer loop fusion buffer removed (#144243)
**Summary**
Fix issue: https://github.com/pytorch/pytorch/issues/144186. For the test case reported in the issue, we saw nodes with the following `LoopNest`s:
- `LoopNest(loops=[LoopLevel(var=x0, size=8, offset=0, tiled_size=0, steps=1, parallel=0, simd_omp=False, simd_vec=False, collapsed=False, is_reduction=False), LoopLevel(var=x1, size=8, offset=0, tiled_size=0, steps=1, parallel=0, simd_omp=False, simd_vec=False, collapsed=False, is_reduction=True)], kernel=<torch._inductor.codegen.cpp.CppKernelProxy object at 0x7fc724426680>)`
- `LoopNest(loops=[LoopLevel(var=x0, size=8, offset=0, tiled_size=0, steps=16, parallel=0, simd_omp=False, simd_vec=True, collapsed=False, is_reduction=False), LoopLevel(var=x1, size=8, offset=0, tiled_size=0, steps=16, parallel=0, simd_omp=False, simd_vec=True, collapsed=False, is_reduction=True)], kernel=<torch._inductor.codegen.cpp.CppKernelProxy object at 0x7fc75c2cae60>)`

Although these two `LoopNest`s have the same `range` and `var`, their `steps` differ (1 vs. 16), so they fail to be merged with the outer loops. Since the global buffers were removed when the buffer was localized, we need to restore the state of `V.graph.removed_buffers` before falling back to codegen without outer loop fusion.

**Test Plan**
```
python -u -m pytest -s -v test/inductor/test_cpu_repro.py -k test_outer_loop_fusion_buffer_remove
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/144243 Approved by: https://github.com/jgong5 |
||
|
|
636a2c7e0f |
[Inductor][lowering] support out_dtype for dequant lowering (#143845)
In lowering, support the parameter `out_dtype` for `dequant_per_tensor` and `dequant_per_channel`. Fix the following runtime error issue found in https://github.com/pytorch/ao/pull/1372:
```
File "/home/liaoxuan/pytorch_ao/torch/_inductor/lowering.py", line 452, in wrapped
  out = decomp_fn(*args, **kwargs)
torch._dynamo.exc.BackendCompilerFailed: backend='compile_fx_wrapper' raised:
LoweringException: TypeError: quantized_decomposed_dequantize_per_tensor_default() got an unexpected keyword argument 'out_dtype'
  target: quantized_decomposed.dequantize_per_tensor.default
  args[0]: TensorBox(StorageBox(
    InputBuffer(name='arg0_1', layout=FixedLayout('cpu', torch.uint8, size=[1, 7, 7, 9], stride=[441, 63, 9, 1]))
  ))
  args[1]: 0.01
  args[2]: 100
  args[3]: 0
  args[4]: 255
  args[5]: torch.uint8
  kwargs: {'out_dtype': torch.bfloat16}
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/143845 Approved by: https://github.com/jgong5, https://github.com/leslie-fang-intel, https://github.com/jansel |
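A hedged repro sketch assembled from the arguments in that traceback (the registration import and exact call pattern are assumptions, not the original torchao code):
```
import torch
import torch.ao.quantization.fx._decomposed  # noqa: F401  # registers quantized_decomposed ops (assumption)

def fn(x):
    # Dequantize with an explicit out_dtype, which the lowering previously rejected.
    return torch.ops.quantized_decomposed.dequantize_per_tensor(
        x, 0.01, 100, 0, 255, torch.uint8, out_dtype=torch.bfloat16
    )

x = torch.randint(0, 255, (1, 7, 7, 9), dtype=torch.uint8)
out = torch.compile(fn)(x)
assert out.dtype == torch.bfloat16
```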
||
|
|
c09bf71bd6 |
[Inductor][CPU] Fix C++ compile error of torch.max on bool type (#143848)
Fix https://github.com/pytorch/pytorch/issues/143568. (Before/after screenshots omitted from this mirror.) Pull Request resolved: https://github.com/pytorch/pytorch/pull/143848 Approved by: https://github.com/jgong5, https://github.com/leslie-fang-intel |
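A hedged repro sketch for this class of failure (the shape and pattern are illustrative, not taken from the issue):
```
import torch

def fn(x):
    # Reduction max over a bool tensor previously generated C++ that failed to compile.
    return torch.max(x)

x = torch.rand(64) > 0.5
torch.testing.assert_close(torch.compile(fn)(x), fn(x))
```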