As the title says, this PR enables vectorization for the situation where the index_expr depends on the vectorized itervar. There are two cases here:
1. The vectorized itervar has constant stride in the index_expr. We vectorize the index_expr with `Vectorized<int32>::arange` for this case.
2. Otherwise, we load the index_expr vector in a non-contiguous way with a loop.
Below is the generated code for the first case from the test `test_concat_inner_vec`. Here `x1` is the index_expr and depends on the vectorized itervar `x1`. It has constant stride 1. We vectorized it with arange. We use `all_zero` to implement a short-cut for masks to avoid unnecessary execution of nested masked regions which are invalid.
Before:
```c++
#pragma omp for collapse(2)
for(long x0=static_cast<long>(0L); x0<static_cast<long>(32L); x0+=static_cast<long>(1L))
{
for(long x1=static_cast<long>(0L); x1<static_cast<long>(155L); x1+=static_cast<long>(1L))
{
auto tmp0 = c10::convert<long>(x1);
auto tmp1 = static_cast<long>(0);
auto tmp2 = tmp0 >= tmp1;
auto tmp3 = static_cast<long>(35);
auto tmp4 = tmp0 < tmp3;
auto tmp5 = [&]
{
auto tmp6 = in_ptr0[static_cast<long>(x1 + (35L*x0))];
return tmp6;
}
;
auto tmp7 = tmp4 ? tmp5() : static_cast<decltype(tmp5())>(0.0);
auto tmp8 = tmp0 >= tmp3;
auto tmp9 = static_cast<long>(155);
auto tmp10 = tmp0 < tmp9;
auto tmp11 = [&]
{
auto tmp12 = in_ptr1[static_cast<long>((-35L) + x1 + (120L*x0))];
return tmp12;
}
;
...
```
After:
```c++
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(32L); x0+=static_cast<long>(1L))
{
for(long x1=static_cast<long>(0L); x1<static_cast<long>(144L); x1+=static_cast<long>(16L))
{
auto tmp0 = c10::convert<int>(x1);
auto tmp1 = at::vec::Vectorized<int32_t>::arange(tmp0, 1);
auto tmp2 = static_cast<int>(0);
auto tmp3 = at::vec::Vectorized<int>(tmp2);
auto tmp4 = to_float_mask(tmp1 >= tmp3);
auto tmp5 = static_cast<int>(35);
auto tmp6 = at::vec::Vectorized<int>(tmp5);
auto tmp7 = to_float_mask(tmp1 < tmp6);
auto tmp8 = [&]
{
auto tmp9 = masked_load(in_ptr0 + static_cast<long>(x1 + (35L*x0)), to_float_mask(tmp7));
return tmp9;
}
;
auto tmp10 =
[&]
{
if (all_zero(to_float_mask(tmp7)))
{
return at::vec::Vectorized<float>(static_cast<float>(0.0));
}
else
{
return decltype(tmp8())::blendv(at::vec::Vectorized<float>(static_cast<float>(0.0)), tmp8(), to_float_mask(tmp7));
}
}
()
;
...
```
Below is the generated code for the second case from the test case `test_expr_vec_non_contiguous`. Here, the index_expr is `31L + (63L*(c10::div_floor_integer(x1, 32L))) + (c10::div_floor_integer(x2, 32L))` which depends on the vectorized itervar `x2` and doesn't have constant stride. So, we load the index_expr vector with a loop. (In fact, this can be further optimized since the index_expr is invariant with the data points in the range [x2, x2+16). So it can be regarded as a scalar. This will be optimized in the follow-up PR.) The code uses `vector_lane_mask_check` to implement the masked version of non-contiguous load.
Before:
```c++
#pragma omp for collapse(2)
for(long x0=static_cast<long>(0L); x0<static_cast<long>(4L); x0+=static_cast<long>(1L))
{
for(long x1=static_cast<long>(0L); x1<static_cast<long>(1024L); x1+=static_cast<long>(1L))
{
{
float tmp_acc0 = -std::numeric_limits<float>::infinity();
for(long x2=static_cast<long>(0L); x2<static_cast<long>(1024L); x2+=static_cast<long>(1L))
{
auto tmp0 = c10::convert<long>(31L + (63L*(c10::div_floor_integer(x1, 32L))) + (c10::div_floor_integer(x2, 32L)));
auto tmp1 = static_cast<long>(2048);
auto tmp2 = tmp0 < tmp1;
auto tmp3 = [&]
{
auto tmp4 = in_ptr0[static_cast<long>(31L + (63L*(c10::div_floor_integer(x1, 32L))) + (2048L*(static_cast<long>(x1) % static_cast<long>(32L))) + (65536L*x0) + (c10::div_floor_integer(x2, 32L)))];
return tmp4;
}
;
auto tmp5 = tmp2 ? tmp3() : static_cast<decltype(tmp3())>(0.0);
tmp_acc0 = max_propagate_nan(tmp_acc0, tmp5);
}
out_ptr0[static_cast<long>(x1 + (1024L*x0))] = tmp_acc0;
}
}
}
```
After:
```c++
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(4L); x0+=static_cast<long>(1L))
{
for(long x1=static_cast<long>(0L); x1<static_cast<long>(1024L); x1+=static_cast<long>(16L))
{
{
#pragma omp declare reduction(max:at::vec::Vectorized<float>:omp_out = at::vec::maximum(omp_out, omp_in)) initializer(omp_priv={at::vec::Vectorized<float>(-std::numeric_limits<float>::infinity())})
float tmp_acc0 = -std::numeric_limits<float>::infinity();
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(-std::numeric_limits<float>::infinity());
for(long x2=static_cast<long>(0L); x2<static_cast<long>(1024L); x2+=static_cast<long>(1L))
{
auto tmp0 =
[&]
{
__at_align__ std::array<int, 16> tmpbuf;
#pragma GCC unroll 16
for (long x1_inner = 0; x1_inner < 16; x1_inner++)
{
tmpbuf[x1_inner] = static_cast<long>(31L + (63L*(c10::div_floor_integer((x1 + x1_inner), 32L))) + (c10::div_floor_integer(x2, 32L)));
}
return at::vec::Vectorized<int>::loadu(tmpbuf.data());
}
()
;
auto tmp1 = static_cast<int>(2048);
auto tmp2 = at::vec::Vectorized<int>(tmp1);
auto tmp3 = to_float_mask(tmp0 < tmp2);
auto tmp4 = [&]
{
auto tmp5 =
[&]
{
__at_align__ std::array<float, 16> tmpbuf;
#pragma GCC unroll 16
for (long x1_inner = 0; x1_inner < 16; x1_inner++)
{
if (vector_lane_mask_check(tmp3, x1_inner))
{
tmpbuf[x1_inner] = in_ptr0[static_cast<long>(31L + (63L*(c10::div_floor_integer((x1 + x1_inner), 32L))) + (2048L*(static_cast<long>((x1 + x1_inner)) % static_cast<long>(32L))) + (65536L*x0) + (c10::div_floor_integer(x2, 32L)))];
}
}
return at::vec::Vectorized<float>::loadu(tmpbuf.data());
}
()
;
return tmp5;
}
;
auto tmp6 =
[&]
{
if (all_zero(to_float_mask(tmp3)))
{
return at::vec::Vectorized<float>(static_cast<float>(0.0));
}
else
{
return decltype(tmp4())::blendv(at::vec::Vectorized<float>(static_cast<float>(0.0)), tmp4(), to_float_mask(tmp3));
}
}
()
;
tmp_acc0_vec = at::vec::maximum(tmp_acc0_vec, tmp6);
}
tmp_acc0_vec.store(out_ptr0 + static_cast<long>(x1 + (1024L*x0)));
}
}
}
}
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/114545
Approved by: https://github.com/lezcano
For embedding lookup, there is indirect indexing with indices that are invariant to the vectorized itervar. To vectorize it, we need to keep the related indexing variables as scalars and allow vectorization when the related index_exprs are invariant to the vectorized itervar.
This PR adds the support by lazily broadcasting scalar values (index_expr and constant) to vectors so that vector operations are only generated if needed by `CppVecKernel` when any of the inputs are vectors, otherwise, scalar ops are generated. The cse variable in cpp is now represented with `CppCSEVariable`, which keeps track of the itervars relevant to the variable and has a flag to mark whether it is a scalar or a vector. `CppVecOverrides` is improved to propagate these states when the ops are executed.
For the added UT `test_embedding_vec`, the generated code before this PR is:
```c++
extern "C" void kernel(const long* in_ptr0,
const float* in_ptr1,
const float* in_ptr2,
float* out_ptr0)
{
#pragma omp parallel num_threads(64)
{
{
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(128L); x0+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(128L); x1+=static_cast<long>(1L))
{
auto tmp0 = in_ptr0[static_cast<long>(x0)];
auto tmp5 = in_ptr2[static_cast<long>(x1 + (128L*x0))];
auto tmp1 = decltype(tmp0)(tmp0 + 64);
auto tmp2 = tmp0 < 0;
auto tmp3 = tmp2 ? tmp1 : tmp0;
TORCH_CHECK((0 <= tmp3) & (tmp3 < 64L), "index out of bounds: 0 <= tmp3 < 64L")
auto tmp4 = in_ptr1[static_cast<long>(x1 + (128L*tmp3))];
auto tmp6 = decltype(tmp4)(tmp4 + tmp5);
out_ptr0[static_cast<long>(x1 + (128L*x0))] = tmp6;
}
}
}
}
}
```
After this PR, we have:
```c++
extern "C" void kernel(const long* in_ptr0,
const float* in_ptr1,
const float* in_ptr2,
float* out_ptr0)
{
#pragma omp parallel num_threads(64)
{
{
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(128L); x0+=static_cast<long>(1L))
{
for(long x1=static_cast<long>(0L); x1<static_cast<long>(128L); x1+=static_cast<long>(16L))
{
auto tmp0 = in_ptr0[static_cast<long>(x0)];
auto tmp5 = at::vec::Vectorized<float>::loadu(in_ptr2 + static_cast<long>(x1 + (128L*x0)));
auto tmp1 = decltype(tmp0)(tmp0 + 64);
auto tmp2 = tmp0 < 0;
auto tmp3 = tmp2 ? tmp1 : tmp0;
TORCH_CHECK((0 <= tmp3) & (tmp3 < 64L), "index out of bounds: 0 <= tmp3 < 64L")
auto tmp4 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<long>(x1 + (128L*tmp3)));
auto tmp6 = tmp4 + tmp5;
tmp6.store(out_ptr0 + static_cast<long>(x1 + (128L*x0)));
}
}
}
}
}
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/114062
Approved by: https://github.com/jansel
For embedding lookup, there is indirect indexing with indices that are invariant to the vectorized itervar. To vectorize it, we need to keep the related indexing variables as scalars and allow vectorization when the related index_exprs are invariant to the vectorized itervar.
This PR adds the support by lazily broadcasting scalar values (index_expr and constant) to vectors so that vector operations are only generated if needed by `CppVecKernel` when any of the inputs are vectors, otherwise, scalar ops are generated. The cse variable in cpp is now represented with `CppCSEVariable`, which keeps track of the itervars relevant to the variable and has a flag to mark whether it is a scalar or a vector. `CppVecOverrides` is improved to propagate these states when the ops are executed.
For the added UT `test_embedding_vec`, the generated code before this PR is:
```c++
extern "C" void kernel(const long* in_ptr0,
const float* in_ptr1,
const float* in_ptr2,
float* out_ptr0)
{
#pragma omp parallel num_threads(64)
{
{
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(128L); x0+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long x1=static_cast<long>(0L); x1<static_cast<long>(128L); x1+=static_cast<long>(1L))
{
auto tmp0 = in_ptr0[static_cast<long>(x0)];
auto tmp5 = in_ptr2[static_cast<long>(x1 + (128L*x0))];
auto tmp1 = decltype(tmp0)(tmp0 + 64);
auto tmp2 = tmp0 < 0;
auto tmp3 = tmp2 ? tmp1 : tmp0;
TORCH_CHECK((0 <= tmp3) & (tmp3 < 64L), "index out of bounds: 0 <= tmp3 < 64L")
auto tmp4 = in_ptr1[static_cast<long>(x1 + (128L*tmp3))];
auto tmp6 = decltype(tmp4)(tmp4 + tmp5);
out_ptr0[static_cast<long>(x1 + (128L*x0))] = tmp6;
}
}
}
}
}
```
After this PR, we have:
```c++
extern "C" void kernel(const long* in_ptr0,
const float* in_ptr1,
const float* in_ptr2,
float* out_ptr0)
{
#pragma omp parallel num_threads(64)
{
{
#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(128L); x0+=static_cast<long>(1L))
{
for(long x1=static_cast<long>(0L); x1<static_cast<long>(128L); x1+=static_cast<long>(16L))
{
auto tmp0 = in_ptr0[static_cast<long>(x0)];
auto tmp5 = at::vec::Vectorized<float>::loadu(in_ptr2 + static_cast<long>(x1 + (128L*x0)));
auto tmp1 = decltype(tmp0)(tmp0 + 64);
auto tmp2 = tmp0 < 0;
auto tmp3 = tmp2 ? tmp1 : tmp0;
TORCH_CHECK((0 <= tmp3) & (tmp3 < 64L), "index out of bounds: 0 <= tmp3 < 64L")
auto tmp4 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<long>(x1 + (128L*tmp3)));
auto tmp6 = tmp4 + tmp5;
tmp6.store(out_ptr0 + static_cast<long>(x1 + (128L*x0)));
}
}
}
}
}
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/114062
Approved by: https://github.com/jansel
ghstack dependencies: #113950
This was originally @jansel's PR:
https://github.com/pytorch/pytorch/pull/102625, which I've built upon.
This diff implements static memory planning. It's disabled by default
while we examine its performance.
We use a greedy-by-size approach. For dynamic shapes, the sizes of the
example inputs are used as estimates when making planning decisions. We
generate expressions to calculate the actual memory offsets and sizes at
runtime when the values of the dynamic shapes are known. In order to
simplify these calculations, we have organized the allocations into a
tree that branches on space (address offsets) and time (live ranges).
Finally, we need to align these offsets, so we have added an `align`
sympy Expr to express these calculations.
Some limitations:
1. It is only enabled during inference for now. Enabling it for training
increases peak memory usage as we allocate all the memory needed for
training upfront, before freeing the memory allocated during
inference. We can probably address this by doing planning for both
the inference and training passes together.
2. It doesn't work with PyTorch Distributed, because kernels like
AllGatherIntoTensor codegen strings which do memory operations. We
can fix this down the line by having them emit MemoryPlanningLines
instead.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/112178
Approved by: https://github.com/desertfire, https://github.com/jansel
Fixes https://github.com/pytorch/pytorch/issues/112449
elementwise_type_promotion_wrapper will promote `aten.normal` to the dtypes of `mean`, `std` args.
But this is incorrect if we provide the dtype param. Hence, we allow overriding the result_dtype if a specified dtype arg is available.
This problem is unique to `aten.normal`, all other ops decorated do not have a dtype param.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/112467
Approved by: https://github.com/lezcano
This was originally @jansel's PR:
https://github.com/pytorch/pytorch/pull/102625, which I've built upon.
This diff implements static memory planning. It's disabled by default
while we examine its performance.
We use a greedy-by-size approach. For dynamic shapes, the sizes of the
example inputs are used as estimates when making planning decisions. We
generate expressions to calculate the actual memory offsets and sizes at
runtime when the values of the dynamic shapes are known. In order to
simplify these calculations, we have organized the allocations into a
tree that branches on space (address offsets) and time (live ranges).
Finally, we need to align these offsets, so we have added an `align`
sympy Expr to express these calculations.
Some limitations:
1. It is only enabled during inference for now. Enabling it for training
increases peak memory usage as we allocate all the memory needed for
training upfront, before freeing the memory allocated during
inference. We can probably address this by doing planning for both
the inference and training passes together.
2. It doesn't work with PyTorch Distributed, because kernels like
AllGatherIntoTensor codegen strings which do memory operations. We
can fix this down the line by having them emit MemoryPlanningLines
instead.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/112178
Approved by: https://github.com/desertfire, https://github.com/jansel
Fixes #110611.
The current Torchinductor's `log` implementations will call `sleef` functions in `aten::Vec` which show worse performance than Aten's `log` implementations that invoke `MKL` functions. The reason is that the `sleef` algorithms sacrifice performance in order to have a higher precision. This PR changes Torchinductor's `log` implementations from the `sleef` functions with `1.0` ULP error bound to the ones with `3.5` ULP error bound.
**Performance**
Machine: ICX
The original perf number, perf with `Sleef_logf16_u10`:
```bash
numactl -C0 python test.py
log
eager: 368.8463559374213
compiled: 616.8672097846866
logit
eager: 565.499295014888
compiled: 1010.4096410796046
```
Perf with `Sleef_logf16_u35`:
```bash
numactl -C0 python test.py
log
eager: 364.8629770614207
compiled: 360.2141812443733
logit
eager: 562.3160391114652
compiled: 545.2622110024095
```
**Accuracy**
error_bound | tol=1e-6 | tol=1e-7
-- | -- | --
1.0 ULP | PASS | FAIL
3.5 ULP | PASS | FAIL
Pull Request resolved: https://github.com/pytorch/pytorch/pull/111898
Approved by: https://github.com/jgong5, https://github.com/desertfire, https://github.com/jansel
Improves perf of llama_v2 locally from 1.55 -> 1.57
The initial heuristic is to lower to pointwise if # of inputs is <= 4, and all the inputs are pointwise or cannot be memory planned away, or if all the outputs are pointwise.
Perf run was +3% on inference. There are definitely instances where we should be lowering to foreach_kernels, but it's less flexible for fusion. The motivating example was:
```
def rotate_half(x):
"""Rotates half the hidden dims of the input."""
x1 = x[..., : x.shape[-1] // 2]
x2 = x[..., x.shape[-1] // 2 :]
return torch.cat((-x2, x1), dim=-1)
def apply_rotary_pos_emb(q, k, cos, sin):
iota = torch.ops.prims.iota.default(512, start = 0, step = 1, dtype = torch.int64, device = device(type='cuda', index=0), requires_grad = False)
# File: /scratch/eellison/work/torchdynamo/lib/python3.8/site-packages/transformers/models/llama/modeling_llama.py:657, code: position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
unsqueeze = torch.ops.aten.unsqueeze.default(iota, 0)
position_ids = torch.ops.aten.reshape.default(unsqueeze, [-1, 512]); unsqueeze = None
# The first two dimensions of cos and sin are always 1, so we can `squeeze` them.
cos = cos.squeeze(1).squeeze(0) # [seq_len, dim]
sin = sin.squeeze(1).squeeze(0) # [seq_len, dim]
cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim]
sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim]
q_embed = (q * cos) + (rotate_half(q) * sin)
k_embed = (k * cos) + (rotate_half(k) * sin)
return q_embed, k_embed
```
Also not sure if I should be more worried about concatting reduction->pointwise inputs.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/111233
Approved by: https://github.com/Chillee
For the inductor CPU backend, scatter_add uses ```atomic_add```, which gives worse performance. For now, we fall back for it to avoid a performance regression compared with eager mode (single socket of SKX):
```
basic_gnn_gin 1.16x(after) Vs 0.509x(before)
basic_gnn_sage 1.064x(after) Vs 0.496x (before)
basic_gnn_gcn 1.373x(after) Vs 0.720x(before)
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/108220
Approved by: https://github.com/jgong5, https://github.com/desertfire
For max_pooling code:
```
#pragma GCC ivdep
for(long i2=static_cast<long>(0L); i2<static_cast<long>(56L); i2+=static_cast<long>(1L))
{
for(long i3=static_cast<long>(0L); i3<static_cast<long>(64L); i3+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<int>(static_cast<int>((-1L) + (2L*i1)));
auto tmp1 = at::vec::Vectorized<int>(static_cast<int>(0));
auto tmp2 = to_float_mask(tmp0 >= tmp1);
auto tmp3 = at::vec::Vectorized<int>(static_cast<int>(112));
auto tmp4 = to_float_mask(tmp0 < tmp3);
auto tmp5 = tmp2 & tmp4;
auto tmp6 = at::vec::Vectorized<int>(static_cast<int>((-1L) + (2L*i2)));
auto tmp7 = to_float_mask(tmp6 >= tmp1);
auto tmp8 = to_float_mask(tmp6 < tmp3);
auto tmp9 = tmp7 & tmp8;
auto tmp10 = tmp5 & tmp9;
auto tmp11 = [&]
{
auto tmp12 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>((-7232L) + i3 + (128L*i2) + (14336L*i1) + (802816L*i0)), 16);
load
auto tmp13 = cvt_lowp_fp_to_fp32<bfloat16>(tmp12);
return tmp13;
}
;
auto tmp14 = decltype(tmp11())::blendv(at::vec::Vectorized<float>(-std::numeric_limits<float>::infinity()), tmp11(), to_float_mask(tmp10));
```
The index used for ```tmp12``` may not be a correct index: for example, with ```i1=0, i2=0, i3=0```, the index is ```-7232L```, which is not a valid index. We may hit a segmentation fault when we call ```tmp11()```. The original behavior was that the value is only fetched when ```tmp10``` (the index check variable) is true, so it could be read safely. This PR adds support for masked_load to fix this issue.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/107670
Approved by: https://github.com/jgong5, https://github.com/jansel
Fix cpp wrapper failure on `clip` in Torchbench:
```
RuntimeError: tensor does not have a device
```
An `optional<at::Tensor>` variable with value equal to `at::Tensor()` will be considered as _contains value_. When it's converted to `bool`, it returns `true`. While for `None` in python, when converting it to `bool`, `false` is returned.
Fix it to be an optional variable that _does not contain a value_.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/106847
Approved by: https://github.com/jgong5, https://github.com/jansel
D47969512 was the original diff to revert this, but the diff train doesn't work well, so I have to split it into two parts: this OSS PR and another separate diff to revert the fbcode change.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/106562
Approved by: https://github.com/angelayi