pytorch/torch/_inductor/decomposition.py
andrewor14 7b4f70eda5 Batch Norm Consolidation (#116092)
**Summary:**

This commit simplifies the existing decomposition hierarchy
of batch norm ops by adding a single, backend agnostic op:
`batch_norm_with_update`. The existing hierarchy looks like:

```
aten.batch_norm ->
aten._batch_norm_impl_index ->
[
  aten.native_batch_norm ->
  aten._native_batch_norm_legit (export only) ->
  _batch_norm_legit_cpu/cuda (kernels, export only) ->
  _batch_norm_cpu/cuda (kernels)
] OR
[ aten.cudnn_batch_norm ] OR
[ aten.miopen_batch_norm ]
```

Aside from complexity, an important problem with the
above decomposition hierarchy is cuda numerics in
export flows. We observed significantly worse convergence
when training a mobilenetv2-like model when using the
`_batch_norm_cuda` kernel instead of the `cudnn_batch_norm`
kernel. This means users who export their models on CPU
first then move the models to cuda later may silently
see worse accuracies even when cudnn is installed,
because they are using the worse kernel. This issue is
summarized in https://github.com/pytorch/pytorch/issues/111384.

Instead, the new hierarchy proposed by consolidating
existing batch norm ops will look like:

```
aten.batch_norm ->
aten.batch_norm_with_update ->
[ _batch_norm_cpu (kernel) ] OR
[ _batch_norm_cuda (kernel) ] OR
[ cudnn_batch_norm (kernel) ] OR
[ miopen_batch_norm (kernel) ]
```

The new op `batch_norm_with_update` hides backend
implementation details and automatically picks the right
kernel based on what is installed. This commit also adds
the following variants to this op:

```
batch_norm_with_update_functional
batch_norm_with_update.out
batch_norm_no_update
batch_norm_no_update.out
batch_norm_backward
```

Note that this commit only adds this op and its variants,
but does not actually change the decomps to produce these
ops in the graph. This will be done after the 2 week FC
window, and the ops used in the old stack is planned to
be removed after the 6 month BC window.

Test Plan: `OpInfo` tests for `batch_norm_with_update`.

Reviewers: albanD, bdhirsh

Subscribers: albanD, bdhirsh, supriyar

Tasks: https://github.com/pytorch/pytorch/issues/111384

Co-authored-by: Tugsbayasgalan Manlaibaatar <tmanlaibaatar@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/116092
Approved by: https://github.com/bdhirsh, https://github.com/albanD
2024-03-08 15:07:15 +00:00

683 lines
21 KiB
Python

import functools
import logging
import math
import sys
import typing
from typing import Optional
import torch
import torch._decomp as decomp
import torch._prims_common as utils
import torch.ao.quantization.fx._decomposed
from torch._decomp import (
core_aten_decompositions,
get_decompositions,
remove_decompositions,
)
from torch._decomp.decompositions import (
_grid_sampler_2d as decomp_grid_sampler_2d,
pw_cast_for_opmath,
)
from torch._decomp.decompositions_for_rng import extra_random_decomps
from torch._higher_order_ops.out_dtype import out_dtype
from torch._prims_common import (
elementwise_dtypes,
ELEMENTWISE_TYPE_PROMOTION_KIND,
type_to_dtype,
)
from . import config, inductor_prims
log = logging.getLogger(__name__)
aten = torch.ops.aten
prims = torch.ops.prims
quantized_decomposed = torch.ops.quantized_decomposed
inductor_decompositions = get_decompositions(
[
aten._adaptive_avg_pool2d_backward,
aten.arange,
aten.bitwise_and_,
aten.bitwise_or_,
aten.clamp_min_,
aten.dist,
aten.empty_like,
aten.flip,
aten.gelu,
aten.hardtanh,
aten.index_select,
aten.lcm,
aten.leaky_relu,
aten.linalg_vector_norm,
aten._log_softmax,
aten.max_pool2d_with_indices_backward,
aten._native_batch_norm_legit,
aten._native_batch_norm_legit_functional,
aten._native_batch_norm_legit_no_training,
aten._batch_norm_with_update,
aten._batch_norm_with_update_functional,
aten._batch_norm_no_update,
aten.batch_norm_backward,
aten.native_batch_norm,
aten.native_group_norm,
aten.native_layer_norm,
aten.nll_loss2d_backward,
aten._softmax,
aten.sin_,
aten.sqrt_,
out_dtype,
aten._to_copy,
aten.tril_indices,
aten.triu_indices,
aten.upsample_bilinear2d.vec,
]
)
decompositions = {**core_aten_decompositions(), **inductor_decompositions}
# Remove unwanted decompositions included via the core ATen decompositions from
# the Inductor decomp table.
decomps_to_exclude = [
aten._unsafe_index,
aten._scaled_dot_product_flash_attention_for_cpu.default, # See comments in torch/_decomp/decompositions.py
aten.clamp_max,
aten.clamp_min,
aten.glu, # inductor lowers this directly
aten.split.Tensor, # inductor lowers this directly
aten.squeeze, # inductor lowers this directly
aten.sum, # inductor lowers this directly
aten.unbind, # inductor lowers this directly
]
remove_decompositions(decompositions, decomps_to_exclude)
def register_decomposition(ops):
for op in [ops] if callable(ops) else ops:
if op in decompositions:
log.warning("duplicate decomp: %s", ops)
return decomp.register_decomposition(ops, decompositions)
# TODO: for now, inductor doesn't handle asserts
# because the condition is symbool -> tensor in the graph.
@register_decomposition([aten._assert_async.msg])
def assert_async_msg_decomp(tensor, msg):
return
# Following `assert_async_msg_decomp` and implement as non-op.
@register_decomposition([aten._functional_assert_async.msg])
def functional_assert_async_msg_decomp(tensor, msg):
return
@register_decomposition([aten.sym_constrain_range_for_size.default])
def sym_constrain_range_for_size(symbol, *, min=None, max=None):
return
@register_decomposition([aten.clamp])
@pw_cast_for_opmath
def clamp(x, min=None, max=None):
if min is not None:
x = x.clamp_min(min)
if max is not None:
x = x.clamp_max(max)
return x
@register_decomposition([aten.full])
def full(size, fill_value, **kwargs):
dtype = kwargs.get("dtype")
if dtype is None:
kwargs["dtype"] = type_to_dtype(type(fill_value))
return aten.full(size, fill_value, **kwargs)
return NotImplemented
# Not really sure how to put this into the main library. PrimTorch wants
# empty_permuted to go to the prim, and typically users don't really want
# to decompose to empty_strided (but inductor is OK with it, because we are
# cool with strides and everything goes to empty_strided)
@register_decomposition([aten.empty_permuted.default])
def empty_permuted(size, physical_layout, **kwargs):
perm = [0] * len(size)
for p, l in enumerate(physical_layout):
perm[l] = p
return torch.empty([size[l] for l in physical_layout], **kwargs).permute(perm)
@register_decomposition([aten.convolution_backward])
def convolution_backward(
grad_output,
input,
weight,
bias_sizes,
stride,
padding,
dilation,
transposed,
output_padding,
groups,
output_mask,
):
if not output_mask[2] or grad_output.device.type != "cuda":
return NotImplemented
grad_bias = aten.sum(grad_output, [0] + list(range(2, grad_output.dim())))
grad_inp, grad_weight, _ = aten.convolution_backward(
grad_output,
input,
weight,
bias_sizes,
stride,
padding,
dilation,
transposed,
output_padding,
groups,
[output_mask[0], output_mask[1], False],
)
return (grad_inp, grad_weight, grad_bias)
@register_decomposition([aten.log2])
def log2(x):
return torch.log(x) * (1.0 / math.log(2.0))
@register_decomposition([aten.round.decimals])
def round_dec(x, decimals=0):
ten_pow_decimals = 10.0**decimals
return aten.round(x * ten_pow_decimals) * (1.0 / ten_pow_decimals)
@register_decomposition([aten.bmm])
@pw_cast_for_opmath
def bmm(self, batch2):
if config.coordinate_descent_tuning:
if self.shape[1] == 1 or batch2.shape[2] == 1:
out = (self.unsqueeze(-1) * batch2.unsqueeze(1)).sum(dim=2)
return out
if self.device.type == "cpu":
if self.size(1) == 1 and batch2.size(-1) == 1:
return torch.sum(
self.squeeze(1) * batch2.squeeze(-1), dim=1, keepdim=True
).unsqueeze(1)
return NotImplemented
@register_decomposition([aten.addmm])
@pw_cast_for_opmath
def addmm(self, mat1, mat2, beta=1, alpha=1):
if self.device.type == "cpu":
if mat1.size(0) == 1 and mat2.size(-1) == 1:
out = torch.sum(
mat1.squeeze(0) * mat2.squeeze(-1), dim=0, keepdim=True
).unsqueeze(0)
return alpha * out + beta * self
if mat1.size(0) == 1 and mat2.size(0) <= 16 and mat2.size(1) <= 16:
out = (mat1.T * mat2).sum(dim=0, keepdim=True)
return alpha * out + beta * self
return NotImplemented
@register_decomposition([aten.mm])
@pw_cast_for_opmath
def mm(self, input2):
from torch.fx.experimental.symbolic_shapes import (
definitely_true,
guard_size_oblivious,
)
# Our matrix vector multiplies only achieve peak bandwidth with coordinate descent tuning.
# todo: Look into why and fix it (hopefully)
if config.coordinate_descent_tuning:
if self.shape[0] == 1 or input2.shape[1] == 1:
return (self.unsqueeze(2) * input2.unsqueeze(0)).sum(dim=1)
if self.device.type == "cpu":
if (
guard_size_oblivious(self.size(-1) == 1)
and guard_size_oblivious(self.size(0) > 0)
and guard_size_oblivious(input2.size(0) == 1)
and (self.dtype == input2.dtype)
and definitely_true((torch.numel(self) + torch.numel(input2)) <= 32)
):
return torch.cat([self[i, :] * input2 for i in range(self.size(0))])
if guard_size_oblivious(self.size(0) == 1) and guard_size_oblivious(
input2.size(-1) == 1
):
return torch.sum(
self.squeeze(0) * input2.squeeze(-1), dim=0, keepdim=True
).unsqueeze(0)
return NotImplemented
# This pass does two things:
# - Eliminate cat when there is only one tensor input
# - Normalize cat calls, so that legacy empty 1-D tensors are removed (NB: we
# don't remove ALL empty tensors, only the naughty ones)
@register_decomposition([aten.cat.default])
def cat(tensors, dim=0):
from torch.fx.experimental.symbolic_shapes import guard_size_oblivious
def non_empty_tensor(x):
# For better or worse, this is a valid cat:
#
# torch.cat([torch.randn(2, 2, 4), torch.randn(0), torch.randn(3, 2, 4)])
#
# We'd like to eliminate naughtiness like this for downstream passes
# like split_cat. The easiest way is to just drop such inputs
# (guarding that they are non-zero).
#
# Is it permissible for this filtering to be size-oblivious? A case
# where this could matter is cat([(2, 2), (u0,)], dim=0); if u0
# happened to be zero, we would have liked to have filtered it out.
# But actually, the ONLY way this could have passed is if u0 == 0,
# so by the time we get here we have already installed a deferred
# runtime assert forcing u0 to be zero. So if this hasn't happened,
# we know that the unbacked SymInt has appropriate size and there are
# no problems.
return len(x.shape) != 1 or guard_size_oblivious(x.shape[0] > 0)
filtered_tensors = list(filter(non_empty_tensor, tensors))
if len(filtered_tensors) == 1:
return filtered_tensors[0].clone()
elif 1 < len(filtered_tensors) < len(tensors):
# on the first call, when we remove empty tensors, we redispatch recursively
return aten.cat.default(filtered_tensors, dim)
# when no 'filtering' has occurred, we raise to prevent infinite recursion (no more decomposition needed)
return NotImplemented
@register_decomposition([aten.angle])
def angle(x):
if x.is_complex():
return torch.where(
torch.isnan(x.real), float("nan"), torch.atan2(x.imag, x.real)
)
# when x is real number
# if x >= 0, return 0
# if x < 0, return pi
# if x is nan, return nan
_, dtype = elementwise_dtypes(
x,
type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
)
pi = torch.scalar_tensor(math.pi, dtype=dtype, device=x.device)
ret = torch.where(x < 0, pi, 0.0)
return torch.where(torch.isnan(x), float("nan"), ret)
@register_decomposition([aten.add])
def add(x, y, *, alpha=None):
x_is_complex_tensor = torch.is_tensor(x) and x.is_complex()
y_is_complex_tensor = torch.is_tensor(y) and y.is_complex()
if not x_is_complex_tensor or not y_is_complex_tensor:
return NotImplemented
z = y
if alpha is not None:
z = alpha * y
complex_type = torch.promote_types(x.dtype, y.dtype)
return (x.view(x.real.dtype) + z.view(y.real.dtype)).view(complex_type)
@register_decomposition([aten.conj_physical])
def conj_physical(self):
assert not self.is_complex(), "TODO: implement this"
return self
@register_decomposition([aten.lift, aten.detach_])
def lift(self):
return self
@register_decomposition([aten.bernoulli.default])
def bernoulli(self, *, generator=None):
assert generator is None
return (torch.rand_like(self, dtype=torch.float32) < self).to(self.dtype)
@register_decomposition([aten.fmin, prims.fmin])
def fmin(self, other):
return torch.where(torch.isnan(other) | (other > self), self, other)
@register_decomposition([aten.fmax, prims.fmax])
def fmax(self, other):
return torch.where(torch.isnan(other) | (other < self), self, other)
@register_decomposition(aten.amax)
def amax(self, dim=None, keepdim=False):
if self.dtype == torch.bool:
return torch.any(self, dim=dim, keepdim=keepdim)
return NotImplemented
@register_decomposition(aten.amin)
def amin(self, dim=None, keepdim=False):
if self.dtype == torch.bool:
return torch.all(self, dim=dim, keepdim=keepdim)
return NotImplemented
@register_decomposition([aten.narrow_copy])
def narrow_copy(self, dim, start, length):
return torch.narrow(self, dim, start, length).clone()
@register_decomposition([aten.expand_copy])
def expand_copy(self, size, *, implicit=False):
return aten.expand(self, size, implicit=implicit).clone()
@register_decomposition([aten.view_copy.default])
def view_copy_default(self, size):
return aten.view(self, size).clone()
@register_decomposition([aten.view_copy.dtype])
def view_copy_dtype(self, dtype):
return self.to(dtype).clone()
def get_like_layout(
tensor: torch.Tensor, memory_format: Optional[torch.memory_format]
) -> torch.memory_format:
# TODO: _to_copy tensor to stride permutation
if memory_format is torch.preserve_format or memory_format is None:
return utils.suggest_memory_format(tensor)
else:
return memory_format
@register_decomposition(aten.rand_like)
def rand_like(self, *, dtype=None, device=None, memory_format=None, **kwargs):
return torch.rand(
[*self.size()],
dtype=dtype or self.dtype,
device=device or self.device,
**kwargs,
).to(memory_format=get_like_layout(self, memory_format))
@register_decomposition(aten.randn_like)
def randn_like(self, *, dtype=None, device=None, memory_format=None, **kwargs):
return torch.randn(
[*self.size()],
dtype=dtype or self.dtype,
device=device or self.device,
**kwargs,
).to(memory_format=get_like_layout(self, memory_format))
@register_decomposition(aten.full_like)
def full_like(
self,
fill_value,
*,
dtype=None,
layout=None,
device=None,
pin_memory=False,
requires_grad=False,
memory_format=torch.preserve_format,
):
return torch.full(
[*self.size()],
fill_value,
dtype=dtype or self.dtype,
layout=layout or self.layout,
device=device or self.device,
requires_grad=requires_grad,
).to(memory_format=get_like_layout(self, memory_format))
@register_decomposition(aten.randint_like.default)
def randint_like(self, high, *, dtype=None, device=None, memory_format=None, **kwargs):
return aten.randint.low(
0,
high,
[*self.size()],
dtype=dtype or self.dtype,
device=device or self.device,
**kwargs,
).to(memory_format=get_like_layout(self, memory_format))
@register_decomposition(aten.randint_like.low_dtype)
def randint_like_low(
self, low, high, *, dtype=None, device=None, memory_format=None, **kwargs
):
return aten.randint.low(
low,
high,
[*self.size()],
dtype=dtype or self.dtype,
device=device or self.device,
**kwargs,
).to(memory_format=get_like_layout(self, memory_format))
@register_decomposition(aten.randint.default)
def randint(high, size, **kwargs):
return aten.randint.low(0, high, size, **kwargs)
# The difference between quantize_per_tensor.default and quantize_per_tensor.tensor is
# scale and zero_point is scalar or scalar tensor
@register_decomposition(quantized_decomposed.quantize_per_tensor.default)
def quantize_per_tensor_default_decomp_impl(
input: torch.Tensor,
scale: float,
zero_point: int,
quant_min: int,
quant_max: int,
dtype: torch.dtype,
) -> torch.Tensor:
if input.dtype == torch.bfloat16:
input = input.to(torch.float32)
inv_scale = 1.0 / scale
return torch.clamp(
torch.round(input * inv_scale) + zero_point, quant_min, quant_max
).to(dtype)
# The difference between dequantize_per_tensor.default and dequantize_per_tensor.tensor is
# scale and zero_point is scalar or scalar tensor
@register_decomposition(quantized_decomposed.dequantize_per_tensor.default)
def dequantize_per_tensor_default_decomp_impl(
input: torch.Tensor,
scale: float,
zero_point: int,
quant_min: int,
quant_max: int,
dtype: torch.dtype,
) -> torch.Tensor:
return (input.to(torch.float32) - zero_point) * scale
@register_decomposition(quantized_decomposed.quantize_per_tensor.tensor)
def quantize_per_tensor_tensor_decomp_impl(
input: torch.Tensor,
scale: torch.Tensor,
zero_point: torch.Tensor,
quant_min: int,
quant_max: int,
dtype: torch.dtype,
) -> torch.Tensor:
if input.dtype == torch.bfloat16:
input = input.to(torch.float32)
inv_scale = 1.0 / scale
return torch.clamp(
torch.round(input * inv_scale) + zero_point, quant_min, quant_max
).to(dtype)
@register_decomposition(quantized_decomposed.dequantize_per_tensor.tensor)
def dequantize_per_tensor_tensor_decomp_impl(
input: torch.Tensor,
scale: torch.Tensor,
zero_point: torch.Tensor,
quant_min: int,
quant_max: int,
dtype: torch.dtype,
) -> torch.Tensor:
return (input.to(torch.float32) - zero_point.to(torch.int32)) * scale.to(
torch.float32
)
@register_decomposition(torch.ops.quantized.embedding_bag_byte_unpack)
def q_embedding_bag_byte_unpack_decomp(packed):
def bitcast_u8_to_f32(u8):
x, y, z, w = (u8[..., n].to(torch.int32) for n in (0, 1, 2, 3))
if sys.byteorder == "little":
return (x + (y << 8) + (z << 16) + (w << 24)).view(torch.float32)[..., None]
else:
return ((x << 24) + (y << 16) + (z << 8) + w).view(torch.float32)[..., None]
scales = bitcast_u8_to_f32(packed[..., -8:-4])
offsets = bitcast_u8_to_f32(packed[..., -4:])
return packed[..., :-8].to(torch.float32) * scales + offsets
@register_decomposition([aten.grid_sampler_2d])
@pw_cast_for_opmath
def grid_sampler_2d(
a: torch.Tensor,
grid: torch.Tensor,
interpolation_mode: int = 0,
padding_mode: int = 0,
align_corners: bool = False,
) -> torch.Tensor:
# We do not expand the grid (_expand_grid=False) on cpu for performance reasons
# Experimenting locally it was found that compiled CUDA code is accelerated by ~5x
# and CPU code by ~2x on bicubic mode, if we expand the grid from (N, H, W, 2) into (N, C, H, W, 2)
# However, this leads to a slowdown around ~0.8x on CPU bilinear mode, channels first.
# Thus we apply this hack to not expand the grid for this case.
_expand_grid = not (
a.device == torch.device("cpu")
and interpolation_mode == 0
and a.is_contiguous(memory_format=torch.contiguous_format)
)
output = decomp_grid_sampler_2d(
a,
grid=grid,
interpolation_mode=interpolation_mode,
padding_mode=padding_mode,
align_corners=align_corners,
_expand_grid=_expand_grid,
)
return output
@register_decomposition(aten._foreach_addcmul.Scalar)
def _foreach_addcmul_scalar(self, left_tensors, right_tensors, scalar=1):
return aten._foreach_add.List(
self, aten._foreach_mul.List(left_tensors, right_tensors), alpha=scalar
)
@register_decomposition(aten._foreach_addcdiv.Scalar)
def _foreach_addcdiv_scalar(self, left_tensors, right_tensors, scalar=1):
return aten._foreach_add.List(
self, aten._foreach_div.List(left_tensors, right_tensors), alpha=scalar
)
@register_decomposition(aten._foreach_lerp.Scalar)
def _foreach_lerp_scalar(start_tensors, end_tensors, weight):
return aten._foreach_add.List(
start_tensors,
aten._foreach_mul.Scalar(
aten._foreach_sub.List(end_tensors, start_tensors), weight
),
)
@aten.miopen_batch_norm.default.py_impl(torch._C.DispatchKey.Autograd)
@register_decomposition(aten.miopen_batch_norm)
def miopen_batch_norm(
input: torch.Tensor,
weight: torch.Tensor,
bias: typing.Optional[torch.Tensor],
running_mean: typing.Optional[torch.Tensor],
running_var: typing.Optional[torch.Tensor],
training: bool,
exponential_average_factor: float,
epsilon: float,
):
a, b, c = aten.native_batch_norm(
input,
weight,
bias,
running_mean,
running_var,
training,
exponential_average_factor,
epsilon,
)
if training:
return (a, b, c)
return (
a,
weight.new_zeros((0,)),
weight.new_zeros((0,)),
)
@functools.lru_cache(None)
def fast_random_decomps():
return {**decompositions, **extra_random_decomps}
def select_decomp_table():
"""decomps can change based on config"""
if config.fallback_random:
return decompositions
return fast_random_decomps()
@register_decomposition(aten.masked_scatter)
def masked_scatter(self, mask, source):
if self.device.type == "cuda":
# This two-step algorithm is the same as eager CUDA, for eager CPU we
# use a 1-shot serial iteration.
self, mask = aten.broadcast_tensors([self, mask])
source_idx = mask.reshape(-1).cumsum(0) - 1
return inductor_prims.masked_scatter_with_index(self, mask, source_idx, source)
return NotImplemented
@register_decomposition(quantized_decomposed.choose_qparams.tensor)
def choose_qparams_tensor(
input: torch.Tensor, quant_min: int, quant_max: int, eps: float, dtype: torch.dtype
):
min_val, max_val = torch.aminmax(input)
scale = (max_val - min_val) / float(quant_max - quant_min)
scale = torch.max(scale, torch.Tensor([eps]))
zero_point = quant_min - torch.round(min_val / scale).to(torch.int)
zero_point = torch.clamp(zero_point, quant_min, quant_max)
return scale.to(torch.float64), zero_point.to(torch.int64)
@register_decomposition(aten.put)
def put(self, index, source, accumulate=False):
flattened = self.flatten()
flattened = torch.index_put(
flattened, [index], source.reshape(index.shape), accumulate
)
return flattened.reshape(self.shape)
@register_decomposition(aten.put_)
def put_(self, index, source, accumulate=False):
out = aten.put(self, index, source, accumulate=accumulate)
return self.copy_(out)