Mirror of https://github.com/zebrajr/pytorch.git, synced 2025-12-06 12:20:52 +01:00
CUDA BFloat div, addcdiv, addcmul, mean, var (#44758)
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44758

Reviewed By: mruberry

Differential Revision: D23752317

Pulled By: ngimel

fbshipit-source-id: 77992cf991f4e2b4b6839de73ea7e6ce2e1061c6
parent f175830558
commit 7bd8a6913d
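The net effect of this commit is that `div`, `addcdiv`, `addcmul`, `mean`, and `var` accept `torch.bfloat16` tensors on CUDA, where BFloat16 was previously skipped outside ROCm. A minimal smoke test of what the change enables, assuming a CUDA build with bfloat16 support (falls back to CPU for illustration):

import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
a = torch.rand(16, 16, dtype=torch.bfloat16, device=device)
b = torch.rand(16, 16, dtype=torch.bfloat16, device=device)
c = torch.rand(16, 16, dtype=torch.bfloat16, device=device) + 0.5  # keep divisors away from zero

# Each of the ops touched by this commit now runs in bfloat16 on CUDA
print(a.div(c).dtype)                           # torch.bfloat16
print(torch.addcdiv(a, b, c, value=2).dtype)    # torch.bfloat16
print(torch.addcmul(a, b, c, value=2).dtype)    # torch.bfloat16
print(a.mean().dtype, a.var().dtype)            # torch.bfloat16 torch.bfloat16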
@@ -10,24 +10,20 @@ namespace at { namespace native {

 void addcmul_cuda_kernel(TensorIterator& iter, Scalar value) {
   AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, iter.dtype(), "addcmul_cuda", [&]() {
-    AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "addcmul_cuda", [&] {
     auto alpha = value.to<scalar_t>();
     gpu_kernel(iter, [alpha]GPU_LAMBDA(scalar_t a, scalar_t b, scalar_t c) -> scalar_t {
       return a + alpha * b * c;
     });
-    });
   });
 }

 void addcdiv_cuda_kernel(TensorIterator& iter, Scalar value) {
   AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, iter.dtype(), "addcdiv_cuda", [&]() {
-    AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "addcdiv_cuda", [&] {
     auto alpha = value.to<scalar_t>();
     gpu_kernel(iter, [alpha]GPU_LAMBDA(scalar_t a, scalar_t b, scalar_t c) -> scalar_t {
       return a + alpha * (b / c);
     });
-    });
   });
 }

 void smooth_l1_backward_cuda_kernel(TensorIterator& iter, Scalar norm) {
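For reference, `addcmul` computes `input + value * tensor1 * tensor2` elementwise and `addcdiv` computes `input + value * (tensor1 / tensor2)`, exactly the lambdas above; dropping `AT_SKIP_BFLOAT16_IF_NOT_ROCM` lets the dispatched kernel run for kBFloat16 on CUDA as well as ROCm. A quick equivalence check against the composed elementwise ops (illustrative only):

import torch

a, b, c = torch.rand(8), torch.rand(8), torch.rand(8) + 0.5
value = 0.3

# addcmul fuses a + value * b * c into one kernel launch
assert torch.allclose(torch.addcmul(a, b, c, value=value), a + value * b * c)
# addcdiv fuses a + value * (b / c)
assert torch.allclose(torch.addcdiv(a, b, c, value=value), a + value * (b / c))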
@@ -30,10 +30,8 @@ void std_var_kernel_impl<at::BFloat16>(TensorIterator& iter, bool unbiased, bool

 static void std_var_kernel_cuda(TensorIterator& iter, bool unbiased, bool take_sqrt) {
   AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "std_cuda", [&]() {
-    AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "std_cuda", [&] {
     std_var_kernel_impl<scalar_t>(iter, unbiased, take_sqrt);
-    });
   });
 }

 template <typename scalar_t, typename acc_t=scalar_t, typename out_t=scalar_t>
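`std_var_kernel_cuda` now dispatches one implementation for every floating type, Half and BFloat16 included; `take_sqrt` selects between `std` (the square root of the variance) and `var`, and `unbiased` toggles Bessel's correction. Those relationships are easy to check from Python (a sketch):

import torch

x = torch.rand(4, 5)
# std is the square root of var under the same bias setting
assert torch.allclose(x.std(unbiased=True), x.var(unbiased=True).sqrt())
# unbiased=False divides by n: the plain mean of squared deviations
assert torch.allclose(x.var(unbiased=False), ((x - x.mean()) ** 2).mean())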
@@ -49,14 +47,12 @@ static void mean_kernel_cuda(TensorIterator& iter) {
     // type promotion that does cast and reduction in a single kernel
     return mean_kernel_impl<at::Half, float, float>(iter);
   }
-#ifdef __HIP_PLATFORM_HCC__
   else if(iter.dtype() == kBFloat16) {
     return mean_kernel_impl<at::BFloat16, float>(iter);
   } else if (iter.dtype(1) == kBFloat16 && iter.dtype() == kFloat) {
     // type promotion that does cast and reduction in a single kernel
     return mean_kernel_impl<at::BFloat16, float, float>(iter);
   }
-#endif
   AT_DISPATCH_ALL_TYPES(iter.dtype(), "mean_cuda", [&]() {
     mean_kernel_impl<scalar_t>(iter);
   });
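The `mean_kernel_impl<at::BFloat16, float>` branch keeps bfloat16 inputs and outputs but accumulates in float; the `iter.dtype(1) == kBFloat16 && iter.dtype() == kFloat` branch handles a bfloat16 input with an explicitly requested float output, casting and reducing in a single kernel instead of materializing a float copy first. From the Python side that roughly corresponds to (illustrative):

import torch

x = torch.rand(1024, dtype=torch.bfloat16)
# bfloat16 in, bfloat16 out: the <at::BFloat16, float> branch (float accumulator)
m_bf16 = x.mean()
# bfloat16 in, float32 out requested via dtype=: the single-kernel promotion branch
m_f32 = x.mean(dtype=torch.float)
assert torch.allclose(m_bf16.float(), m_f32, atol=1e-2)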
@@ -19787,20 +19787,20 @@ tensor_op_tests = [
     ('mul', 'tensor', _small_3d, lambda t, d: [_small_3d(t, d)], 1e-2),
     ('mul', 'scalar', _small_0d, lambda t, d: [_small_0d(torch.int32, d)], 1e-2),
     ('div', '', _small_3d, lambda t, d: [_number(3.14, 3, t)], 1e-1,
-        1e-1, 1e-5, _float_types2),
+        1e-1, 1e-5, torch.testing.get_all_fp_dtypes()),
     ('div', 'tensor', _small_3d,
         lambda t, d: [_small_3d(t, d, has_zeros=False)], 1e-1,
-        1e-1, 1e-5, _float_types2),
+        1e-1, 1e-5, torch.testing.get_all_fp_dtypes()),
     ('true_divide', '', _small_3d, lambda t, d: [_number(3.14, 3, t)], 1e-1,
         1e-5, 1e-5, _types, _cpu_types, False),
     ('true_divide', 'with_inplace', _small_3d, lambda t, d: [_number(3.14, 3, t)], 1e-1,
-        1e-1, 1e-5, _float_types2),
+        1e-1, 1e-5, torch.testing.get_all_fp_dtypes()),
     ('true_divide', 'tensor', _small_3d,
         lambda t, d: [_small_3d(t, d, has_zeros=False)], 1e-1,
         1e-5, 1e-5, _types, _cpu_types, False),
     ('true_divide', 'tensor_with_inplace', _small_3d,
         lambda t, d: [_small_3d(t, d, has_zeros=False)], 1e-1,
-        1e-1, 1e-5, _float_types2),
+        1e-1, 1e-5, torch.testing.get_all_fp_dtypes()),
     ('floor_divide', '', _small_3d, lambda t, d: [_number(3.14, 3, t)], 1, 1e-5, 1e-5, _types),
     ('floor_divide', 'tensor', _small_3d,
         lambda t, d: [_small_3d(t, d, has_zeros=False)], 1, 1e-5, 1e-5, _types),
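These entries widen the dtype lists for `div` and the in-place `true_divide` variants from the `_float_types2` alias to everything `torch.testing.get_all_fp_dtypes()` returns, which includes `torch.bfloat16`; the three numbers before the dtype list appear to be per-dtype tolerances (half, bfloat16, float, judging by the surrounding entries). As a reminder of the two division flavors being tested (sketch):

import torch

x = torch.tensor([5., 7.])
y = torch.tensor([2., 2.])
# true_divide always performs true (floating-point) division ...
print(torch.true_divide(x, y))   # tensor([2.5000, 3.5000])
# ... while floor_divide discards the fractional part of the quotient
print(torch.floor_divide(x, y))  # tensor([2., 3.])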
@@ -19834,15 +19834,16 @@ tensor_op_tests = [
     ('addcdiv', '', _small_2d,
         lambda t, d: [_small_2d(t, d),
                       _small_2d(t, d, has_zeros=False)], 1, 1, 1e-3,
-        _float_types2, _cpu_types, True),
+        torch.testing.get_all_fp_dtypes(), _cpu_types, True),
     ('addcdiv', 'scalar', _small_2d,
         lambda t, d: [_number(2.8, 1, t), _small_2d(t, d),
                       _small_2d(t, d, has_zeros=False)], 1, 1e-5, 1e-3,
         _float_types, _cpu_types, True),
-    ('addcmul', '', _small_3d, lambda t, d: [_small_3d(t, d), _small_3d(t, d)], 1e-2, 1e-1, 1e-3, _types2),
+    ('addcmul', '', _small_3d, lambda t, d: [_small_3d(t, d), _small_3d(t, d)], 1e-2, 1e-1, 1e-3,
+        torch.testing.get_all_dtypes(include_complex=False, include_bool=False)),
     ('addcmul', 'scalar', _small_3d,
         lambda t, d: [_number(0.4, 2, t), _small_3d(t, d), _small_3d(t, d)], 1e-2,
-        1e-1, 1e-5, _types2, _cpu_types, True,
+        1e-1, 1e-5, torch.testing.get_all_dtypes(include_complex=False, include_bool=False), _cpu_types, True,
         [_wrap_maybe_warns("This overload of addcmul_? is deprecated")]),
     ('addmm', '', _medium_2d, lambda t, d: [_medium_2d(t, d), _medium_2d(t, d)],
         1e-1, 1e-1, 1e-4, _float_types2, _cpu_types, True, [tf32_on_and_off(0.005)], 0, True),
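The `_wrap_maybe_warns` decorator expects the deprecation warning for the legacy `addcmul` overload that took the scalar before the tensors; the supported spelling passes it as the `value` keyword. Roughly (sketch):

import torch

a, t1, t2 = torch.rand(3), torch.rand(3), torch.rand(3)
# Supported signature: the scalar multiplier goes in as the `value` keyword
out = torch.addcmul(a, t1, t2, value=0.4)
# The legacy positional-scalar overload, torch.addcmul(a, 0.4, t1, t2),
# is the deprecated form the test's warning regex refers to.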
@@ -19957,9 +19958,9 @@ tensor_op_tests = [
         1e-5, 1e-5, 1e-5, _types, _cpu_types, False),
     ('minimum', '', _medium_2d, lambda t, d: [_medium_2d(t, d)],
         1e-5, 1e-5, 1e-5, _types, _cpu_types, False),
-    ('mean', '', _small_3d, lambda t, d: [], 1e-3, 1e-2, 1e-5, _float_types2, _cpu_types, False),
-    ('mean', 'neg_dim', _small_3d, lambda t, d: [-1], 1e-3, 1e-2, 1e-5, _float_types2, _cpu_types, False),
-    ('mean', 'dim', _small_3d, lambda t, d: [1], 1e-3, 1e-2, 1e-2, _float_types2, _cpu_types, False),
+    ('mean', '', _small_3d, lambda t, d: [], 1e-3, 1e-2, 1e-5, torch.testing.get_all_fp_dtypes(), _cpu_types, False),
+    ('mean', 'neg_dim', _small_3d, lambda t, d: [-1], 1e-3, 1e-2, 1e-5, torch.testing.get_all_fp_dtypes(), _cpu_types, False),
+    ('mean', 'dim', _small_3d, lambda t, d: [1], 1e-3, 1e-2, 1e-2, torch.testing.get_all_fp_dtypes(), _cpu_types, False),
     # Double here because the CPU result will be wrong otherwise
     ('mean', '64bit_indexing', _giant_1d, lambda t, d: [],
         1e-3, 1e-5, 1e-5, [torch.double], _cpu_types, False, [slowTest]),
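With `torch.testing.get_all_fp_dtypes()` the `mean` tests now also run under `torch.bfloat16`, where the looser `1e-2` tolerance matters: bfloat16 keeps only 8 mantissa bits. A sketch of the kind of comparison the harness performs:

import torch

x = torch.rand(5, 5, 5)
ref = x.mean(dim=1)                     # float32 reference
got = x.bfloat16().mean(dim=1).float()  # bfloat16 result, as in the 'dim' entry
assert torch.allclose(got, ref, atol=1e-2, rtol=1e-2)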
@@ -19983,7 +19984,7 @@ tensor_op_tests = [
     ('std', 'neg_dim', _small_3d, lambda t, d: [-1], 1e-3, 1e-5, 1e-5, _float_types, _cpu_types, False),
     ('var', '', _small_3d, lambda t, d: [], 1e-3, 1e-5, 1e-5, _float_types, _cpu_types, False),
     ('var', 'dim', _small_3d, lambda t, d: [1], 1e-3, 1e-5, 1e-5, _float_types, _cpu_types, False),
-    ('var', 'neg_dim', _small_3d, lambda t, d: [-1], 1e-3, 1e-2, 1e-5, _float_types2, _cpu_types, False),
+    ('var', 'neg_dim', _small_3d, lambda t, d: [-1], 1e-3, 1e-2, 1e-5, torch.testing.get_all_fp_dtypes(), _cpu_types, False),
     ('ndimension', '', _small_3d, lambda t, d: [], 1e-5, 1e-5, 1e-5, _types, _cpu_types, False),
     ('nelement', '', _small_3d, lambda t, d: [], 1e-5, 1e-5, 1e-5, _types, _cpu_types, False),
     ('numel', '', _small_3d, lambda t, d: [], 1e-5, 1e-5, 1e-5, _types, _cpu_types, False),
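The `neg_dim` variants exercise negative dimension indexing, which should agree with the equivalent positive index; the widened dtype list runs this for bfloat16 too. For instance (sketch):

import torch

x = torch.rand(4, 5, 6)
# dim=-1 and dim=2 address the same trailing axis, so the results match exactly
assert torch.equal(x.var(dim=-1), x.var(dim=2))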