Mirror of https://github.com/zebrajr/pytorch.git, synced 2025-12-06 12:20:52 +01:00
Revert "torch._scaled_mm with MXFP8 (#147548)"
This reverts commit 12b9674cb6.
Reverted https://github.com/pytorch/pytorch/pull/147548 on behalf of https://github.com/wdvr due to failing internal build - similar to previous, see below ([comment](https://github.com/pytorch/pytorch/pull/147548#issuecomment-2684134336))
This commit is contained in:
parent 4216478250
commit a84db75e1b
@@ -14,7 +14,6 @@
 #include <c10/macros/Export.h>
 #include <c10/util/env.h>
 #include <c10/util/irange.h>
-#include <c10/core/ScalarType.h>
 
 #ifdef USE_ROCM
 #include <hipblaslt/hipblaslt-ext.hpp>
@@ -1504,12 +1503,10 @@ void scaled_gemm(
 const void* mat1_scale_ptr,
 int64_t mat1_ld,
 ScalarType mat1_dtype,
-ScalarType mat1_scale_dtype,
 const void* mat2_ptr,
 const void* mat2_scale_ptr,
 int64_t mat2_ld,
 ScalarType mat2_dtype,
-ScalarType mat2_scale_dtype,
 const void* bias_ptr,
 ScalarType bias_dtype,
 void* result_ptr,
@@ -1537,8 +1534,10 @@ void scaled_gemm(
 // rowwise isn't supported using cublaslt or older hipblaslt
 TORCH_INTERNAL_ASSERT(use_rowwise == false, "rowwise scaled_gemm not supported with blaslt");
 #endif
-computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_A_SCALE_POINTER, mat1_scale_ptr);
-computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_B_SCALE_POINTER, mat2_scale_ptr);
+{
+computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_A_SCALE_POINTER, mat1_scale_ptr);
+computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_B_SCALE_POINTER, mat2_scale_ptr);
+}
 if (result_scale_ptr != nullptr) {
 computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_D_SCALE_POINTER, result_scale_ptr);
 }
@@ -1561,15 +1560,6 @@ void scaled_gemm(
 computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_BIAS_DATA_TYPE, ScalarTypeToCudaDataType(bias_dtype));
 }
 
-if (mat1_scale_dtype == kFloat8_e8m0fnu && mat2_scale_dtype == kFloat8_e8m0fnu) {
-#if CUDA_VERSION >= 12080
-computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_A_SCALE_MODE, CUBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0);
-computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_B_SCALE_MODE, CUBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0);
-#else
-TORCH_CHECK(false, "scaled_gemm with `torch.float8_e8m0fnu` scales is only supported for CUDA 12.8 and above");
-#endif // CUDA_VERSION >= 12080
-}
-
 auto stream = c10::cuda::getCurrentCUDAStream();
 size_t workspaceSize = 0;
 auto workspace_ptr = _getWorkspace(workspaceSize);
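Note: the reverted block above only enables the UE8M0 block-scale modes when PyTorch is built against CUDA 12.8 or newer. A minimal Python-side sketch of an equivalent runtime guard (hypothetical helper, not part of this PR):

    import torch

    # torch.version.cuda is a string like "12.8" on CUDA builds, or None on CPU-only/ROCm builds
    cuda_ver = torch.version.cuda
    supports_e8m0_scales = (
        cuda_ver is not None
        and tuple(int(v) for v in cuda_ver.split(".")[:2]) >= (12, 8)
    )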
@@ -130,12 +130,10 @@ void scaled_gemm(
 const void* mat1_scale_ptr,
 int64_t mat1_ld,
 ScalarType mat1_dtype,
-ScalarType mat1_scale_dtype,
 const void* mat2_ptr,
 const void* mat2_scale_ptr,
 int64_t mat2_ld,
 ScalarType mat2_dtype,
-ScalarType mat2_scale_dtype,
 const void* bias_ptr,
 ScalarType bias_dtype,
 void* result_ptr,
@@ -10,7 +10,6 @@
 #pragma once
 
 #include <string>
-#include <c10/core/ScalarType.h>
 
 #include <ATen/cuda/tunable/TunableOp.h>
 #include <ATen/cuda/CUDABlas.h>
@@ -425,12 +424,10 @@ struct ScaledGemmParams : OpParams {
 const void* a_scale_ptr{};
 int64_t lda{};
 ScalarType a_dtype{};
-ScalarType a_scale_dtype{};
 const void* b{};
 const void* b_scale_ptr{};
 int64_t ldb{};
 ScalarType b_dtype{};
-ScalarType b_scale_dtype{};
 const void* bias_ptr{};
 ScalarType bias_dtype{};
 void* c{};
@@ -95,12 +95,10 @@ class DefaultScaledGemmOp : public Callable<ScaledGemmParams<T>> {
 params->a_scale_ptr,
 params->lda,
 params->a_dtype,
-params->a_scale_dtype,
 params->b,
 params->b_scale_ptr,
 params->ldb,
 params->b_dtype,
-params->b_scale_dtype,
 params->bias_ptr,
 params->bias_dtype,
 params->c,
@@ -1,5 +1,4 @@
 #include <cstdint>
-#include <c10/util/typeid.h>
 #include <c10/util/Exception.h>
 #include <c10/core/Scalar.h>
 #include <c10/core/ScalarType.h>
@@ -96,33 +95,11 @@ c10::MaybeOwned<Tensor> inline prepare_matrix_for_cublas(const Tensor& tensor, b
 }
 
 struct cublasCommonArgs {
-cublasCommonArgs(
-const Tensor& mat1,
-const Tensor& mat2,
-Tensor& c,
-const std::optional<Tensor>& scale_a = c10::nullopt,
-const std::optional<Tensor>& scale_b = c10::nullopt,
-const std::optional<Tensor>& scale_result = c10::nullopt) {
+cublasCommonArgs(const Tensor& mat1, const Tensor& mat2, Tensor& c) {
 bool transpose_result = false, transpose_mat1 = false, transpose_mat2 = false;
 result = prepare_matrix_for_cublas(c, transpose_result);
 mata = prepare_matrix_for_cublas(transpose_result ? mat2 : mat1, transpose_mat1, transpose_result);
 matb = prepare_matrix_for_cublas(transpose_result ? mat1 : mat2, transpose_mat2, transpose_result);
-
-// Handle scale tensors if provided
-if (scale_a && scale_b) {
-// By default since we return in row-major we run the gemm
-// as B.T @ A.T, check transpose_result to determine if we flip the scales
-scale_mata_ptr = transpose_result ? scale_b->data_ptr() : scale_a->data_ptr();
-scale_mata_dtype = transpose_result ? scale_b->scalar_type() : scale_a->scalar_type();
-scale_matb_ptr = transpose_result ? scale_a->data_ptr() : scale_b->data_ptr();
-scale_matb_dtype = transpose_result ? scale_a->scalar_type() : scale_b->scalar_type();
-}
-
-if (scale_result) {
-scale_result_ptr = scale_result->data_ptr();
-scale_result_dtype = scale_result->scalar_type();
-}
-
 auto mat1_sizes = mat1.sizes();
 auto mat2_sizes = mat2.sizes();
 if (transpose_result) {
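Note: the removed scale arguments above exist because the result is returned row-major by running the underlying gemm as B.T @ A.T, so the per-operand scales swap roles when the result is transposed (see the removed comment). A minimal sketch of that identity with plain float tensors, not the fp8 path:

    import torch

    A, B = torch.randn(4, 8), torch.randn(8, 3)
    sa, sb = 2.0, 0.5                      # per-operand scales
    C = (A * sa) @ (B * sb)                # the scaled product we want
    # in the transposed product B becomes the first operand, so it carries B's scale
    C_t = (B.t() * sb) @ (A.t() * sa)
    assert torch.allclose(C, C_t.t())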
@@ -138,23 +115,13 @@ struct cublasCommonArgs {
 lda = mata->stride((transpose_mat1 == transpose_result) ? 1 : 0);
 ldb = matb->stride((transpose_mat2 == transpose_result) ? 1 : 0);
 result_ld = result->stride(transpose_result ? 0 : 1);
 transa = transpose_mat1 ? mata->is_conj() ? 'c' : 't' : 'n';
 transb = transpose_mat2 ? matb->is_conj() ? 'c' : 't' : 'n';
 }
-
-// Matrix members
 char transa, transb;
 int64_t m, n, k;
 int64_t lda, ldb, result_ld;
 c10::MaybeOwned<Tensor> mata, matb, result;
-
-// Scale members
-void* scale_mata_ptr = nullptr;
-void* scale_matb_ptr = nullptr;
-void* scale_result_ptr = nullptr;
-std::optional<c10::ScalarType> scale_mata_dtype;
-std::optional<c10::ScalarType> scale_matb_dtype;
-std::optional<c10::ScalarType> scale_result_dtype;
 };
 } // namespace
 
@@ -936,10 +903,9 @@ static bool _scaled_mm_is_fnuz() {
 
 namespace{
 
-enum class ScalingType : std::uint8_t {
+enum class ScalingType {
 TensorWise,
 RowWise,
-BlockWise,
 Error
 };
 /*
@@ -947,13 +913,10 @@ enum class ScalingType : std::uint8_t {
 * ---------------------------
 * Conditions and corresponding Scaling Types:
 *
-* - If scale tensors are Float8_e8m0fnu:
-* - Returns BlockWise (with additional size checks).
-*
 * - If scale_a.numel() == 1 && scale_b.numel() == 1:
 * - Returns TensorWise.
 *
-* - Else if scale_a.dim() == 2 && scale_a.size(0) == dim_m && scale_b.size(0) == dim_n:
+* - Else if scale_a.dim() == 1 && scale_a.size(0) == dim_m && scale_b.size(0) == dim_n:
 * - Returns RowWise.
 *
 * - Otherwise:
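Note: the comment block above describes how the scale tensor shapes select the scaling path. A minimal tensor-wise sketch of the public entry point (assumes an fp8-capable CUDA device, e.g. SM 8.9+; shapes chosen as multiples of 16):

    import torch

    M, K, N = 32, 64, 16
    A = torch.randn(M, K, device="cuda").to(torch.float8_e4m3fn)        # row-major operand
    B = torch.randn(N, K, device="cuda").to(torch.float8_e4m3fn).t()    # column-major operand
    scale_a = torch.tensor(1.0, device="cuda")   # numel() == 1 -> TensorWise
    scale_b = torch.tensor(1.0, device="cuda")
    out = torch._scaled_mm(A, B, scale_a=scale_a, scale_b=scale_b, out_dtype=torch.bfloat16)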
@@ -966,40 +929,7 @@ ScalingType get_scaling_type(
 const at::Tensor& scale_a,
 const at::Tensor& scale_b,
 int64_t dim_m,
-int64_t dim_k,
 int64_t dim_n) {
-// Check for BlockWise scaling (FP8_E8M0 types)
-if (scale_a.scalar_type() == scale_b.scalar_type() &&
-scale_a.scalar_type() == at::kFloat8_e8m0fnu) {
-constexpr int64_t BLOCK_SIZE_K = 32;
-constexpr int64_t BLOCK_SIZE_MN = 128;
-
-auto ceil_div = [](auto a, auto b) { return (a + b - 1) / b; };
-auto num_k_blocks = ceil_div(dim_k, BLOCK_SIZE_K);
-auto padded_num_k_blocks = ceil_div(num_k_blocks, 4) * 4;
-
-// TODO: We might want to enforce some structure on the shapes of the scale
-// tensors
-
-// Check expected sizes for block-wise scaling
-auto expected_a_size =
-BLOCK_SIZE_MN * ceil_div(dim_m, BLOCK_SIZE_MN) * padded_num_k_blocks;
-auto expected_b_size =
-BLOCK_SIZE_MN * ceil_div(dim_n, BLOCK_SIZE_MN) * padded_num_k_blocks;
-
-TORCH_CHECK(scale_a.numel() == expected_a_size,
-"For BlockWise scaling: Expected scale_a size to be ",
-expected_a_size, " but got ", scale_a.numel());
-TORCH_CHECK(scale_b.numel() == expected_b_size,
-"For BlockWise scaling: Expected scale_b size to be ",
-expected_b_size, " but got ", scale_b.numel());
-
-TORCH_CHECK(
-scale_a.is_contiguous() && scale_b.is_contiguous(),
-"For BlockWise scaling: Both scale_a and scale_b must be contiguous");
-
-return ScalingType::BlockWise;
-}
 // Both Per-Tensor and Row-wise scaling expect fp32 tensors
 TORCH_CHECK(
 scale_a.scalar_type() == kFloat && scale_b.scalar_type() == kFloat,
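Note: a worked instance of the removed expected-size formula (pure arithmetic; same M, K, N as the removed error-message test further down this diff):

    def ceil_div(a, b):
        return (a + b - 1) // b

    M, K, N = 1024, 512, 2048
    BLOCK_SIZE_K, BLOCK_SIZE_MN = 32, 128
    num_k_blocks = ceil_div(K, BLOCK_SIZE_K)               # 16
    padded_num_k_blocks = ceil_div(num_k_blocks, 4) * 4    # 16
    expected_a_size = BLOCK_SIZE_MN * ceil_div(M, BLOCK_SIZE_MN) * padded_num_k_blocks  # 16384
    expected_b_size = BLOCK_SIZE_MN * ceil_div(N, BLOCK_SIZE_MN) * padded_num_k_blocks  # 32768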
@@ -1097,7 +1027,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
 mat1.sizes()[0], "x", mat1.sizes()[1], " and ", mat2.sizes()[0], "x", mat2.sizes()[1], ")");
 
 // Check what type of scaling we are doing based on inputs
-ScalingType scaling_choice = get_scaling_type(scale_a, scale_b, mat1.size(0), mat1.size(1), mat2.size(1));
+ScalingType scaling_choice = get_scaling_type(scale_a, scale_b, mat1.size(0), mat2.size(1));
 TORCH_INTERNAL_ASSERT(scaling_choice != ScalingType::Error, "Scaling type not supported");
 
 TORCH_CHECK(!scale_result || (scale_result->numel() == 1 && scale_result->scalar_type() == kFloat),
@@ -1190,7 +1120,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
 }
 #endif
 
-cublasCommonArgs args(mat1, mat2, out, scale_a, scale_b, scale_result);
+cublasCommonArgs args(mat1, mat2, out);
 const auto out_dtype_ = args.result->scalar_type();
 TORCH_CHECK(args.transa == 't' && args.transb == 'n', "Only multiplication of row-major and column-major matrices is supported by cuBLASLt");
 
@@ -1300,7 +1230,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
 }
 else
 #endif
 {
 at::cuda::blas::scaled_gemm(
 args.transa,
 args.transb,
@@ -1308,19 +1238,17 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
 args.n,
 args.k,
 args.mata->data_ptr(),
-args.scale_mata_ptr,
+scale_a.data_ptr(),
 args.lda,
 args.mata->scalar_type(),
-args.scale_mata_dtype.value(),
 args.matb->data_ptr(),
-args.scale_matb_ptr,
+scale_b.data_ptr(),
 args.ldb,
 args.matb->scalar_type(),
-args.scale_matb_dtype.value(),
 bias ? bias->data_ptr(): nullptr,
 bias ? bias->scalar_type() : isFloat8Type(out_dtype_) ? at::ScalarType::Half : out_dtype_,
 args.result->data_ptr(),
-args.scale_result_ptr,
+scale_result ? scale_result->data_ptr() : nullptr,
 args.result_ld,
 out_dtype_,
 use_fast_accum,
@@ -19,8 +19,7 @@ from torch.testing._internal.common_cuda import (
 SM53OrLater,
 SM89OrLater,
 _get_torch_cuda_version,
-PLATFORM_SUPPORTS_FP8,
-PLATFORM_SUPPORTS_MX_GEMM
+PLATFORM_SUPPORTS_FP8
 )
 from torch.testing._internal.common_device_type import (
 dtypes,
@@ -251,7 +250,6 @@ class TestMatmulCuda(TestCase):
 
 
 f8_msg = "FP8 is only supported on H100+, SM 8.9 and MI300+ devices"
-mx_skip_msg = "MX gemm is only supported on CUDA capability 10.0+"
 
 if torch.version.hip and 'gfx94' in torch.cuda.get_device_properties(0).gcnArchName:
 e4m3_type = torch.float8_e4m3fnuz
@@ -368,79 +366,6 @@ def to_fp8_saturated(
 
 return x.to(fp8_dtype)
 
-# copied from https://github.com/drisspg/transformer_nuggets/blob/main/transformer_nuggets/mx/to_blocked.py
-def ceil_div(a, b):
-return (a + b - 1) // b
-
-def to_blocked(input_matrix) -> torch.Tensor:
-"""
-Rearrange a large matrix by breaking it into blocks and applying the rearrangement pattern.
-
-See:
-https://docs.nvidia.com/cuda/cublas/index.html#d-block-scaling-factors-layout
-
-Args:
-input_matrix: Input tensor of shape (H, W)
-
-Returns:
-Rearranged tensor of shape (32*ceil_div(H,128), 16*ceil_div(W,4))
-"""
-rows, cols = input_matrix.shape
-n_row_blocks = ceil_div(rows, 128)
-n_col_blocks = ceil_div(cols, 4)
-
-# Calculate the padded shape
-padded_rows = n_row_blocks * 128
-padded_cols = n_col_blocks * 4
-
-padded = input_matrix
-# Ideally we would use torch.nn.pad but it doesn't support float8_e8m0fnu for now
-if (rows, cols) != (padded_rows, padded_cols):
-padded = torch.zeros((padded_rows, padded_cols), device=input_matrix.device, dtype=input_matrix.dtype)
-padded[:rows, :cols] = input_matrix
-
-# Rearrange the blocks
-blocks = padded.view(n_row_blocks, 128, n_col_blocks, 4).permute(0, 2, 1, 3)
-rearranged = blocks.reshape(-1, 4, 32, 4).transpose(1, 2).reshape(-1, 32, 16)
-
-return rearranged.flatten()
-
-def compute_error(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
-"""Computes the error between two tensors in dB.
-
-For more details see:
-https://en.wikipedia.org/wiki/Signal-to-noise_ratio
-
-Args:
-x: The original tensor.
-y: The tensor to compare to the original tensor.
-"""
-Ps = torch.norm(x)
-Pn = torch.norm(x - y)
-return 20 * torch.log10(Ps / Pn)
-
-# largest power of 2 representable in `torch.float8_e4m3fn`
-F8E4M3_LARGEST_POW2 = 8
-# max value of `torch.float8_e4m3fn` (448)
-F8E4M3_MAX_VAL = torch.finfo(torch.float8_e4m3fn).max
-# exponent bias of `torch.float8_e8m0fnu`
-F8E8M0_EXP_BIAS = 127
-
-def data_to_mx_scale(x, block_size):
-# simple implementation of https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
-# section 6.3, not all edge cases (such as NaN) are handled/tested
-orig_shape = x.shape
-x = x.reshape(-1, block_size)
-max_abs = torch.amax(torch.abs(x), 1)
-largest_p2_lt_max_abs = torch.floor(torch.log2(max_abs))
-scale_e8m0_unbiased = largest_p2_lt_max_abs - F8E4M3_LARGEST_POW2
-scale_e8m0_unbiased = torch.clamp(scale_e8m0_unbiased, -1 * F8E8M0_EXP_BIAS, F8E8M0_EXP_BIAS)
-scale_e8m0_biased = scale_e8m0_unbiased + F8E8M0_EXP_BIAS
-scale_e8m0_biased = scale_e8m0_biased.to(torch.uint8)
-scale_e8m0_biased = scale_e8m0_biased.view(torch.float8_e8m0fnu)
-return scale_e8m0_biased.reshape(orig_shape[0], -1)
-
-
 @unittest.skipIf(not torch.cuda.is_available(), "CUDA not found")
 class TestFP8MatmulCuda(TestCase):
 
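Note: the removed compute_error helper above scores results as a signal-to-noise ratio in dB, and the removed numerics test accepts anything above 22 dB. A minimal sketch of the same metric on an fp8 round-trip (assumes CPU casts to float8_e4m3fn are available in this torch build):

    import torch

    x = torch.randn(256, 256)
    y = x.to(torch.float8_e4m3fn).to(torch.float32)   # lossy fp8 round-trip as the noisy tensor
    sqnr_db = 20 * torch.log10(torch.norm(x) / torch.norm(x - y))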
@@ -843,287 +768,6 @@ class TestFP8MatmulCuda(TestCase):
 self.assertEqual(out_dtype, out_fp8.dtype)
 self.assertEqual(out_fp32, out_fp8.to(torch.float))
 
-@unittest.skipIf(not PLATFORM_SUPPORTS_MX_GEMM, mx_skip_msg)
-@parametrize("test_case_name", [
-"a_eye_b_eye",
-"a_ones_b_ones",
-"a_ones_modified_b_ones",
-"a_ones_b_ones_modified",
-"a_scale_modified_b_ones",
-"a_ones_b_scale_modified",
-"data_random_scales_one",
-"data_random_scales_from_data",
-])
-@parametrize("fast_accum", [False, True])
-@parametrize("mkn", [
-# Nice shapes
-(128, 128, 128),
-(256, 256, 256),
-(128, 256, 512),
-(256, 512, 128),
-(512, 128, 256),
-
-# Non block multiples
-(65, 96, 112),
-(197, 224, 272),
-# K not multiple of 32
-(197, 240, 272),
-
-# Very unbalanced
-(1023, 64, 48),
-(31, 1024, 64),
-(45, 96, 1024),
-
-# Mixed large and small
-(2, 1024, 128),
-(127, 96, 1024),
-(1025, 128, 96)
-], name_fn=lambda mkn: f"{mkn[0]}_{mkn[1]}_{mkn[2]}")
-def test_blockwise_mxfp8_numerics(self, test_case_name, fast_accum, mkn) -> None:
-# inspiration: https://github.com/pytorch/ao/pull/1625
-
-device = "cuda"
-M, K, N = mkn
-BLOCK_SIZE = 32
-require_exact_match = True
-
-def ceil_div(a, b):
-return (a + b - 1) // b
-
-if test_case_name == "a_eye_b_eye":
-if not ((M == K) and (M == N)):
-return unittest.skip("this test is only defined for M == K == N, skipping")
-A_ref = torch.eye(M, device=device, dtype=torch.bfloat16)
-B_ref = torch.eye(M, device=device, dtype=torch.bfloat16)
-
-A = A_ref.to(torch.float8_e4m3fn)
-B = B_ref.to(torch.float8_e4m3fn)
-
-A_scale = torch.full((M, ceil_div(K, BLOCK_SIZE)), 1.0, device=device, dtype=torch.float8_e8m0fnu)
-B_scale = torch.full((N, ceil_div(K, BLOCK_SIZE)), 1.0, device=device, dtype=torch.float8_e8m0fnu)
-# convert to swizzled format
-A_scale = to_blocked(A_scale)
-B_scale = to_blocked(B_scale)
-
-elif test_case_name == "a_ones_b_ones":
-A_ref = torch.ones(M, K, device=device, dtype=torch.bfloat16)
-B_ref = torch.ones(N, K, device=device, dtype=torch.bfloat16)
-
-A = A_ref.to(torch.float8_e4m3fn)
-B = B_ref.to(torch.float8_e4m3fn)
-
-A_scale = torch.full((M, ceil_div(K, BLOCK_SIZE)), 1.0, device=device, dtype=torch.float8_e8m0fnu)
-B_scale = torch.full((N, ceil_div(K, BLOCK_SIZE)), 1.0, device=device, dtype=torch.float8_e8m0fnu)
-# convert to swizzled format
-A_scale = to_blocked(A_scale)
-B_scale = to_blocked(B_scale)
-
-elif test_case_name == "a_ones_modified_b_ones":
-A_ref = torch.ones(M, K, device=device, dtype=torch.bfloat16)
-B_ref = torch.ones(N, K, device=device, dtype=torch.bfloat16)
-
-A = A_ref.to(torch.float8_e4m3fn)
-B = B_ref.to(torch.float8_e4m3fn)
-
-A_ref[1][0:BLOCK_SIZE] = 2
-A[1][0:BLOCK_SIZE] = 2
-
-A_scale = torch.full((M, ceil_div(K, BLOCK_SIZE)), 1.0, device=device, dtype=torch.float8_e8m0fnu)
-B_scale = torch.full((N, ceil_div(K, BLOCK_SIZE)), 1.0, device=device, dtype=torch.float8_e8m0fnu)
-# convert to swizzled format
-A_scale = to_blocked(A_scale)
-B_scale = to_blocked(B_scale)
-
-elif test_case_name == "a_ones_b_ones_modified":
-A_ref = torch.ones(M, K, device=device, dtype=torch.bfloat16)
-B_ref = torch.ones(N, K, device=device, dtype=torch.bfloat16)
-
-A = A_ref.to(torch.float8_e4m3fn)
-B = B_ref.to(torch.float8_e4m3fn)
-
-B_ref[1][0:BLOCK_SIZE] = 2
-B[1][0:BLOCK_SIZE] = 2
-
-A_scale = torch.full((M, ceil_div(K, BLOCK_SIZE)), 1.0, device=device, dtype=torch.float8_e8m0fnu)
-B_scale = torch.full((N, ceil_div(K, BLOCK_SIZE)), 1.0, device=device, dtype=torch.float8_e8m0fnu)
-# convert to swizzled format
-A_scale = to_blocked(A_scale)
-B_scale = to_blocked(B_scale)
-
-elif test_case_name == "a_scale_modified_b_ones":
-A_ref = torch.ones(M, K, device=device, dtype=torch.bfloat16)
-B_ref = torch.ones(N, K, device=device, dtype=torch.bfloat16)
-
-A = A_ref.to(torch.float8_e4m3fn)
-B = B_ref.to(torch.float8_e4m3fn)
-
-A_scale = torch.full((M, ceil_div(K, BLOCK_SIZE)), 1.0, device=device, dtype=torch.float8_e8m0fnu)
-B_scale = torch.full((N, ceil_div(K, BLOCK_SIZE)), 1.0, device=device, dtype=torch.float8_e8m0fnu)
-
-A_ref[1][0:BLOCK_SIZE] = 4
-A[1][0:BLOCK_SIZE] = 2
-A_scale[1][0] = 2
-
-# convert to swizzled format
-A_scale = to_blocked(A_scale)
-B_scale = to_blocked(B_scale)
-
-elif test_case_name == "a_ones_b_scale_modified":
-A_ref = torch.ones(M, K, device=device, dtype=torch.bfloat16)
-B_ref = torch.ones(N, K, device=device, dtype=torch.bfloat16)
-
-A = A_ref.to(torch.float8_e4m3fn)
-B = B_ref.to(torch.float8_e4m3fn)
-
-A_scale = torch.full((M, ceil_div(K, BLOCK_SIZE)), 1.0, device=device, dtype=torch.float8_e8m0fnu)
-B_scale = torch.full((N, ceil_div(K, BLOCK_SIZE)), 1.0, device=device, dtype=torch.float8_e8m0fnu)
-
-B_ref[1][0:BLOCK_SIZE] = 4
-B[1][0:BLOCK_SIZE] = 2
-B_scale[1][0] = 2
-
-# convert to swizzled format
-A_scale = to_blocked(A_scale)
-B_scale = to_blocked(B_scale)
-
-elif test_case_name == "data_random_scales_one":
-require_exact_match = False
-# scales all-ones, element data random while being exactly representable in float8_e4m3fn
-
-# generate integers in [0, 255] and interpret as float8_e4m3fn
-A_ref = torch.randint(0, 255, (M, K), device=device, dtype=torch.uint8).view(torch.float8_e4m3fn).to(torch.bfloat16)
-B_ref = torch.randint(0, 255, (N, K), device=device, dtype=torch.uint8).view(torch.float8_e4m3fn).to(torch.bfloat16)
-# modification: don't allow NaN values
-A_ref[torch.isnan(A_ref)] = 0
-B_ref[torch.isnan(B_ref)] = 0
-
-A = A_ref.to(torch.float8_e4m3fn)
-B = B_ref.to(torch.float8_e4m3fn)
-
-A_scale = torch.full((M, ceil_div(K, BLOCK_SIZE)), 1.0, device=device, dtype=torch.float8_e8m0fnu)
-B_scale = torch.full((N, ceil_div(K, BLOCK_SIZE)), 1.0, device=device, dtype=torch.float8_e8m0fnu)
-
-# convert to swizzled format
-A_scale = to_blocked(A_scale)
-B_scale = to_blocked(B_scale)
-
-elif test_case_name == "data_random_scales_from_data":
-if not K % BLOCK_SIZE == 0:
-return unittest.skip(f"this test is only defined for K a multiple of {BLOCK_SIZE}, skipping")
-require_exact_match = False
-# random data, scales from data
-A_ref = torch.randn((M, K), device=device, dtype=torch.bfloat16) * 1000
-B_ref = torch.randn((N, K), device=device, dtype=torch.bfloat16) * 1000
-
-# Calculate scales based on the inputs
-A_scale = data_to_mx_scale(A_ref, BLOCK_SIZE)
-B_scale = data_to_mx_scale(B_ref, BLOCK_SIZE)
-
-max_val = F8E4M3_MAX_VAL
-min_val = -1 * max_val
-
-A = (A_ref.reshape(-1, BLOCK_SIZE) / A_scale.reshape(M * ceil_div(K, BLOCK_SIZE), 1).float()).reshape(M, K)
-A = A.clamp(min=min_val, max=max_val).to(torch.float8_e4m3fn)
-B = (B_ref.reshape(-1, BLOCK_SIZE) / B_scale.reshape(N * ceil_div(K, BLOCK_SIZE), 1).float()).reshape(N, K)
-B = B.clamp(min=min_val, max=max_val).to(torch.float8_e4m3fn)
-
-# convert to swizzled format
-A_scale = to_blocked(A_scale)
-B_scale = to_blocked(B_scale)
-
-C_ref = A_ref @ B_ref.t()
-
-C = torch._scaled_mm(
-A,
-B.t(),
-A_scale,
-B_scale,
-out_dtype=torch.bfloat16,
-use_fast_accum=fast_accum,
-)
-
-if require_exact_match:
-torch.testing.assert_close(C, C_ref, atol=0, rtol=0)
-else:
-sqnr = compute_error(C_ref, C)
-assert sqnr.item() > 22.0
-
-@unittest.skipIf(not PLATFORM_SUPPORTS_FP8 or IS_WINDOWS, f8_msg)
-@skipIfRocm()
-def test_blockwise_mxfloat8_error_messages(self, device) -> None:
-M, K, N = (1024, 512, 2048)
-BLOCK_SIZE_K = 32
-BLOCK_SIZE_MN = 128
-fill_value = 0.5
-
-x = torch.full((M, K), fill_value, device=device)
-y = torch.full((N, K), fill_value, device=device)
-
-x_fp8 = x.to(e4m3_type)
-y_fp8 = y.to(e4m3_type).t()
-
-def ceil_div(a, b):
-return (a + b - 1) // b
-
-num_k_blocks = ceil_div(K, BLOCK_SIZE_K)
-padded_num_k_blocks = ceil_div(num_k_blocks, 4) * 4
-expected_a_size = BLOCK_SIZE_MN * ceil_div(M, BLOCK_SIZE_MN) * padded_num_k_blocks
-expected_b_size = BLOCK_SIZE_MN * ceil_div(N, BLOCK_SIZE_MN) * padded_num_k_blocks
-
-
-# Test wrong scale tensor size for scale_a with correct dtype
-with self.assertRaisesRegex(
-RuntimeError,
-re.escape(
-f"For BlockWise scaling: Expected scale_a size to be {expected_a_size} "
-f"but got {expected_a_size - 1}"
-),
-):
-incorrect_size_a = torch.ones(expected_a_size - 1, device=device, dtype=torch.float8_e8m0fnu)
-correct_size_b = torch.ones(expected_b_size, device=device, dtype=torch.float8_e8m0fnu)
-torch._scaled_mm(
-x_fp8,
-y_fp8,
-scale_a=incorrect_size_a,
-scale_b=correct_size_b,
-out_dtype=torch.bfloat16,
-)
-
-# Test wrong scale tensor size for scale_b with correct dtype
-with self.assertRaisesRegex(
-RuntimeError,
-re.escape(
-f"For BlockWise scaling: Expected scale_b size to be {expected_b_size} "
-f"but got {expected_b_size + 1}"
-),
-):
-correct_size_a = torch.ones(expected_a_size, device=device, dtype=torch.float8_e8m0fnu)
-incorrect_size_b = torch.ones(expected_b_size + 1, device=device, dtype=torch.float8_e8m0fnu)
-torch._scaled_mm(
-x_fp8,
-y_fp8,
-scale_a=correct_size_a,
-scale_b=incorrect_size_b,
-out_dtype=torch.bfloat16,
-)
-
-# Test non-contiguous scale tensors with correct dtype
-with self.assertRaisesRegex(
-RuntimeError,
-re.escape(
-"For BlockWise scaling: Both scale_a and scale_b must be contiguous"
-),
-):
-non_contiguous_a = torch.ones(expected_a_size * 2, device=device, dtype=torch.float8_e8m0fnu)[::2]
-contiguous_b = torch.ones(expected_b_size, device=device, dtype=torch.float8_e8m0fnu)
-torch._scaled_mm(
-x_fp8,
-y_fp8,
-scale_a=non_contiguous_a,
-scale_b=contiguous_b,
-out_dtype=torch.bfloat16,
-)
-
 
 @unittest.skipIf(TEST_WITH_ROCM, "ROCm doesn't support CUTLASS")
 @unittest.skipIf(IS_WINDOWS, "Windows doesn't support CUTLASS extensions")
||||||
|
|
@ -32,7 +32,6 @@ SM75OrLater = LazyVal(lambda: torch.cuda.is_available() and torch.cuda.get_devic
|
||||||
SM80OrLater = LazyVal(lambda: torch.cuda.is_available() and torch.cuda.get_device_capability() >= (8, 0))
|
SM80OrLater = LazyVal(lambda: torch.cuda.is_available() and torch.cuda.get_device_capability() >= (8, 0))
|
||||||
SM89OrLater = LazyVal(lambda: torch.cuda.is_available() and torch.cuda.get_device_capability() >= (8, 9))
|
SM89OrLater = LazyVal(lambda: torch.cuda.is_available() and torch.cuda.get_device_capability() >= (8, 9))
|
||||||
SM90OrLater = LazyVal(lambda: torch.cuda.is_available() and torch.cuda.get_device_capability() >= (9, 0))
|
SM90OrLater = LazyVal(lambda: torch.cuda.is_available() and torch.cuda.get_device_capability() >= (9, 0))
|
||||||
SM100OrLater = LazyVal(lambda: torch.cuda.is_available() and torch.cuda.get_device_capability() >= (10, 0))
|
|
||||||
|
|
||||||
IS_THOR = LazyVal(lambda: torch.cuda.is_available() and torch.cuda.get_device_capability()[0] == 10
|
IS_THOR = LazyVal(lambda: torch.cuda.is_available() and torch.cuda.get_device_capability()[0] == 10
|
||||||
and torch.cuda.get_device_capability()[1] > 0)
|
and torch.cuda.get_device_capability()[1] > 0)
|
||||||
|
|
@@ -102,7 +101,6 @@ def evaluate_platform_supports_fp8():
 
 PLATFORM_SUPPORTS_FP8: bool = LazyVal(lambda: evaluate_platform_supports_fp8())
 
-PLATFORM_SUPPORTS_MX_GEMM: bool = LazyVal(lambda: TEST_CUDA and SM100OrLater)
 
 if TEST_NUMBA:
 try:
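Note: PLATFORM_SUPPORTS_MX_GEMM (removed above) simply gated MX gemm tests on SM 10.0+ devices. An equivalent standalone check, sketched here without the LazyVal wrapper:

    import torch

    has_cuda = torch.cuda.is_available()
    supports_mx_gemm = has_cuda and torch.cuda.get_device_capability() >= (10, 0)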