Revert "[Inductor] support masked vectorization for the tail_loop for fp8 datatype (#163324)"

This reverts commit e8cb34dd52. Reverted https://github.com/pytorch/pytorch/pull/163324 on behalf of https://github.com/clee2000 because it seems to have broken some no_gpu tests: test/inductor/test_cpu_repro.py::CPUReproTests::test_double_reduction_vec [GH job link](https://github.com/pytorch/pytorch/actions/runs/18689033019/job/53290772740) [HUD commit link](e9d8973427) ([comment](https://github.com/pytorch/pytorch/pull/163316#issuecomment-3428210509))
2025-12-06 12:20:52 +01:00 · 2025-10-21 17:44:42 +00:00 · 2025-10-21 17:44:42 +00:00 · 78bf6186f2
commit 78bf6186f2
parent c40048472c
2 changed files with 13 additions and 19 deletions
--- a/test/inductor/test_cpu_repro.py
+++ b/test/inductor/test_cpu_repro.py
@@ -1543,26 +1543,22 @@ class CPUReproTests(TestCase):
            with config.patch({"cpp.simdlen": None}):
                torch._dynamo.reset()
                metrics.reset()
-                inputs = (
-                    x,
-                    scale,
-                    zero_point,
-                    use_dequant,
-                    use_quant,
-                    quant_min,
-                    quant_max,
-                    dtype,
-                    dequant_out_dtype,
+                self.common(
+                    fn,
+                    (
+                        x,
+                        scale,
+                        zero_point,
+                        use_dequant,
+                        use_quant,
+                        quant_min,
+                        quant_max,
+                        dtype,
+                        dequant_out_dtype,
+                    ),
                )
-                self.common(fn, inputs)
                check_metrics_vec_kernel_count(1)

-                # Check that both main and tail loops are vectorized
-                if dtype in [torch.float8_e4m3fn, torch.float8_e5m2]:
-                    compiled_fn = torch.compile(fn)
-                    _, code = run_and_get_cpp_code(compiled_fn, *inputs)
-                    FileCheck().check_count("loadu", 2, exactly=True).run(code)
-
    @requires_vectorization
    def test_dequant_quant_lowering_uint8(self):
        self._test_dequant_quant_lowering_helper(torch.uint8)
--- a/torch/_inductor/codegen/cpp.py
+++ b/torch/_inductor/codegen/cpp.py
@@ -165,8 +165,6 @@ MASKED_VECTORIZABLE_DTYPES: list[torch.dtype] = [
    torch.float16,
    torch.uint8,
    torch.int8,
-    torch.float8_e4m3fn,
-    torch.float8_e5m2,
 ]