diff --git a/test/inductor/test_cpu_repro.py b/test/inductor/test_cpu_repro.py index e887367dd8d..32e8049dd7f 100644 --- a/test/inductor/test_cpu_repro.py +++ b/test/inductor/test_cpu_repro.py @@ -4810,22 +4810,6 @@ class CPUReproTests(TestCase): self.common(fn, (x,)) check_metrics_vec_kernel_count(1) - # Tail vectorization case - x = torch.randn((22, 22), dtype=torch.double) - torch._dynamo.reset() - metrics.reset() - with torch.no_grad(): - expected = fn(x) - compiled_fn = torch.compile(fn) - actual, code = run_and_get_cpp_code(compiled_fn, x) - self.assertEqual(expected, actual) - # 1 generated vec kernel - self.assertEqual(metrics.generated_cpp_vec_kernel_count, 1) - # Check that both main and tail loops are vectorized - FileCheck().check_count( - "at::vec::VectorizedN::loadu", 2, exactly=True - ).run(code) - def test_double_reduction_vec(self): def fn(x): return x.sum(dim=1) @@ -4835,22 +4819,6 @@ class CPUReproTests(TestCase): self.common(fn, (x,)) check_metrics_vec_kernel_count(1) - # Tail vectorization case - x = torch.randn((22, 22), dtype=torch.double) - torch._dynamo.reset() - metrics.reset() - with torch.no_grad(): - expected = fn(x) - compiled_fn = torch.compile(fn) - actual, code = run_and_get_cpp_code(compiled_fn, x) - self.assertEqual(expected, actual) - # 1 generated vec kernel - self.assertEqual(metrics.generated_cpp_vec_kernel_count, 1) - # Check that both main and tail loops are vectorized - FileCheck().check_count( - "at::vec::VectorizedN::loadu", 2, exactly=True - ).run(code) - def test_convert_fp32_to_double_vec(self): def fn(x): return x.to(torch.double) @@ -4860,22 +4828,6 @@ class CPUReproTests(TestCase): self.common(fn, (x,)) check_metrics_vec_kernel_count(1) - # Tail vectorization case - x = torch.randn(22, 22) - torch._dynamo.reset() - metrics.reset() - with torch.no_grad(): - expected = fn(x) - compiled_fn = torch.compile(fn) - actual, code = run_and_get_cpp_code(compiled_fn, x) - self.assertEqual(expected, actual) - # 1 generated vec kernel - self.assertEqual(metrics.generated_cpp_vec_kernel_count, 1) - # Check that both main and tail loops are vectorized - FileCheck().check_count( - "at::vec::convert", 2, exactly=True - ).run(code) - def test_convert_double_to_fp32_vec(self): def fn(x): return x.to(torch.float32) @@ -4885,22 +4837,6 @@ class CPUReproTests(TestCase): self.common(fn, (x,)) check_metrics_vec_kernel_count(1) - # Tail vectorization case - x = torch.randn((22, 22), dtype=torch.double) - torch._dynamo.reset() - metrics.reset() - with torch.no_grad(): - expected = fn(x) - compiled_fn = torch.compile(fn) - actual, code = run_and_get_cpp_code(compiled_fn, x) - self.assertEqual(expected, actual) - # 1 generated vec kernel - self.assertEqual(metrics.generated_cpp_vec_kernel_count, 1) - # Check that both main and tail loops are vectorized - FileCheck().check_count( - "at::vec::convert", 2, exactly=True - ).run(code) - def test_no_redundant_to_dtypes_between_fused_scheduler_node(self): # https://github.com/pytorch/pytorch/issues/115260 p0 = torch.tensor([1.0879], dtype=torch.float16) diff --git a/torch/_inductor/codegen/cpp.py b/torch/_inductor/codegen/cpp.py index 6602a821c56..1b8b0a9b9e2 100644 --- a/torch/_inductor/codegen/cpp.py +++ b/torch/_inductor/codegen/cpp.py @@ -159,7 +159,6 @@ VECTORIZABLE_DTYPES: list[torch.dtype] = [ ] MASKED_VECTORIZABLE_DTYPES: list[torch.dtype] = [ - torch.float64, torch.float, torch.bfloat16, torch.float16,