Add new props to `_XpuDeviceProperties` for Triton GEMM optimization (#131738)

# Motivation
This PR adds new device properties to `_XpuDeviceProperties` for Triton GEMM optimization: flags reporting hardware support for bfloat16 conversions, subgroup matrix multiply-accumulate (MMA), MMA on tensor-float32, and subgroup 2D block I/O.

# Additional Context
`ext_oneapi_supports_cl_extension` is not an ABI-neutral API; it depends on compiler 2025.0. For more details, see https://github.com/intel/llvm/pull/13212
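As a concrete illustration, here is a minimal sketch of the query this PR wraps in `ASSIGN_EXP_CL_ASPECT` (the extension name and out-parameter mirror the diff below; compiling it requires a oneAPI compiler new enough to ship the experimental API):

```cpp
#include <sycl/sycl.hpp>

// Sketch: ask whether the device's OpenCL backend exposes a cl_intel_*
// extension. ext_oneapi_supports_cl_extension is an experimental oneAPI API
// (see the linked intel/llvm PR); it is not available on older compilers.
bool supports_subgroup_2d_block_io(const sycl::device& raw_device) {
  sycl::ext::oneapi::experimental::cl_version cl_version; // filled by the query
  return raw_device.ext_oneapi_supports_cl_extension(
      "cl_intel_subgroup_2d_block_io", &cl_version);
}
```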

Pull Request resolved: https://github.com/pytorch/pytorch/pull/131738
Approved by: https://github.com/gujinghui
Yu, Guangye authored on 2024-08-16 06:01:33 +00:00; committed by PyTorch MergeBot
parent fed6096e73
commit fbd020fce6
4 changed files with 67 additions and 14 deletions


@@ -136,6 +136,22 @@ namespace c10::xpu {
   /* 64-bit atomic operation is supported on device. */ \
   _(atomic64)
+
+#define AT_FORALL_XPU_EXP_CL_ASPECT(_) \
+  /* conversion between single-precision 32-bit floating-point values and \
+   * 16-bit bfloat16 values is supported on device. */ \
+  _(bfloat16_conversions) \
+  \
+  /* specialized hardware to compute MMA is supported on device. */ \
+  _(subgroup_matrix_multiply_accumulate) \
+  \
+  /* specialized hardware to compute MMA for 32-bit floating-point is \
+   * supported on device. */ \
+  _(subgroup_matrix_multiply_accumulate_tensor_float32) \
+  \
+  /* block read operations for efficient matrix multiplication are supported \
+   * on device. */ \
+  _(subgroup_2d_block_io)
 
 #define _DEFINE_SYCL_PROP(ns, property, member) \
   ns::property::return_type member;
@@ -159,6 +175,8 @@ struct C10_XPU_API DeviceProp {
   AT_FORALL_XPU_EXT_DEVICE_PROPERTIES(DEFINE_EXT_DEVICE_PROP);
   AT_FORALL_XPU_DEVICE_ASPECT(DEFINE_DEVICE_ASPECT);
+  AT_FORALL_XPU_EXP_CL_ASPECT(DEFINE_DEVICE_ASPECT);
 };
 
 #undef _DEFINE_SYCL_PROP
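The property list is an X-macro: each call site applies its own callback to every entry. Assuming `DEFINE_DEVICE_ASPECT(member)` emits `bool has_##member;` (an assumption; its definition is outside this hunk, but it matches the `has_` naming used by `ASSIGN_DEVICE_ASPECT` and by the Python tests below), the new line adds four boolean fields to `DeviceProp`:

```cpp
// Hypothetical expansion of AT_FORALL_XPU_EXP_CL_ASPECT(DEFINE_DEVICE_ASPECT),
// assuming DEFINE_DEVICE_ASPECT(member) expands to `bool has_##member;`:
bool has_bfloat16_conversions;
bool has_subgroup_matrix_multiply_accumulate;
bool has_subgroup_matrix_multiply_accumulate_tensor_float32;
bool has_subgroup_2d_block_io;
```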


@@ -89,6 +89,10 @@ void initDeviceProperties(DeviceProp* device_prop, int device) {
 #define ASSIGN_DEVICE_ASPECT(member) \
   device_prop->has_##member = raw_device.has(sycl::aspect::member);
+
+#define ASSIGN_EXP_CL_ASPECT(member) \
+  device_prop->has_##member = raw_device.ext_oneapi_supports_cl_extension( \
+      "cl_intel_" #member, &cl_version);
 
   AT_FORALL_XPU_DEVICE_PROPERTIES(ASSIGN_DEVICE_PROP);
   device_prop->platform_name =
@@ -97,6 +101,10 @@ void initDeviceProperties(DeviceProp* device_prop, int device) {
   AT_FORALL_XPU_EXT_DEVICE_PROPERTIES(ASSIGN_EXT_DEVICE_PROP);
   AT_FORALL_XPU_DEVICE_ASPECT(ASSIGN_DEVICE_ASPECT);
+
+  // TODO: Remove cl_version since it is unnecessary.
+  sycl::ext::oneapi::experimental::cl_version cl_version;
+  AT_FORALL_XPU_EXP_CL_ASPECT(ASSIGN_EXP_CL_ASPECT);
   return;
 }
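Expanding the X-macro makes the control flow explicit: `AT_FORALL_XPU_EXP_CL_ASPECT(ASSIGN_EXP_CL_ASPECT)` stamps out one extension query per aspect, using the token-pasted member name both as the struct field and (stringized) as the `cl_intel_` extension suffix. For example, the first entry expands to:

```cpp
// Mechanical expansion of _(bfloat16_conversions) through ASSIGN_EXP_CL_ASPECT:
device_prop->has_bfloat16_conversions =
    raw_device.ext_oneapi_supports_cl_extension(
        "cl_intel_bfloat16_conversions", &cl_version);
```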


@@ -110,6 +110,22 @@ class TestXpu(TestCase):
         self.assertEqual(
             device_properties.has_atomic64, device_capability["has_atomic64"]
         )
+        self.assertEqual(
+            device_properties.has_bfloat16_conversions,
+            device_capability["has_bfloat16_conversions"],
+        )
+        self.assertEqual(
+            device_properties.has_subgroup_matrix_multiply_accumulate,
+            device_capability["has_subgroup_matrix_multiply_accumulate"],
+        )
+        self.assertEqual(
+            device_properties.has_subgroup_matrix_multiply_accumulate_tensor_float32,
+            device_capability["has_subgroup_matrix_multiply_accumulate_tensor_float32"],
+        )
+        self.assertEqual(
+            device_properties.has_subgroup_2d_block_io,
+            device_capability["has_subgroup_2d_block_io"],
+        )
 
     def test_wrong_xpu_fork(self):
         stderr = TestCase.runWithPytorchAPIUsageStderr(


@@ -230,22 +230,33 @@ static void registerXpuDeviceProperties(PyObject* module) {
     return (prop.gpu_eu_count / prop.gpu_eu_count_per_subslice);
   };
   auto m = py::handle(module).cast<py::module>();
-  py::class_<DeviceProp>(m, "_XpuDeviceProperties")
-      .def_readonly("name", &DeviceProp::name)
-      .def_readonly("platform_name", &DeviceProp::platform_name)
-      .def_readonly("vendor", &DeviceProp::vendor)
-      .def_readonly("driver_version", &DeviceProp::driver_version)
-      .def_readonly("version", &DeviceProp::version)
+
+#define DEFINE_READONLY_MEMBER(member) \
+  def_readonly(#member, &DeviceProp::member)
+
+#define THXP_FORALL_DEVICE_PROPERTIES(_)            \
+  py::class_<DeviceProp>(m, "_XpuDeviceProperties") \
+      ._(name)                                      \
+      ._(platform_name)                             \
+      ._(vendor)                                    \
+      ._(driver_version)                            \
+      ._(version)                                   \
+      ._(max_compute_units)                         \
+      ._(gpu_eu_count)                              \
+      ._(max_work_group_size)                       \
+      ._(max_num_sub_groups)                        \
+      ._(sub_group_sizes)                           \
+      ._(has_fp16)                                  \
+      ._(has_fp64)                                  \
+      ._(has_atomic64)                              \
+      ._(has_bfloat16_conversions)                  \
+      ._(has_subgroup_matrix_multiply_accumulate)   \
+      ._(has_subgroup_matrix_multiply_accumulate_tensor_float32) \
+      ._(has_subgroup_2d_block_io)
+
+  THXP_FORALL_DEVICE_PROPERTIES(DEFINE_READONLY_MEMBER)
       .def_readonly("total_memory", &DeviceProp::global_mem_size)
-      .def_readonly("max_compute_units", &DeviceProp::max_compute_units)
-      .def_readonly("gpu_eu_count", &DeviceProp::gpu_eu_count)
       .def_property_readonly("gpu_subslice_count", gpu_subslice_count)
-      .def_readonly("max_work_group_size", &DeviceProp::max_work_group_size)
-      .def_readonly("max_num_sub_groups", &DeviceProp::max_num_sub_groups)
-      .def_readonly("sub_group_sizes", &DeviceProp::sub_group_sizes)
-      .def_readonly("has_fp16", &DeviceProp::has_fp16)
-      .def_readonly("has_fp64", &DeviceProp::has_fp64)
-      .def_readonly("has_atomic64", &DeviceProp::has_atomic64)
       .def_property_readonly("type", get_device_type)
       .def(
           "__repr__",