Add new props to `_XpuDeviceProperties` for Triton GEMM optimization (#131738)

# Motivation
This PR adds new device properties to `_XpuDeviceProperties` for Triton GEMM optimization: flags reporting hardware support for bfloat16 conversions, subgroup matrix multiply-accumulate (MMA), MMA on tensor-float32, and subgroup 2D block I/O.

# Additional Context
`ext_oneapi_supports_cl_extension` is not an ABI-neutral API; it depends on compiler 2025.0. For more details, see https://github.com/intel/llvm/pull/13212
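As a concrete illustration, here is a minimal sketch of the query this PR wraps in `ASSIGN_EXP_CL_ASPECT` (the extension name and out-parameter mirror the diff below; compiling it requires a oneAPI compiler new enough to ship the experimental API):

```cpp
#include <sycl/sycl.hpp>

// Sketch: ask whether the device's OpenCL backend exposes a cl_intel_*
// extension. ext_oneapi_supports_cl_extension is an experimental oneAPI API
// (see the linked intel/llvm PR); it is not available on older compilers.
bool supports_subgroup_2d_block_io(const sycl::device& raw_device) {
  sycl::ext::oneapi::experimental::cl_version cl_version; // filled by the query
  return raw_device.ext_oneapi_supports_cl_extension(
      "cl_intel_subgroup_2d_block_io", &cl_version);
}
```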

Pull Request resolved: https://github.com/pytorch/pytorch/pull/131738
Approved by: https://github.com/gujinghui
Yu, Guangye authored on 2024-08-16 06:01:33 +00:00; committed by PyTorch MergeBot
parent fed6096e73
commit fbd020fce6
4 changed files with 67 additions and 14 deletions


@@ -136,6 +136,22 @@ namespace c10::xpu {
   /* 64-bit atomic operation is supported on device. */ \
   _(atomic64)
+
+#define AT_FORALL_XPU_EXP_CL_ASPECT(_) \
+  /* conversion between single-precision 32-bit floating-point values and \
+   * 16-bit bfloat16 values is supported on device. */ \
+  _(bfloat16_conversions) \
+  \
+  /* specialized hardware to compute MMA is supported on device. */ \
+  _(subgroup_matrix_multiply_accumulate) \
+  \
+  /* specialized hardware to compute MMA for 32-bit floating-point is \
+   * supported on device. */ \
+  _(subgroup_matrix_multiply_accumulate_tensor_float32) \
+  \
+  /* block read operations for efficient matrix multiplication are supported \
+   * on device. */ \
+  _(subgroup_2d_block_io)
 
 #define _DEFINE_SYCL_PROP(ns, property, member) \
   ns::property::return_type member;
@@ -159,6 +175,8 @@ struct C10_XPU_API DeviceProp {
   AT_FORALL_XPU_EXT_DEVICE_PROPERTIES(DEFINE_EXT_DEVICE_PROP);
   AT_FORALL_XPU_DEVICE_ASPECT(DEFINE_DEVICE_ASPECT);
+  AT_FORALL_XPU_EXP_CL_ASPECT(DEFINE_DEVICE_ASPECT);
 };
 
 #undef _DEFINE_SYCL_PROP
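The property list is an X-macro: each call site applies its own callback to every entry. Assuming `DEFINE_DEVICE_ASPECT(member)` emits `bool has_##member;` (an assumption; its definition is outside this hunk, but it matches the `has_` naming used by `ASSIGN_DEVICE_ASPECT` and by the Python tests below), the new line adds four boolean fields to `DeviceProp`:

```cpp
// Hypothetical expansion of AT_FORALL_XPU_EXP_CL_ASPECT(DEFINE_DEVICE_ASPECT),
// assuming DEFINE_DEVICE_ASPECT(member) expands to `bool has_##member;`:
bool has_bfloat16_conversions;
bool has_subgroup_matrix_multiply_accumulate;
bool has_subgroup_matrix_multiply_accumulate_tensor_float32;
bool has_subgroup_2d_block_io;
```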


@@ -89,6 +89,10 @@ void initDeviceProperties(DeviceProp* device_prop, int device) {
 #define ASSIGN_DEVICE_ASPECT(member) \
   device_prop->has_##member = raw_device.has(sycl::aspect::member);
+
+#define ASSIGN_EXP_CL_ASPECT(member) \
+  device_prop->has_##member = raw_device.ext_oneapi_supports_cl_extension( \
+      "cl_intel_" #member, &cl_version);
 
   AT_FORALL_XPU_DEVICE_PROPERTIES(ASSIGN_DEVICE_PROP);
   device_prop->platform_name =
@@ -97,6 +101,10 @@ void initDeviceProperties(DeviceProp* device_prop, int device) {
   AT_FORALL_XPU_EXT_DEVICE_PROPERTIES(ASSIGN_EXT_DEVICE_PROP);
   AT_FORALL_XPU_DEVICE_ASPECT(ASSIGN_DEVICE_ASPECT);
+
+  // TODO: Remove cl_version since it is unnecessary.
+  sycl::ext::oneapi::experimental::cl_version cl_version;
+  AT_FORALL_XPU_EXP_CL_ASPECT(ASSIGN_EXP_CL_ASPECT);
   return;
 }
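Expanding the X-macro makes the control flow explicit: `AT_FORALL_XPU_EXP_CL_ASPECT(ASSIGN_EXP_CL_ASPECT)` stamps out one extension query per aspect, using the token-pasted member name both as the struct field and (stringized) as the `cl_intel_` extension suffix. For example, the first entry expands to:

```cpp
// Mechanical expansion of _(bfloat16_conversions) through ASSIGN_EXP_CL_ASPECT:
device_prop->has_bfloat16_conversions =
    raw_device.ext_oneapi_supports_cl_extension(
        "cl_intel_bfloat16_conversions", &cl_version);
```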


@@ -110,6 +110,22 @@ class TestXpu(TestCase):
         self.assertEqual(
             device_properties.has_atomic64, device_capability["has_atomic64"]
         )
+        self.assertEqual(
+            device_properties.has_bfloat16_conversions,
+            device_capability["has_bfloat16_conversions"],
+        )
+        self.assertEqual(
+            device_properties.has_subgroup_matrix_multiply_accumulate,
+            device_capability["has_subgroup_matrix_multiply_accumulate"],
+        )
+        self.assertEqual(
+            device_properties.has_subgroup_matrix_multiply_accumulate_tensor_float32,
+            device_capability["has_subgroup_matrix_multiply_accumulate_tensor_float32"],
+        )
+        self.assertEqual(
+            device_properties.has_subgroup_2d_block_io,
+            device_capability["has_subgroup_2d_block_io"],
+        )
 
     def test_wrong_xpu_fork(self):
         stderr = TestCase.runWithPytorchAPIUsageStderr(


@@ -230,22 +230,33 @@ static void registerXpuDeviceProperties(PyObject* module) {
     return (prop.gpu_eu_count / prop.gpu_eu_count_per_subslice);
   };
   auto m = py::handle(module).cast<py::module>();
-  py::class_<DeviceProp>(m, "_XpuDeviceProperties")
-      .def_readonly("name", &DeviceProp::name)
-      .def_readonly("platform_name", &DeviceProp::platform_name)
-      .def_readonly("vendor", &DeviceProp::vendor)
-      .def_readonly("driver_version", &DeviceProp::driver_version)
-      .def_readonly("version", &DeviceProp::version)
+
+#define DEFINE_READONLY_MEMBER(member) \
+  def_readonly(#member, &DeviceProp::member)
+
+#define THXP_FORALL_DEVICE_PROPERTIES(_)            \
+  py::class_<DeviceProp>(m, "_XpuDeviceProperties") \
+      ._(name)                                      \
+      ._(platform_name)                             \
+      ._(vendor)                                    \
+      ._(driver_version)                            \
+      ._(version)                                   \
+      ._(max_compute_units)                         \
+      ._(gpu_eu_count)                              \
+      ._(max_work_group_size)                       \
+      ._(max_num_sub_groups)                        \
+      ._(sub_group_sizes)                           \
+      ._(has_fp16)                                  \
+      ._(has_fp64)                                  \
+      ._(has_atomic64)                              \
+      ._(has_bfloat16_conversions)                  \
+      ._(has_subgroup_matrix_multiply_accumulate)   \
+      ._(has_subgroup_matrix_multiply_accumulate_tensor_float32) \
+      ._(has_subgroup_2d_block_io)
+
+  THXP_FORALL_DEVICE_PROPERTIES(DEFINE_READONLY_MEMBER)
       .def_readonly("total_memory", &DeviceProp::global_mem_size)
-      .def_readonly("max_compute_units", &DeviceProp::max_compute_units)
-      .def_readonly("gpu_eu_count", &DeviceProp::gpu_eu_count)
       .def_property_readonly("gpu_subslice_count", gpu_subslice_count)
-      .def_readonly("max_work_group_size", &DeviceProp::max_work_group_size)
-      .def_readonly("max_num_sub_groups", &DeviceProp::max_num_sub_groups)
-      .def_readonly("sub_group_sizes", &DeviceProp::sub_group_sizes)
-      .def_readonly("has_fp16", &DeviceProp::has_fp16)
-      .def_readonly("has_fp64", &DeviceProp::has_fp64)
-      .def_readonly("has_atomic64", &DeviceProp::has_atomic64)
       .def_property_readonly("type", get_device_type)
       .def(
           "__repr__",