Mirror of https://github.com/zebrajr/pytorch.git (synced 2025-12-06 12:20:52 +01:00)
Add new prop to _XpuDeviceProperties for Triton GEMM optimization (#131738)
# Motivation
This PR aims to add new properties to `_XpuDeviceProperties` for Triton GEMM optimization.

# Additional Context
`ext_oneapi_supports_cl_extension` is not an ABI-neutral API. It depends on compiler 2025.0. For more details, see https://github.com/intel/llvm/pull/13212

Pull Request resolved: https://github.com/pytorch/pytorch/pull/131738
Approved by: https://github.com/gujinghui
This commit is contained in:
parent
fed6096e73
commit
fbd020fce6
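For context, the flags added by this PR surface on the Python side through `torch.xpu.get_device_properties`, which is what a Triton GEMM heuristic would consult. A minimal sketch of that usage (illustrative only; the gated kernel choice is hypothetical and not part of this PR):

```python
# Minimal sketch: query the new capability flags from Python and gate a kernel
# choice on them. Assumes an XPU build of PyTorch new enough to expose these
# properties; the "use MMA GEMM" decision itself is hypothetical.
import torch

if torch.xpu.is_available():
    props = torch.xpu.get_device_properties(0)
    use_mma_gemm = getattr(props, "has_subgroup_matrix_multiply_accumulate", False)
    use_2d_block_io = getattr(props, "has_subgroup_2d_block_io", False)
    print("MMA hardware:", use_mma_gemm, "| 2D block IO:", use_2d_block_io)
```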
@@ -136,6 +136,22 @@ namespace c10::xpu {
   /* 64-bit atomic operation is supported on device. */ \
   _(atomic64)

+#define AT_FORALL_XPU_EXP_CL_ASPECT(_)                                      \
+  /* conversion between single-precision 32-bit floating-point values and   \
+   * 16-bit bfloat16 values is supported on device. */                      \
+  _(bfloat16_conversions)                                                   \
+                                                                            \
+  /* specialized hardware to compute MMA is supported on device. */         \
+  _(subgroup_matrix_multiply_accumulate)                                    \
+                                                                            \
+  /* specialized hardware to compute MMA for 32-bit floating-point is       \
+   * supported on device. */                                                \
+  _(subgroup_matrix_multiply_accumulate_tensor_float32)                     \
+                                                                            \
+  /* block read operations for efficient matrix multiplication is supported on \
+   * device. */                                                             \
+  _(subgroup_2d_block_io)
+
 #define _DEFINE_SYCL_PROP(ns, property, member) \
   ns::property::return_type member;
@@ -159,6 +175,8 @@ struct C10_XPU_API DeviceProp {
   AT_FORALL_XPU_EXT_DEVICE_PROPERTIES(DEFINE_EXT_DEVICE_PROP);

   AT_FORALL_XPU_DEVICE_ASPECT(DEFINE_DEVICE_ASPECT);
+
+  AT_FORALL_XPU_EXP_CL_ASPECT(DEFINE_DEVICE_ASPECT);
 };

 #undef _DEFINE_SYCL_PROP
@@ -89,6 +89,10 @@ void initDeviceProperties(DeviceProp* device_prop, int device) {
 #define ASSIGN_DEVICE_ASPECT(member) \
   device_prop->has_##member = raw_device.has(sycl::aspect::member);

+#define ASSIGN_EXP_CL_ASPECT(member)                                        \
+  device_prop->has_##member = raw_device.ext_oneapi_supports_cl_extension( \
+      "cl_intel_" #member, &cl_version);
+
   AT_FORALL_XPU_DEVICE_PROPERTIES(ASSIGN_DEVICE_PROP);

   device_prop->platform_name =
@@ -97,6 +101,10 @@ void initDeviceProperties(DeviceProp* device_prop, int device) {
   AT_FORALL_XPU_EXT_DEVICE_PROPERTIES(ASSIGN_EXT_DEVICE_PROP);

   AT_FORALL_XPU_DEVICE_ASPECT(ASSIGN_DEVICE_ASPECT);
+
+  // TODO: Remove cl_version since it is unnecessary.
+  sycl::ext::oneapi::experimental::cl_version cl_version;
+  AT_FORALL_XPU_EXP_CL_ASPECT(ASSIGN_EXP_CL_ASPECT);
   return;
 }
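Note how `ASSIGN_EXP_CL_ASPECT` derives the query string from the aspect name: each entry in `AT_FORALL_XPU_EXP_CL_ASPECT` is probed as the OpenCL extension `"cl_intel_" #member`, and the result lands in the matching `has_<member>` flag. A tiny illustrative Python snippet of that naming rule (not part of the change):

```python
# Illustrative only: mirrors the name mapping performed by ASSIGN_EXP_CL_ASPECT,
# where aspect "x" is probed as OpenCL extension "cl_intel_x" and stored as has_x.
aspects = [
    "bfloat16_conversions",
    "subgroup_matrix_multiply_accumulate",
    "subgroup_matrix_multiply_accumulate_tensor_float32",
    "subgroup_2d_block_io",
]
for name in aspects:
    print(f"has_{name}  <-  ext_oneapi_supports_cl_extension('cl_intel_{name}')")
```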
@@ -110,6 +110,22 @@ class TestXpu(TestCase):
         self.assertEqual(
             device_properties.has_atomic64, device_capability["has_atomic64"]
         )
+        self.assertEqual(
+            device_properties.has_bfloat16_conversions,
+            device_capability["has_bfloat16_conversions"],
+        )
+        self.assertEqual(
+            device_properties.has_subgroup_matrix_multiply_accumulate,
+            device_capability["has_subgroup_matrix_multiply_accumulate"],
+        )
+        self.assertEqual(
+            device_properties.has_subgroup_matrix_multiply_accumulate_tensor_float32,
+            device_capability["has_subgroup_matrix_multiply_accumulate_tensor_float32"],
+        )
+        self.assertEqual(
+            device_properties.has_subgroup_2d_block_io,
+            device_capability["has_subgroup_2d_block_io"],
+        )

     def test_wrong_xpu_fork(self):
         stderr = TestCase.runWithPytorchAPIUsageStderr(
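The new assertions keep `get_device_properties` and `get_device_capability` in sync for each added flag. A rough interactive equivalent of the same check, assuming an XPU device is present and that `torch.xpu.get_device_capability` returns a dict with these keys (as the test suggests); this is a sketch, not the actual test:

```python
# Sketch of the consistency check performed by the test above: every new has_*
# flag reported by get_device_properties should match the get_device_capability
# dict. Assumes at least one XPU device is available.
import torch

props = torch.xpu.get_device_properties(0)
caps = torch.xpu.get_device_capability(0)  # dict of capability flags
for name in (
    "has_bfloat16_conversions",
    "has_subgroup_matrix_multiply_accumulate",
    "has_subgroup_matrix_multiply_accumulate_tensor_float32",
    "has_subgroup_2d_block_io",
):
    assert getattr(props, name) == caps[name], name
```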
@@ -230,22 +230,33 @@ static void registerXpuDeviceProperties(PyObject* module) {
     return (prop.gpu_eu_count / prop.gpu_eu_count_per_subslice);
   };
   auto m = py::handle(module).cast<py::module>();
-  py::class_<DeviceProp>(m, "_XpuDeviceProperties")
-      .def_readonly("name", &DeviceProp::name)
-      .def_readonly("platform_name", &DeviceProp::platform_name)
-      .def_readonly("vendor", &DeviceProp::vendor)
-      .def_readonly("driver_version", &DeviceProp::driver_version)
-      .def_readonly("version", &DeviceProp::version)
+
+#define DEFINE_READONLY_MEMBER(member) \
+  def_readonly(#member, &DeviceProp::member)
+
+#define THXP_FORALL_DEVICE_PROPERTIES(_)                          \
+  py::class_<DeviceProp>(m, "_XpuDeviceProperties")               \
+      ._(name)                                                    \
+      ._(platform_name)                                           \
+      ._(vendor)                                                  \
+      ._(driver_version)                                          \
+      ._(version)                                                 \
+      ._(max_compute_units)                                       \
+      ._(gpu_eu_count)                                            \
+      ._(max_work_group_size)                                     \
+      ._(max_num_sub_groups)                                      \
+      ._(sub_group_sizes)                                         \
+      ._(has_fp16)                                                \
+      ._(has_fp64)                                                \
+      ._(has_atomic64)                                            \
+      ._(has_bfloat16_conversions)                                \
+      ._(has_subgroup_matrix_multiply_accumulate)                 \
+      ._(has_subgroup_matrix_multiply_accumulate_tensor_float32)  \
+      ._(has_subgroup_2d_block_io)
+
+  THXP_FORALL_DEVICE_PROPERTIES(DEFINE_READONLY_MEMBER)
       .def_readonly("total_memory", &DeviceProp::global_mem_size)
-      .def_readonly("max_compute_units", &DeviceProp::max_compute_units)
-      .def_readonly("gpu_eu_count", &DeviceProp::gpu_eu_count)
       .def_property_readonly("gpu_subslice_count", gpu_subslice_count)
-      .def_readonly("max_work_group_size", &DeviceProp::max_work_group_size)
-      .def_readonly("max_num_sub_groups", &DeviceProp::max_num_sub_groups)
-      .def_readonly("sub_group_sizes", &DeviceProp::sub_group_sizes)
-      .def_readonly("has_fp16", &DeviceProp::has_fp16)
-      .def_readonly("has_fp64", &DeviceProp::has_fp64)
-      .def_readonly("has_atomic64", &DeviceProp::has_atomic64)
       .def_property_readonly("type", get_device_type)
       .def(
           "__repr__",
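Since each member is bound with `def_readonly`, the new flags become plain read-only attributes on the `_XpuDeviceProperties` object returned by `torch.xpu.get_device_properties`. A quick way to inspect them (a sketch, assuming an XPU-enabled PyTorch build with at least one device):

```python
# Sketch: list every has_* attribute exposed by the bound _XpuDeviceProperties
# class, including the four flags registered above. Assumes an XPU device.
import torch

props = torch.xpu.get_device_properties(0)
for attr in sorted(dir(props)):
    if attr.startswith("has_"):
        print(f"{attr} = {getattr(props, attr)}")
```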