[Intel GPU] xpu-ops codegen via backend whitelist (#130082)

# Motivation

This PR enhances the codegen so that it can generate code for the XPU backend.

Currently, XPU operators have to be registered by hand. Developers cannot take advantage of the shared codegen that handles tensor meta setting (strides, proxy outputs, structured kernels). Manually porting this code is error-prone and leads to high maintenance effort.

We leverage the `backend_whitelist` argument of `gen.py` to generate the headers and source files needed by XPU.

# Usage
XPU ops live in `third_party/torch-xpu-ops`; the codegen step runs before `torch-xpu-ops` is compiled.

We use the following command to generate the XPU operators:

```
python -m torchgen.gen \
  --source-path path/to/yaml/of/xpu \
  --install-dir build/xpu \
  --per-operator-headers \
  --static-dispatch-backend \
  --backend-whitelist=XPU
```

The only difference from a standard invocation is `--backend-whitelist=XPU`; `backend-whitelist` is an existing argument in torchgen.

The inputs of `gen.py` are code templates and an operators yaml. We share the same templates as `aten`. A simplified yaml lives in `third_party/torch-xpu-ops` and lists only the operators supported on XPU. This yaml is a copy (with modifications) of `native_functions.yaml`; no extra entries are added, and the format is the same as the one in `aten`.

# Result

All operator headers are generated independently into `build/xpu/ATen/ops`, so operators declared/defined by CPU/CUDA or any other backend are unaffected. XPU operators include headers only from this folder.
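
For illustration, a kernel source in `torch-xpu-ops` would then pull in only per-operator headers resolved from this directory. The includes below are a hypothetical sketch (the exact headers depend on the operator being implemented), not a copy of actual torch-xpu-ops source:

```cpp
// Hypothetical include set for an XPU operator implementation. The ATen/ops/*
// headers are the per-operator headers generated into build/xpu/ATen/ops by the
// command above; ATen/xpu/EmptyTensor.h is the XPU-specific header referenced by
// the generated registration code (it resides in third_party/torch-xpu-ops).
#include <ATen/ops/add_native.h>     // generated native/structured declarations for `add`
#include <ATen/ops/empty_strided.h>  // per-operator factory header, instead of the monolithic ATen/Functions.h
#include <ATen/xpu/EmptyTensor.h>
```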

# Verification

* In `third_party/torch-xpu-ops`, we migrate all supported kernels to the structured-kernel style, registering them through `REGISTER_XPU_DISPATCH` or `TORCH_IMPL_FUNC` (see the sketch below), and we verify them with unit tests based on `test_ops.py`.
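
For reference, the sketch below illustrates the two registration paths named above. It is not actual torch-xpu-ops source: the operator, stub, and kernel names (`addmm_out_xpu`, `xpu::addmm_kernel`, `xpu::abs_kernel`) are placeholders, and it assumes the corresponding structured declaration and the `REGISTER_XPU_DISPATCH` macro are visible through the included headers.

```cpp
#include <ATen/native/DispatchStub.h>  // dispatch-stub machinery (assumed to provide REGISTER_XPU_DISPATCH here)
#include <ATen/native/UnaryOps.h>      // declares abs_stub, reused in the example below
#include <ATen/ops/addmm_native.h>     // generated structured declaration, from build/xpu/ATen/ops

namespace at::native {

// Structured-kernel path: the codegen'd wrapper has already checked shapes,
// set strides/dtype, and allocated `result`; the body only launches the device kernel.
TORCH_IMPL_FUNC(addmm_out_xpu)
(const Tensor& self, const Tensor& mat1, const Tensor& mat2,
 const Scalar& beta, const Scalar& alpha, const Tensor& result) {
  xpu::addmm_kernel(result, self, mat1, mat2, beta, alpha);  // placeholder SYCL kernel launcher
}

// Dispatch-stub path: point an existing ATen stub at an XPU kernel.
REGISTER_XPU_DISPATCH(abs_stub, &xpu::abs_kernel);  // placeholder kernel symbol

}  // namespace at::native
```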

Pull Request resolved: https://github.com/pytorch/pytorch/pull/130082
Approved by: https://github.com/EikanWang, https://github.com/gujinghui, https://github.com/atalman
ghstack dependencies: #130019
Author: Yan Zhiwei
Date: 2024-07-26 07:05:35 +00:00
Committed by: PyTorch MergeBot
Parent: aec8bc5e4c
Commit: fe4f8e97cd
3 changed files with 14 additions and 1 deletion


@@ -1047,6 +1047,7 @@ if(USE_XPU)
# 1. Sources in torch-xpu-ops depend on generated ATen headers.
# 2. Using add_custom_command in torch-xpu-ops to define sycl device sources
# compilation. add_custom_command requires an explicit dependency.
list(APPEND ${Caffe2_XPU_INCLUDE} ${TORCH_XPU_OPS_DIR}/src/ATen/)
set(TORCH_XPU_OPS_PYTORCH_DEPS ATEN_CPU_FILES_GEN_TARGET)
add_subdirectory(${TORCH_ROOT}/third_party/torch-xpu-ops


@@ -62,6 +62,9 @@ def gen_registration_headers(
headers.append("#include <ATen/cuda/EmptyTensor.h>")
elif backend_index.dispatch_key == DispatchKey.MPS:
headers.append("#include <ATen/mps/EmptyTensor.h>")
elif backend_index.dispatch_key == DispatchKey.XPU:
# XPU specific, this header resides in third_party/torch-xpu-ops
headers.append("#include <ATen/xpu/EmptyTensor.h>")
elif per_operator_headers:
headers += [
"#include <ATen/ops/empty.h>",
@@ -87,6 +90,7 @@ def gen_empty_impl_names(
DispatchKey.CPU,
DispatchKey.CUDA,
DispatchKey.MPS,
DispatchKey.XPU,
):
dispatch = str(backend_index.dispatch_key).lower()
empty_impl = f"at::detail::empty_{dispatch}"
@@ -95,6 +99,7 @@ def gen_empty_impl_names(
DispatchKey.CompositeExplicitAutogradNonFunctional,
DispatchKey.QuantizedCPU,
DispatchKey.QuantizedCUDA,
DispatchKey.XPU,
):
empty_impl = "at::empty"
empty_strided_impl = "at::empty_strided"
@@ -639,6 +644,7 @@ if (C10_UNLIKELY(maybe_proxy.has_value())) {
DispatchKey.CPU,
DispatchKey.CUDA,
DispatchKey.MPS,
DispatchKey.XPU,
DispatchKey.CompositeExplicitAutogradNonFunctional,
)
return f"""{maybe_set_guard_line}


@@ -262,7 +262,12 @@ for fk in FUNCTIONALITY_KEYS:
)
STRUCTURED_DISPATCH_KEYS = {DispatchKey.MPS, DispatchKey.CUDA, DispatchKey.CPU}
STRUCTURED_DISPATCH_KEYS = {
DispatchKey.MPS,
DispatchKey.CUDA,
DispatchKey.CPU,
DispatchKey.XPU,
}
UFUNC_DISPATCH_KEYS = {DispatchKey.CUDA, DispatchKey.CPU}
# Set of supported dispatch keys
@@ -273,6 +278,7 @@ dispatch_keys = [
DispatchKey.MkldnnCPU,
DispatchKey.CUDA,
DispatchKey.MPS,
DispatchKey.XPU,
DispatchKey.SparseCUDA,
DispatchKey.SparseCsrCUDA,
DispatchKey.QuantizedCPU,