mirror of
https://github.com/zebrajr/pytorch.git
synced 2025-12-06 12:20:52 +01:00
fix cuDNN SDPA meta registration (#148921)
Update `cuDNN SDPA` meta registration to match the memory layout behavior in: https://github.com/pytorch/pytorch/pull/138354 Pull Request resolved: https://github.com/pytorch/pytorch/pull/148921 Approved by: https://github.com/drisspg, https://github.com/jbschlosser
This commit is contained in:
parent
2a7d583452
commit
ec93aa7f84
|
|
@ -5583,7 +5583,20 @@ def meta__scaled_dot_product_cudnn_attention(
|
|||
S_KV = key.size(2)
|
||||
D_V = value.size(-1)
|
||||
|
||||
res = torch.empty((B, H, S_Q, D_V), dtype=query.dtype, device=query.device)
|
||||
res_shape = (B, H, S_Q, D_V)
|
||||
if tuple(query.shape) == res_shape:
|
||||
query_t = query.transpose(1, 2)
|
||||
res = torch.empty_like(query_t).transpose(1, 2)
|
||||
else:
|
||||
dim_order = sorted(
|
||||
[0, 1, 2, 3], key=lambda idx: query.stride()[idx], reverse=True
|
||||
)
|
||||
permuted_shape = [res_shape[idx] for idx in dim_order]
|
||||
final_permute = [dim_order.index(i) for i in range(len(dim_order))]
|
||||
res = torch.empty(
|
||||
permuted_shape, dtype=query.dtype, device=query.device
|
||||
).permute(final_permute)
|
||||
|
||||
logsum_exp = torch.empty(
|
||||
(B, H, S_Q),
|
||||
dtype=torch.float,
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user