[AMD] TunableOp takes priority over DISABLE_ADDMM_HIP_LT (#124161)

Summary: It is confusing that when both DISABLE_ADDMM_HIP_LT and PYTORCH_TUNABLEOP_ENABLED are set, the former takes priority. This happens because the hipblaslt setting routes addmm through gemm_and_bias, while TunableOp is only integrated with the gemm path. Until TunableOp is integrated with gemm_and_bias, simply let TunableOp take priority.

Test Plan: Ran a simple linear program and verified the behavior.
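As a rough illustration of such a test, here is a minimal libtorch linear program (a sketch only, not the actual test script; it assumes a ROCm build of PyTorch, where HIP devices are exposed under the "cuda" device type, and TunableOp enabled through the environment):

// Build against libtorch, then run with e.g.:
//   PYTORCH_TUNABLEOP_ENABLED=1 ./linear_test
#include <torch/torch.h>
#include <iostream>

int main() {
  // A linear layer's forward pass lowers to addmm, which is the op
  // this change reroutes away from gemm_and_bias when TunableOp is on.
  torch::nn::Linear linear(64, 32);
  linear->to(torch::kCUDA);

  auto x = torch::randn({8, 64}, torch::kCUDA);
  auto y = linear->forward(x);
  std::cout << y.sizes() << std::endl;  // expect [8, 32]
  return 0;
}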

Differential Revision: D56183954

Pull Request resolved: https://github.com/pytorch/pytorch/pull/124161
Approved by: https://github.com/jeffdaily, https://github.com/nmacchioni
Author: Xiaodong Wang
Date: 2024-04-19 19:08:06 +00:00
Committed by: PyTorch MergeBot
parent f87c788a34
commit 661fd23640

@@ -6,6 +6,7 @@
 #include <ATen/OpMathType.h>
 #include <ATen/TensorUtils.h>
 #include <ATen/cuda/CUDABlas.h>
+#include <ATen/cuda/tunable/Tunable.h>
 #include <ATen/native/Resize.h>
 #include <c10/util/MaybeOwned.h>
@@ -174,6 +175,12 @@ cuda::blas::GEMMAndBiasActivationEpilogue activation_to_gemm_and_blas_arg(Activa
 static bool getDisableAddmmCudaLt() {
   static const char* env_value = std::getenv("DISABLE_ADDMM_CUDA_LT");
 #ifdef USE_ROCM
+  // if TunableOp is enabled, it takes priority over the hipblaslt heuristics
+  // note: TunableOp is currently integrated with the gemm path, not the hipblaslt (gemm_and_bias) path
+  auto tuning_ctx = at::cuda::tunable::getTuningContext();
+  if (tuning_ctx->IsTunableOpEnabled()) {
+    return true;
+  }
   // allow both CUDA and HIP env var names for ROCm builds
   // also, the current default for ROCm builds is to disable hipblaslt
   if (env_value == nullptr) {
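The hunk above is truncated, but the resolution order the patch establishes can be summarized in a self-contained sketch (illustrative only: isTunableOpEnabled is a hypothetical stand-in for TuningContext::IsTunableOpEnabled, and the env-var fallback follows the comments in the hunk rather than the exact upstream code):

#include <cstdlib>
#include <cstring>

// Hypothetical stand-in for at::cuda::tunable::getTuningContext()->IsTunableOpEnabled().
static bool isTunableOpEnabled() {
  const char* v = std::getenv("PYTORCH_TUNABLEOP_ENABLED");
  return v != nullptr && std::strcmp(v, "1") == 0;
}

// Decision order for ROCm builds after this patch:
// 1. TunableOp enabled -> report the Lt path as disabled, so addmm takes
//    the gemm path that TunableOp is integrated with.
// 2. Otherwise honor DISABLE_ADDMM_CUDA_LT, falling back to the HIP alias.
// 3. With neither variable set, keep hipblaslt disabled (the ROCm default).
static bool getDisableAddmmLtSketch() {
  if (isTunableOpEnabled()) {
    return true;  // TunableOp takes priority over the hipblaslt heuristics
  }
  const char* v = std::getenv("DISABLE_ADDMM_CUDA_LT");
  if (v == nullptr) {
    v = std::getenv("DISABLE_ADDMM_HIP_LT");  // ROCm builds accept both names
  }
  return v == nullptr || std::strcmp(v, "0") != 0;  // disabled unless set to "0"
}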