From fbd08fb358b643386edd4dd28b9c747aab4ba8c1 Mon Sep 17 00:00:00 2001
From: Pruthvi Madugundu <pmagundu@amd.com>
Date: Fri, 4 Nov 2022 04:43:05 +0000
Subject: [PATCH] Introduce TORCH_DISABLE_GPU_ASSERTS (#84190)

- Asserts for CUDA are enabled by default
- Disabled for ROCm by default by setting `TORCH_DISABLE_GPU_ASSERTS` to `ON`
- Can be enabled for ROCm by setting above variable to`OFF` during build or can be forcefully enabled by setting `ROCM_FORCE_ENABLE_GPU_ASSERTS:BOOL=ON`

This is follow up changes as per comment in PR #81790, comment [link](https://github.com/pytorch/pytorch/pull/81790#issuecomment-1215929021)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/84190
Approved by: https://github.com/jeffdaily, https://github.com/malfet
---
 CMakeLists.txt             |  1 +
 c10/macros/Macros.h        | 11 ++++++-----
 caffe2/core/macros.h.in    |  2 ++
 cmake/Dependencies.cmake   | 10 ++++++++++
 cmake/Summary.cmake        |  1 +
 cmake/public/LoadHIP.cmake | 16 ----------------
 docs/source/notes/hip.rst  | 11 +++++++++++
 7 files changed, 31 insertions(+), 21 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b1e0f517faf..6efd3f2df93 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -285,6 +285,7 @@ if(NOT USE_XNNPACK AND CMAKE_VERSION VERSION_LESS ${XNNPACK_MIN_CMAKE_VER})
 endif()
 option(USE_ZMQ "Use ZMQ" OFF)
 option(USE_ZSTD "Use ZSTD" OFF)
+option(TORCH_DISABLE_GPU_ASSERTS "Disable GPU asserts by default" OFF)
 # Ensure that an ITT build is the default for x86 CPUs
 cmake_dependent_option(
   USE_ITT "Use Intel(R) VTune Profiler ITT functionality" ON
diff --git a/c10/macros/Macros.h b/c10/macros/Macros.h
index e77fa0fde2e..31cd2219d10 100644
--- a/c10/macros/Macros.h
+++ b/c10/macros/Macros.h
@@ -326,9 +326,8 @@ constexpr uint32_t CUDA_THREADS_PER_BLOCK_FALLBACK = 256;
 // CUDA_KERNEL_ASSERT checks the assertion
 // even when NDEBUG is defined. This is useful for important assertions in CUDA
 // code that would otherwise be suppressed when building Release.
-#if defined(__ANDROID__) || defined(__APPLE__) ||  \
-    (defined(USE_ROCM) && ROCM_VERSION < 40100) || \
-    (defined(USE_ROCM) && defined(ROCM_DISABLE_GPU_ASSERTS))
+#if defined(__ANDROID__) || defined(__APPLE__) || \
+    (defined(USE_ROCM) && ROCM_VERSION < 40100)
 // Those platforms do not support assert()
 #define CUDA_KERNEL_ASSERT(cond)
 #define SYCL_KERNEL_ASSERT(cond)
@@ -368,7 +367,9 @@ extern SYCL_EXTERNAL void __assert_fail(
     unsigned int line,
     const char* func);
 #else // __SYCL_DEVICE_ONLY__
-#if (defined(__CUDA_ARCH__) && !(defined(__clang__) && defined(__CUDA__)))
+#if (                                                                       \
+    defined(__CUDA_ARCH__) && !(defined(__clang__) && defined(__CUDA__)) && \
+    !defined(TORCH_DISABLE_GPU_ASSERTS))
 // CUDA supports __assert_fail function which are common for both device
 // and host side code.
 __host__ __device__
@@ -386,7 +387,7 @@ __host__ __device__
         const char* function) throw() __attribute__((__noreturn__));
 
 #if (defined(__HIP_ARCH__) || defined(__HIP__)) && \
-    !defined(ROCM_DISABLE_GPU_ASSERTS)
+    !defined(TORCH_DISABLE_GPU_ASSERTS)
 // ROCm supports __assert_fail only as a device side function.
 __device__ __attribute__((noinline)) __attribute__((weak)) void __assert_fail(
     const char* assertion,
diff --git a/caffe2/core/macros.h.in b/caffe2/core/macros.h.in
index 9c9f7345756..2d9f03e94c0 100644
--- a/caffe2/core/macros.h.in
+++ b/caffe2/core/macros.h.in
@@ -44,6 +44,7 @@ static_assert(
 #cmakedefine CAFFE2_USE_NVTX
 #cmakedefine CAFFE2_USE_ITT
 #cmakedefine CAFFE2_USE_TRT
+#cmakedefine TORCH_DISABLE_GPU_ASSERTS
 
 #ifndef EIGEN_MPL2_ONLY
 #cmakedefine EIGEN_MPL2_ONLY
@@ -85,4 +86,5 @@ static_assert(
   {"USE_NVTX", "${CAFFE2_USE_NVTX}"}, \
   {"USE_ITT", "${CAFFE2_USE_ITT}"}, \
   {"USE_TRT", "${CAFFE2_USE_TRT}"}, \
+  {"TORCH_DISABLE_GPU_ASSERTS", "${TORCH_DISABLE_GPU_ASSERTS}"}, \
 }
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index e232fcb624c..cf3c2c2caaf 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -1248,6 +1248,16 @@ if(ANDROID)
   list(APPEND Caffe2_DEPENDENCY_LIBS log)
 endif()
 
+# ---[ Kernel asserts
+# Kernel asserts are enabled by default for CUDA and disabled for ROCm.
+# For ROCm, it can be enabled by setting ROCM_FORCE_ENABLE_GPU_ASSERTS
+if(USE_ROCM AND ROCM_FORCE_ENABLE_GPU_ASSERTS)
+  message(STATUS "Forcefully enabling kernel asserts on ROCM")
+elseif(USE_ROCM AND NOT ROCM_FORCE_ENABLE_GPU_ASSERTS)
+  message(STATUS "Disabling kernel asserts for ROCm")
+  caffe2_update_option(TORCH_DISABLE_GPU_ASSERTS ON)
+endif()
+
 # ---[ LLVM
 if(USE_LLVM)
   message(STATUS "Looking for LLVM in ${USE_LLVM}")
diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake
index fd6444680e2..279d72a41e6 100644
--- a/cmake/Summary.cmake
+++ b/cmake/Summary.cmake
@@ -199,4 +199,5 @@ function(caffe2_print_configuration_summary)
   # coreml
   message(STATUS "  USE_COREML_DELEGATE     : ${USE_COREML_DELEGATE}")
   message(STATUS "  BUILD_LAZY_TS_BACKEND   : ${BUILD_LAZY_TS_BACKEND}")
+  message(STATUS "  TORCH_DISABLE_GPU_ASSERTS : ${TORCH_DISABLE_GPU_ASSERTS}")
 endfunction()
diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake
index 89a61b62428..b51284115f1 100644
--- a/cmake/public/LoadHIP.cmake
+++ b/cmake/public/LoadHIP.cmake
@@ -143,9 +143,6 @@ message("Building PyTorch for GPU arch: ${PYTORCH_ROCM_ARCH}")
 # Add HIP to the CMAKE Module Path
 set(CMAKE_MODULE_PATH ${HIP_PATH}/cmake ${CMAKE_MODULE_PATH})
 
-#Disable kernel assert due to performance regression
-set(ROCM_ENABLE_KERNEL_ASSERTS FALSE CACHE BOOL "Kernel asserts are disabled by default for ROCm")
-
 macro(find_package_and_print_version PACKAGE_NAME)
   find_package("${PACKAGE_NAME}" ${ARGN})
   message("${PACKAGE_NAME} VERSION: ${${PACKAGE_NAME}_VERSION}")
@@ -286,19 +283,6 @@ if(HIP_FOUND)
   find_package_and_print_version(hipcub REQUIRED)
   find_package_and_print_version(rocthrust REQUIRED)
 
-  if(ROCM_VERSION_DEV VERSION_GREATER_EQUAL "4.1.0")
-    if(ROCM_ENABLE_KERNEL_ASSERTS)
-      message("ROCm version >= 4.1; enabling asserts")
-    else()
-      add_definitions(-DROCM_DISABLE_GPU_ASSERTS)
-      message("ROCm version >= 4.1; kernel asserts are disabled")
-    endif()
-  else()
-    # Disable Asserts In Code (Can't use asserts on HIP stack.)
-    add_definitions(-DNDEBUG)
-    message("ROCm version < 4.1; disablng asserts")
-  endif()
-
   if(HIP_COMPILER STREQUAL clang)
     set(hip_library_name amdhip64)
   else()
diff --git a/docs/source/notes/hip.rst b/docs/source/notes/hip.rst
index a9c94e2a4fe..c54e2014897 100644
--- a/docs/source/notes/hip.rst
+++ b/docs/source/notes/hip.rst
@@ -144,3 +144,14 @@ Refer to CUDA Semantics doc
 ---------------------------
 
 For any sections not listed here, please refer to the CUDA semantics doc: :ref:`cuda-semantics`
+
+
+Enabling kernel asserts
+-----------------------
+
+Kernel asserts are supported on ROCm, but they are disabled due to performance overhead. It can be enabled
+by recompiling the PyTorch from source.
+
+Please add below line as an argument to cmake command parameters::
+
+    -DROCM_FORCE_ENABLE_GPU_ASSERTS:BOOL=ON