Pass Werror to CUDA host compiler (#130213)

Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/130213
Approved by: https://github.com/ezyang
cyy 2024-09-21 08:01:06 +00:00 committed by PyTorch MergeBot
parent e18439113e
commit c459430558
2 changed files with 14 additions and 6 deletions


@@ -55,7 +55,7 @@ void gemm_grouped_cuda_internal(
     const std::vector<scalar_t*>& bptr,
     const std::vector<scalar_t*>& dptr,
     const std::vector<cutlass::gemm::GemmCoord>& gemm_sizes,
-    const int problem_count,
+    const int64_t problem_count,
     at::Device& device) {
   using Element = scalar_t;
   using ElementAcc = float;
@@ -183,7 +183,7 @@ bool group_gemm_dispatch(
     const std::vector<int64_t>& lda,
     const std::vector<int64_t>& ldb,
     const std::vector<int64_t>& ldd,
-    std::vector<cutlass::gemm::GemmCoord> gemm_sizes,
+    const std::vector<cutlass::gemm::GemmCoord>& gemm_sizes,
     int64_t ntensors) {
   return false;
 }
@@ -197,7 +197,7 @@ bool group_gemm_dispatch(
     const std::vector<int64_t>& lda,
     const std::vector<int64_t>& ldb,
     const std::vector<int64_t>& ldd,
-    std::vector<cutlass::gemm::GemmCoord> gemm_sizes,
+    const std::vector<cutlass::gemm::GemmCoord>& gemm_sizes,
     int64_t ntensors) {
   gemm_grouped_cuda_internal<
@@ -223,7 +223,7 @@ bool group_gemm_dispatch(
     const std::vector<int64_t>& lda,
     const std::vector<int64_t>& ldb,
     const std::vector<int64_t>& ldd,
-    std::vector<cutlass::gemm::GemmCoord> gemm_sizes,
+    const std::vector<cutlass::gemm::GemmCoord>& gemm_sizes,
     int64_t ntensors) {
   // Check alignment
@@ -357,8 +357,7 @@ Tensor bmm_nested_cuda(const Tensor& self, const Tensor& mat2) {
       const int64_t &self_size1 = self_shape[1];
       const int64_t &mat2_size0 = mat2_shape[0];
       const int64_t &mat2_size1 = mat2_shape[1];
-      gemm_sizes.push_back(
-          cutlass::gemm::GemmCoord(self_size0, mat2_size1, self_size1));
+      gemm_sizes.emplace_back(self_size0, mat2_size1, self_size1);
       aptr[i] = self_buffer.data_ptr<scalar_t>() + get_offset_for_index(self, i);
       bptr[i] = mat2_buffer.data_ptr<scalar_t>() + get_offset_for_index(mat2, i);
       dptr[i] = out_buffer.data_ptr<scalar_t>() + out_offsets_ptr[i];
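
For context only, and not part of this diff: a minimal standalone C++ sketch of the pattern the hunks above move toward, with the coordinate list built via emplace_back, passed by const reference, and the problem count carried as int64_t. GemmCoord and run_grouped_gemm below are hypothetical stand-ins, not the CUTLASS or ATen definitions.

#include <cstdint>
#include <vector>

// Hypothetical stand-in for cutlass::gemm::GemmCoord (m, n, k extents).
struct GemmCoord {
  int m;
  int n;
  int k;
  GemmCoord(int m_, int n_, int k_) : m(m_), n(n_), k(k_) {}
};

// Mirrors the post-change shape of the dispatch helpers: the size list is
// taken by const reference (no per-call copy of the vector) and the problem
// count is an int64_t, matching the int64_t counts used by the caller.
void run_grouped_gemm(
    const std::vector<GemmCoord>& gemm_sizes,
    const std::int64_t problem_count) {
  // ... configure and launch one GEMM per problem here ...
  (void)gemm_sizes;
  (void)problem_count;
}

int main() {
  std::vector<GemmCoord> gemm_sizes;
  // emplace_back constructs the coordinate in place, replacing the
  // push_back(GemmCoord(...)) pattern removed above.
  gemm_sizes.emplace_back(128, 64, 256);
  gemm_sizes.emplace_back(32, 128, 64);
  run_grouped_gemm(gemm_sizes, static_cast<std::int64_t>(gemm_sizes.size()));
  return 0;
}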


@@ -1380,6 +1380,15 @@ if(NOT INTERN_BUILD_MOBILE)
     # we want to respect the standard, and we are bored of those **** .
     add_definitions(-D_CRT_SECURE_NO_DEPRECATE=1)
     string(APPEND CMAKE_CUDA_FLAGS " -Xcompiler=/wd4819,/wd4503,/wd4190,/wd4244,/wd4251,/wd4275,/wd4522")
+  else()
+    if(WERROR)
+      if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU" AND ${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER_EQUAL 13)
+        string(APPEND CMAKE_CUDA_FLAGS " -Xcompiler -Wno-dangling-reference ")
+      endif()
+      if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU" OR ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" AND ${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER_EQUAL 13))
+        string(APPEND CMAKE_CUDA_FLAGS " -Xcompiler -Werror -Xcompiler -Wno-error=sign-compare ")
+      endif()
+    endif()
   endif()
   string(APPEND CMAKE_CUDA_FLAGS " -Wno-deprecated-gpu-targets --expt-extended-lambda")
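
A rough illustration of what the new flags mean in practice, assuming the build already enables warnings such as -Wall/-Wextra: host code in a .cu translation unit is compiled by the host compiler (GCC or Clang), which now receives -Werror through -Xcompiler, so its warnings fail the build; -Wno-error=sign-compare keeps signed/unsigned comparisons as plain warnings, and GCC 13's -Wdangling-reference is switched off before -Werror is added. The snippet below is hypothetical and not from the PyTorch sources.

#include <vector>

// Host code in a .cu translation unit is compiled by the host compiler
// (gcc or clang), which now sees -Werror forwarded via -Xcompiler.
int count_below(const std::vector<int>& values, int limit) {
  int count = 0;
  for (int i = 0; i < values.size(); ++i) {
    // int vs. size_type comparison: emits -Wsign-compare, which stays a
    // warning rather than an error thanks to -Xcompiler -Wno-error=sign-compare.
    if (values[i] < limit) {
      ++count;
    }
  }
  return count;
}

int main() {
  std::vector<int> values{1, 5, 9};
  // Any other enabled host-compiler warning in this file would now stop the build.
  return count_below(values, 6) == 2 ? 0 : 1;
}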