[ROCm] missing AT_CUDA_CHECK for cub and SoftMax (#149883)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/149883
Approved by: https://github.com/jeffdaily, https://github.com/Skylion007
Ethan Wee 2025-03-25 23:22:28 +00:00 committed by PyTorch MergeBot
parent de73790fe6
commit 36eb64d60e
4 changed files with 13 additions and 22 deletions
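Context for the changes below: every call site touched by this commit returns a cudaError_t (or its HIP equivalent under ROCm) that was previously discarded. AT_CUDA_CHECK converts a non-success return code into a thrown error at the failing call, instead of letting the failure surface later at an unrelated cudaGetLastError(). A minimal stand-in for the idiom, assuming only the CUDA runtime API (PyTorch's actual macro routes through C10_CUDA_CHECK and attaches richer device context):

#include <cuda_runtime.h>
#include <stdexcept>
#include <string>

// Simplified stand-in for AT_CUDA_CHECK: evaluate the expression once,
// then throw if the returned error code is not cudaSuccess.
#define MY_CUDA_CHECK(expr)                                      \
  do {                                                           \
    cudaError_t err_ = (expr);                                   \
    if (err_ != cudaSuccess) {                                   \
      throw std::runtime_error(std::string("CUDA error: ") +    \
                               cudaGetErrorString(err_));        \
    }                                                            \
  } while (0)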

View File

@@ -37,11 +37,10 @@
 // handle the temporary storage and 'twice' calls for cub API
 #define CUB_WRAPPER(func, ...) do {                                       \
   size_t temp_storage_bytes = 0;                                          \
-  func(nullptr, temp_storage_bytes, __VA_ARGS__);                         \
+  AT_CUDA_CHECK(func(nullptr, temp_storage_bytes, __VA_ARGS__));          \
   auto& caching_allocator = *::c10::cuda::CUDACachingAllocator::get();    \
   auto temp_storage = caching_allocator.allocate(temp_storage_bytes);     \
-  func(temp_storage.get(), temp_storage_bytes, __VA_ARGS__);              \
-  AT_CUDA_CHECK(cudaGetLastError());                                      \
+  AT_CUDA_CHECK(func(temp_storage.get(), temp_storage_bytes, __VA_ARGS__));\
 } while (false)
 
 #ifdef USE_ROCM
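The macro above exists because cub device-wide algorithms are deliberately called twice: a first call with a null temp-storage pointer only computes the required scratch size, and the second call does the work. Both calls report failure only through their return value, which is why each now gets its own AT_CUDA_CHECK. A self-contained sketch of that two-pass convention, with illustrative buffer names and a local CUDA_OK macro standing in for AT_CUDA_CHECK:

#include <cub/cub.cuh>
#include <cstdio>
#include <cstdlib>

// Local stand-in for AT_CUDA_CHECK so the sketch builds outside PyTorch.
#define CUDA_OK(expr)                                         \
  do {                                                        \
    cudaError_t err_ = (expr);                                \
    if (err_ != cudaSuccess) {                                \
      std::fprintf(stderr, "CUDA error: %s\n",                \
                   cudaGetErrorString(err_));                 \
      std::exit(EXIT_FAILURE);                                \
    }                                                         \
  } while (0)

int main() {
  constexpr int kN = 1024;  // illustrative element count
  int *d_in = nullptr, *d_sum = nullptr;
  CUDA_OK(cudaMalloc(&d_in, kN * sizeof(int)));
  CUDA_OK(cudaMalloc(&d_sum, sizeof(int)));
  CUDA_OK(cudaMemset(d_in, 0, kN * sizeof(int)));

  // Pass 1: a null temp-storage pointer makes cub report the scratch size.
  void* d_temp = nullptr;
  size_t temp_bytes = 0;
  CUDA_OK(cub::DeviceReduce::Sum(d_temp, temp_bytes, d_in, d_sum, kN));

  // Pass 2: run the reduction with real scratch space; this call can also
  // fail, which is what the CUB_WRAPPER change above now catches.
  CUDA_OK(cudaMalloc(&d_temp, temp_bytes));
  CUDA_OK(cub::DeviceReduce::Sum(d_temp, temp_bytes, d_in, d_sum, kN));

  CUDA_OK(cudaFree(d_temp));
  CUDA_OK(cudaFree(d_sum));
  CUDA_OK(cudaFree(d_in));
  return 0;
}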

View File

@@ -187,21 +187,21 @@ void nonzero_cuda_out_impl(const Tensor& self, Tensor& out) {
       cub::TransformInputIterator<bool, NonZeroOp<scalar_t>, const scalar_t*> itr(
           self_.const_data_ptr<scalar_t>() + idx * chunk_size,
           NonZeroOp<scalar_t>());
-      cub::DeviceReduce::Sum(
+      AT_CUDA_CHECK(cub::DeviceReduce::Sum(
           nullptr,
           temp_storage_bytes,
           itr,
           ((int*)num_nonzeros.get()) + idx,
           remaining,
-          stream);
+          stream));
       auto temp_storage = allocator.allocate(temp_storage_bytes);
-      cub::DeviceReduce::Sum(
+      AT_CUDA_CHECK(cub::DeviceReduce::Sum(
           temp_storage.get(),
           temp_storage_bytes,
           itr,
           ((int*)num_nonzeros.get()) + idx,
           remaining,
-          stream);
+          stream));
     }
     auto pinned_num_nonzeros_h = at::detail::empty_cpu(
         {num_chunks}, /* size */
@@ -248,7 +248,7 @@ void nonzero_cuda_out_impl(const Tensor& self, Tensor& out) {
           itr(self_.const_data_ptr<scalar_t>() + idx * chunk_size,
               NonZeroOp<scalar_t>());
       temp_storage_bytes = 0;
-      cub::DeviceSelect::Flagged(
+      AT_CUDA_CHECK(cub::DeviceSelect::Flagged(
           nullptr,
           temp_storage_bytes,
           counting_itr,
@@ -256,9 +256,9 @@ void nonzero_cuda_out_impl(const Tensor& self, Tensor& out) {
           out_temp.mutable_data_ptr<int64_t>(),
           ((int*)num_nonzeros.get()) + idx,
           remaining,
-          stream);
+          stream));
       auto temp_storage = allocator.allocate(temp_storage_bytes);
-      cub::DeviceSelect::Flagged(
+      AT_CUDA_CHECK(cub::DeviceSelect::Flagged(
           temp_storage.get(),
           temp_storage_bytes,
           counting_itr,
@@ -266,7 +266,7 @@ void nonzero_cuda_out_impl(const Tensor& self, Tensor& out) {
           out_temp.mutable_data_ptr<int64_t>() + curr_nonzeros,
           ((int*)num_nonzeros.get()) + idx,
           remaining,
-          stream);
+          stream));
       curr_nonzeros +=
           (int)*(pinned_num_nonzeros_h.const_data_ptr<int>() + idx);
     }
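For readers unfamiliar with this file: the loop above compacts the indices of nonzero elements chunk by chunk, feeding DeviceSelect::Flagged a counting iterator as the data stream and a transform iterator as the selection flags. A hedged standalone sketch of that pattern (function and variable names are illustrative, and errors are returned rather than thrown to keep it short):

#include <cub/cub.cuh>
#include <cuda_runtime.h>

// Predicate: treat any nonzero input value as selected.
struct IsNonZero {
  __host__ __device__ bool operator()(int v) const { return v != 0; }
};

// Compact the indices of nonzero entries of d_vals into d_out_idx and write
// the selected count to d_num_selected. Returns the first failing error code.
cudaError_t compact_nonzero_indices(const int* d_vals, int64_t* d_out_idx,
                                    int* d_num_selected, int n,
                                    cudaStream_t stream) {
  cub::CountingInputIterator<int64_t> counting_itr(0);
  cub::TransformInputIterator<bool, IsNonZero, const int*> flags_itr(
      d_vals, IsNonZero());

  // Pass 1: with a null temp pointer, cub only reports the scratch size.
  void* d_temp = nullptr;
  size_t temp_bytes = 0;
  cudaError_t err = cub::DeviceSelect::Flagged(
      d_temp, temp_bytes, counting_itr, flags_itr,
      d_out_idx, d_num_selected, n, stream);
  if (err != cudaSuccess) return err;

  err = cudaMalloc(&d_temp, temp_bytes);
  if (err != cudaSuccess) return err;

  // Pass 2: the actual selection; its return code matters just as much.
  err = cub::DeviceSelect::Flagged(
      d_temp, temp_bytes, counting_itr, flags_itr,
      d_out_idx, d_num_selected, n, stream);
  cudaFree(d_temp);
  return err;
}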

View File

@@ -56,7 +56,7 @@ bool allContiguous(at::TensorList tensors) {
 void getLaunchConfig(dim3* block, dim3* grid, int64_t numel) {
   c10::DeviceIndex curDevice = -1;
-  c10::cuda::GetDevice(&curDevice);
+  AT_CUDA_CHECK(c10::cuda::GetDevice(&curDevice));
   *block = cuda::getApplyBlock();
   TORCH_INTERNAL_ASSERT(cuda::getApplyGrid(numel, *grid, curDevice),
                         "Could not get grid size for pointwise apply.");

View File

@@ -154,16 +154,8 @@ void SpatialSoftMax_getLaunchSizes(
   uint32_t block_threads = block.x * block.y;
   smem_size = block.x == 1 ? 0 : block_threads * sizeof(accscalar_t);
   int max_active_blocks;
-#if defined(USE_ROCM) && TORCH_HIP_VERSION < 305
-  // HIP function signature is not compatible yet.
-  uint32_t max_blocks;
-  cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks,
-                                                k, block_threads, smem_size);
-  max_active_blocks = max_blocks;
-#else
-  cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks,
-                                                k, block_threads, smem_size);
-#endif
+  AT_CUDA_CHECK(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks,
+                                                              k, block_threads, smem_size));
   max_active_blocks *= at::cuda::getCurrentDeviceProperties()->multiProcessorCount;
   grid = SpatialSoftMax_getGridSize(block, max_active_blocks, outer_size, inner_size);
 }
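The deleted branch worked around an old HIP signature mismatch (judging from the removed code, builds with TORCH_HIP_VERSION < 305 took the first parameter as uint32_t*); with that gone, a single checked call covers both back ends. A minimal sketch of deriving a grid size from occupancy the way the function above does (the kernel and device index are illustrative):

#include <cuda_runtime.h>
#include <cstdio>

// Illustrative kernel; any __global__ function works as the occupancy target.
__global__ void dummy_kernel(float* data) {
  if (data) data[threadIdx.x] = 0.f;
}

// Returns the maximum number of resident blocks across the device for the
// given launch shape, or -1 on error.
int occupancy_grid_limit(int block_threads, size_t smem_size) {
  int max_active_blocks = 0;
  cudaError_t err = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
      &max_active_blocks, dummy_kernel, block_threads, smem_size);
  if (err != cudaSuccess) {
    std::fprintf(stderr, "occupancy query failed: %s\n",
                 cudaGetErrorString(err));
    return -1;
  }
  cudaDeviceProp prop;
  if (cudaGetDeviceProperties(&prop, /*device=*/0) != cudaSuccess) return -1;
  return max_active_blocks * prop.multiProcessorCount;
}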