[ROCm] missing AT_CUDA_CHECK for cub and SoftMax (#149883)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/149883
Approved by: https://github.com/jeffdaily, https://github.com/Skylion007
Ethan Wee 2025-03-25 23:22:28 +00:00 committed by PyTorch MergeBot
parent de73790fe6
commit 36eb64d60e
4 changed files with 13 additions and 22 deletions
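Context for the changes below: every call site touched by this commit returns a cudaError_t (or its HIP equivalent under ROCm) that was previously discarded. AT_CUDA_CHECK converts a non-success return code into a thrown error at the failing call, instead of letting the failure surface later at an unrelated cudaGetLastError(). A minimal stand-in for the idiom, assuming only the CUDA runtime API (PyTorch's actual macro routes through C10_CUDA_CHECK and attaches richer device context):

#include <cuda_runtime.h>
#include <stdexcept>
#include <string>

// Simplified stand-in for AT_CUDA_CHECK: evaluate the expression once,
// then throw if the returned error code is not cudaSuccess.
#define MY_CUDA_CHECK(expr)                                      \
  do {                                                           \
    cudaError_t err_ = (expr);                                   \
    if (err_ != cudaSuccess) {                                   \
      throw std::runtime_error(std::string("CUDA error: ") +    \
                               cudaGetErrorString(err_));        \
    }                                                            \
  } while (0)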

View File

@@ -37,11 +37,10 @@
 // handle the temporary storage and 'twice' calls for cub API
 #define CUB_WRAPPER(func, ...) do {                                       \
   size_t temp_storage_bytes = 0;                                          \
-  func(nullptr, temp_storage_bytes, __VA_ARGS__);                         \
+  AT_CUDA_CHECK(func(nullptr, temp_storage_bytes, __VA_ARGS__));          \
   auto& caching_allocator = *::c10::cuda::CUDACachingAllocator::get();    \
   auto temp_storage = caching_allocator.allocate(temp_storage_bytes);     \
-  func(temp_storage.get(), temp_storage_bytes, __VA_ARGS__);              \
-  AT_CUDA_CHECK(cudaGetLastError());                                      \
+  AT_CUDA_CHECK(func(temp_storage.get(), temp_storage_bytes, __VA_ARGS__));\
 } while (false)
 
 #ifdef USE_ROCM
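The macro above exists because cub device-wide algorithms are deliberately called twice: a first call with a null temp-storage pointer only computes the required scratch size, and the second call does the work. Both calls report failure only through their return value, which is why each now gets its own AT_CUDA_CHECK. A self-contained sketch of that two-pass convention, with illustrative buffer names and a local CUDA_OK macro standing in for AT_CUDA_CHECK:

#include <cub/cub.cuh>
#include <cstdio>
#include <cstdlib>

// Local stand-in for AT_CUDA_CHECK so the sketch builds outside PyTorch.
#define CUDA_OK(expr)                                         \
  do {                                                        \
    cudaError_t err_ = (expr);                                \
    if (err_ != cudaSuccess) {                                \
      std::fprintf(stderr, "CUDA error: %s\n",                \
                   cudaGetErrorString(err_));                 \
      std::exit(EXIT_FAILURE);                                \
    }                                                         \
  } while (0)

int main() {
  constexpr int kN = 1024;  // illustrative element count
  int *d_in = nullptr, *d_sum = nullptr;
  CUDA_OK(cudaMalloc(&d_in, kN * sizeof(int)));
  CUDA_OK(cudaMalloc(&d_sum, sizeof(int)));
  CUDA_OK(cudaMemset(d_in, 0, kN * sizeof(int)));

  // Pass 1: a null temp-storage pointer makes cub report the scratch size.
  void* d_temp = nullptr;
  size_t temp_bytes = 0;
  CUDA_OK(cub::DeviceReduce::Sum(d_temp, temp_bytes, d_in, d_sum, kN));

  // Pass 2: run the reduction with real scratch space; this call can also
  // fail, which is what the CUB_WRAPPER change above now catches.
  CUDA_OK(cudaMalloc(&d_temp, temp_bytes));
  CUDA_OK(cub::DeviceReduce::Sum(d_temp, temp_bytes, d_in, d_sum, kN));

  CUDA_OK(cudaFree(d_temp));
  CUDA_OK(cudaFree(d_sum));
  CUDA_OK(cudaFree(d_in));
  return 0;
}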

View File

@@ -187,21 +187,21 @@ void nonzero_cuda_out_impl(const Tensor& self, Tensor& out) {
       cub::TransformInputIterator<bool, NonZeroOp<scalar_t>, const scalar_t*> itr(
           self_.const_data_ptr<scalar_t>() + idx * chunk_size,
           NonZeroOp<scalar_t>());
-      cub::DeviceReduce::Sum(
+      AT_CUDA_CHECK(cub::DeviceReduce::Sum(
           nullptr,
           temp_storage_bytes,
           itr,
           ((int*)num_nonzeros.get()) + idx,
           remaining,
-          stream);
+          stream));
       auto temp_storage = allocator.allocate(temp_storage_bytes);
-      cub::DeviceReduce::Sum(
+      AT_CUDA_CHECK(cub::DeviceReduce::Sum(
           temp_storage.get(),
           temp_storage_bytes,
           itr,
           ((int*)num_nonzeros.get()) + idx,
           remaining,
-          stream);
+          stream));
     }
     auto pinned_num_nonzeros_h = at::detail::empty_cpu(
         {num_chunks}, /* size */
@@ -248,7 +248,7 @@ void nonzero_cuda_out_impl(const Tensor& self, Tensor& out) {
           itr(self_.const_data_ptr<scalar_t>() + idx * chunk_size,
               NonZeroOp<scalar_t>());
       temp_storage_bytes = 0;
-      cub::DeviceSelect::Flagged(
+      AT_CUDA_CHECK(cub::DeviceSelect::Flagged(
           nullptr,
           temp_storage_bytes,
           counting_itr,
@@ -256,9 +256,9 @@ void nonzero_cuda_out_impl(const Tensor& self, Tensor& out) {
           out_temp.mutable_data_ptr<int64_t>(),
           ((int*)num_nonzeros.get()) + idx,
           remaining,
-          stream);
+          stream));
       auto temp_storage = allocator.allocate(temp_storage_bytes);
-      cub::DeviceSelect::Flagged(
+      AT_CUDA_CHECK(cub::DeviceSelect::Flagged(
           temp_storage.get(),
           temp_storage_bytes,
           counting_itr,
@@ -266,7 +266,7 @@ void nonzero_cuda_out_impl(const Tensor& self, Tensor& out) {
           out_temp.mutable_data_ptr<int64_t>() + curr_nonzeros,
           ((int*)num_nonzeros.get()) + idx,
           remaining,
-          stream);
+          stream));
       curr_nonzeros +=
           (int)*(pinned_num_nonzeros_h.const_data_ptr<int>() + idx);
     }
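For readers unfamiliar with this file: the loop above compacts the indices of nonzero elements chunk by chunk, feeding DeviceSelect::Flagged a counting iterator as the data stream and a transform iterator as the selection flags. A hedged standalone sketch of that pattern (function and variable names are illustrative, and errors are returned rather than thrown to keep it short):

#include <cub/cub.cuh>
#include <cuda_runtime.h>

// Predicate: treat any nonzero input value as selected.
struct IsNonZero {
  __host__ __device__ bool operator()(int v) const { return v != 0; }
};

// Compact the indices of nonzero entries of d_vals into d_out_idx and write
// the selected count to d_num_selected. Returns the first failing error code.
cudaError_t compact_nonzero_indices(const int* d_vals, int64_t* d_out_idx,
                                    int* d_num_selected, int n,
                                    cudaStream_t stream) {
  cub::CountingInputIterator<int64_t> counting_itr(0);
  cub::TransformInputIterator<bool, IsNonZero, const int*> flags_itr(
      d_vals, IsNonZero());

  // Pass 1: with a null temp pointer, cub only reports the scratch size.
  void* d_temp = nullptr;
  size_t temp_bytes = 0;
  cudaError_t err = cub::DeviceSelect::Flagged(
      d_temp, temp_bytes, counting_itr, flags_itr,
      d_out_idx, d_num_selected, n, stream);
  if (err != cudaSuccess) return err;

  err = cudaMalloc(&d_temp, temp_bytes);
  if (err != cudaSuccess) return err;

  // Pass 2: the actual selection; its return code matters just as much.
  err = cub::DeviceSelect::Flagged(
      d_temp, temp_bytes, counting_itr, flags_itr,
      d_out_idx, d_num_selected, n, stream);
  cudaFree(d_temp);
  return err;
}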

View File

@@ -56,7 +56,7 @@ bool allContiguous(at::TensorList tensors) {
 void getLaunchConfig(dim3* block, dim3* grid, int64_t numel) {
   c10::DeviceIndex curDevice = -1;
-  c10::cuda::GetDevice(&curDevice);
+  AT_CUDA_CHECK(c10::cuda::GetDevice(&curDevice));
   *block = cuda::getApplyBlock();
   TORCH_INTERNAL_ASSERT(cuda::getApplyGrid(numel, *grid, curDevice),
                         "Could not get grid size for pointwise apply.");

View File

@@ -154,16 +154,8 @@ void SpatialSoftMax_getLaunchSizes(
   uint32_t block_threads = block.x * block.y;
   smem_size = block.x == 1 ? 0 : block_threads * sizeof(accscalar_t);
   int max_active_blocks;
-#if defined(USE_ROCM) && TORCH_HIP_VERSION < 305
-  // HIP function signature is not compatible yet.
-  uint32_t max_blocks;
-  cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks,
-                                                k, block_threads, smem_size);
-  max_active_blocks = max_blocks;
-#else
-  cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks,
-                                                k, block_threads, smem_size);
-#endif
+  AT_CUDA_CHECK(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks,
+                                                              k, block_threads, smem_size));
   max_active_blocks *= at::cuda::getCurrentDeviceProperties()->multiProcessorCount;
   grid = SpatialSoftMax_getGridSize(block, max_active_blocks, outer_size, inner_size);
 }
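The deleted branch worked around an old HIP signature mismatch (judging from the removed code, builds with TORCH_HIP_VERSION < 305 took the first parameter as uint32_t*); with that gone, a single checked call covers both back ends. A minimal sketch of deriving a grid size from occupancy the way the function above does (the kernel and device index are illustrative):

#include <cuda_runtime.h>
#include <cstdio>

// Illustrative kernel; any __global__ function works as the occupancy target.
__global__ void dummy_kernel(float* data) {
  if (data) data[threadIdx.x] = 0.f;
}

// Returns the maximum number of resident blocks across the device for the
// given launch shape, or -1 on error.
int occupancy_grid_limit(int block_threads, size_t smem_size) {
  int max_active_blocks = 0;
  cudaError_t err = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
      &max_active_blocks, dummy_kernel, block_threads, smem_size);
  if (err != cudaSuccess) {
    std::fprintf(stderr, "occupancy query failed: %s\n",
                 cudaGetErrorString(err));
    return -1;
  }
  cudaDeviceProp prop;
  if (cudaGetDeviceProperties(&prop, /*device=*/0) != cudaSuccess) return -1;
  return max_active_blocks * prop.multiProcessorCount;
}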