[ROCm] missing AT_CUDA_CHECK for cub and SoftMax (#149883)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/149883
Approved by: https://github.com/jeffdaily, https://github.com/Skylion007

Parent: de73790fe6
Commit: 36eb64d60e
@@ -37,11 +37,10 @@
 // handle the temporary storage and 'twice' calls for cub API
 #define CUB_WRAPPER(func, ...) do {                                          \
   size_t temp_storage_bytes = 0;                                             \
-  func(nullptr, temp_storage_bytes, __VA_ARGS__);                            \
+  AT_CUDA_CHECK(func(nullptr, temp_storage_bytes, __VA_ARGS__));             \
   auto& caching_allocator = *::c10::cuda::CUDACachingAllocator::get();       \
   auto temp_storage = caching_allocator.allocate(temp_storage_bytes);        \
-  func(temp_storage.get(), temp_storage_bytes, __VA_ARGS__);                 \
-  AT_CUDA_CHECK(cudaGetLastError());                                         \
+  AT_CUDA_CHECK(func(temp_storage.get(), temp_storage_bytes, __VA_ARGS__));  \
 } while (false)

 #ifdef USE_ROCM
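For context, cub's device-wide algorithms follow a two-phase calling convention: the first call, made with a null temp-storage pointer, only computes temp_storage_bytes; the second call with real storage does the work. Both calls return a cudaError_t, so checking each return value directly (as the macro now does) is stricter than the old pattern of probing cudaGetLastError() after an unchecked call. A minimal standalone sketch of the pattern, using plain cudaMalloc in place of PyTorch's caching allocator and a hypothetical CHECK macro standing in for AT_CUDA_CHECK:

#include <cub/cub.cuh>
#include <cstdio>
#include <cstdlib>

// Hypothetical stand-in for AT_CUDA_CHECK: abort on any CUDA error.
#define CHECK(expr)                                        \
  do {                                                     \
    cudaError_t err_ = (expr);                             \
    if (err_ != cudaSuccess) {                             \
      std::fprintf(stderr, "CUDA error: %s\n",             \
                   cudaGetErrorString(err_));              \
      std::abort();                                        \
    }                                                      \
  } while (false)

// Sum n floats on the device using cub's two-phase convention.
void sum_with_cub(const float* d_in, float* d_out, int n) {
  size_t temp_storage_bytes = 0;
  // Phase 1: null temp storage -> cub only reports the bytes it needs.
  CHECK(cub::DeviceReduce::Sum(nullptr, temp_storage_bytes, d_in, d_out, n));
  void* d_temp = nullptr;
  CHECK(cudaMalloc(&d_temp, temp_storage_bytes));
  // Phase 2: the same call with real storage performs the reduction.
  CHECK(cub::DeviceReduce::Sum(d_temp, temp_storage_bytes, d_in, d_out, n));
  CHECK(cudaFree(d_temp));
}

This is exactly the shape CUB_WRAPPER abstracts; checking both invocations means a failure in the sizing call is no longer silently ignored.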
@@ -187,21 +187,21 @@ void nonzero_cuda_out_impl(const Tensor& self, Tensor& out) {
     cub::TransformInputIterator<bool, NonZeroOp<scalar_t>, const scalar_t*> itr(
         self_.const_data_ptr<scalar_t>() + idx * chunk_size,
         NonZeroOp<scalar_t>());
-    cub::DeviceReduce::Sum(
+    AT_CUDA_CHECK(cub::DeviceReduce::Sum(
         nullptr,
         temp_storage_bytes,
         itr,
         ((int*)num_nonzeros.get()) + idx,
         remaining,
-        stream);
+        stream));
     auto temp_storage = allocator.allocate(temp_storage_bytes);
-    cub::DeviceReduce::Sum(
+    AT_CUDA_CHECK(cub::DeviceReduce::Sum(
         temp_storage.get(),
         temp_storage_bytes,
         itr,
         ((int*)num_nonzeros.get()) + idx,
         remaining,
-        stream);
+        stream));
   }
   auto pinned_num_nonzeros_h = at::detail::empty_cpu(
       {num_chunks}, /* size */
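The reduction above counts a chunk's nonzero elements without materializing a boolean mask: cub::TransformInputIterator applies NonZeroOp on the fly as cub streams through the input, and DeviceReduce::Sum accumulates the resulting bools into an int slot. A self-contained sketch of the same idea, with a hypothetical IsNonZero functor and an assert-based CHECK in place of AT_CUDA_CHECK:

#include <cub/cub.cuh>
#include <cassert>

// Sketch-only error check; PyTorch uses AT_CUDA_CHECK instead.
#define CHECK(expr) do { cudaError_t e_ = (expr); assert(e_ == cudaSuccess); } while (0)

// Hypothetical functor mirroring NonZeroOp: maps each element to a bool.
struct IsNonZero {
  __host__ __device__ bool operator()(float x) const { return x != 0.0f; }
};

// Count the nonzero elements of d_in; d_count is a device-side int slot.
// Both cub calls return cudaError_t, which is what the diff now checks.
int count_nonzeros(const float* d_in, int* d_count, int n, cudaStream_t stream) {
  cub::TransformInputIterator<bool, IsNonZero, const float*> itr(d_in, IsNonZero());
  size_t temp_storage_bytes = 0;
  CHECK(cub::DeviceReduce::Sum(nullptr, temp_storage_bytes, itr, d_count, n, stream));
  void* d_temp = nullptr;
  CHECK(cudaMalloc(&d_temp, temp_storage_bytes));
  CHECK(cub::DeviceReduce::Sum(d_temp, temp_storage_bytes, itr, d_count, n, stream));
  CHECK(cudaFree(d_temp));
  int count = 0;
  CHECK(cudaMemcpyAsync(&count, d_count, sizeof(int), cudaMemcpyDeviceToHost, stream));
  CHECK(cudaStreamSynchronize(stream));
  return count;
}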
@@ -248,7 +248,7 @@ void nonzero_cuda_out_impl(const Tensor& self, Tensor& out) {
         itr(self_.const_data_ptr<scalar_t>() + idx * chunk_size,
             NonZeroOp<scalar_t>());
     temp_storage_bytes = 0;
-    cub::DeviceSelect::Flagged(
+    AT_CUDA_CHECK(cub::DeviceSelect::Flagged(
         nullptr,
         temp_storage_bytes,
         counting_itr,
@@ -256,9 +256,9 @@ void nonzero_cuda_out_impl(const Tensor& self, Tensor& out) {
         out_temp.mutable_data_ptr<int64_t>(),
         ((int*)num_nonzeros.get()) + idx,
         remaining,
-        stream);
+        stream));
     auto temp_storage = allocator.allocate(temp_storage_bytes);
-    cub::DeviceSelect::Flagged(
+    AT_CUDA_CHECK(cub::DeviceSelect::Flagged(
         temp_storage.get(),
         temp_storage_bytes,
         counting_itr,
@@ -266,7 +266,7 @@ void nonzero_cuda_out_impl(const Tensor& self, Tensor& out) {
         out_temp.mutable_data_ptr<int64_t>() + curr_nonzeros,
         ((int*)num_nonzeros.get()) + idx,
         remaining,
-        stream);
+        stream));
     curr_nonzeros +=
         (int)*(pinned_num_nonzeros_h.const_data_ptr<int>() + idx);
   }
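The Flagged selection pairs a counting iterator (candidate indices) with the boolean transform iterator as per-element flags, so only the indices of nonzero elements are written out, and the number selected lands in num_nonzeros. A standalone sketch under the same assumptions as above (hypothetical IsNonZero and CHECK):

#include <cub/cub.cuh>
#include <cassert>

#define CHECK(expr) do { cudaError_t e_ = (expr); assert(e_ == cudaSuccess); } while (0)

struct IsNonZero {
  __host__ __device__ bool operator()(float x) const { return x != 0.0f; }
};

// Write the indices of d_in's nonzero elements to d_out_indices; the number
// of selected indices is written to *d_num_selected (a device-side int).
void nonzero_indices(const float* d_in, long long* d_out_indices,
                     int* d_num_selected, int n, cudaStream_t stream) {
  cub::CountingInputIterator<long long> counting_itr(0);
  cub::TransformInputIterator<bool, IsNonZero, const float*> flags(d_in, IsNonZero());
  size_t temp_storage_bytes = 0;
  CHECK(cub::DeviceSelect::Flagged(nullptr, temp_storage_bytes, counting_itr,
                                   flags, d_out_indices, d_num_selected, n, stream));
  void* d_temp = nullptr;
  CHECK(cudaMalloc(&d_temp, temp_storage_bytes));
  CHECK(cub::DeviceSelect::Flagged(d_temp, temp_storage_bytes, counting_itr,
                                   flags, d_out_indices, d_num_selected, n, stream));
  CHECK(cudaFree(d_temp));
}

In the patched code this runs per chunk, with curr_nonzeros offsetting the output pointer so each chunk's indices land after the previous chunk's.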
@@ -56,7 +56,7 @@ bool allContiguous(at::TensorList tensors) {

 void getLaunchConfig(dim3* block, dim3* grid, int64_t numel) {
   c10::DeviceIndex curDevice = -1;
-  c10::cuda::GetDevice(&curDevice);
+  AT_CUDA_CHECK(c10::cuda::GetDevice(&curDevice));
   *block = cuda::getApplyBlock();
   TORCH_INTERNAL_ASSERT(cuda::getApplyGrid(numel, *grid, curDevice),
                         "Could not get grid size for pointwise apply.");
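c10::cuda::GetDevice, like the raw cudaGetDevice it wraps, reports failure through its return value, so dropping that return leaves curDevice silently at -1 on error. A rough sketch of a checked launch-config helper built directly on the runtime API; the fixed 256-thread block and 1-D grid policy here is an illustrative assumption, not PyTorch's getApplyBlock/getApplyGrid logic:

#include <cuda_runtime.h>
#include <cassert>

#define CHECK(expr) do { cudaError_t e_ = (expr); assert(e_ == cudaSuccess); } while (0)

// Checked launch-config helper (assumed policy, see note above).
void get_launch_config(dim3* block, dim3* grid, long long numel) {
  int cur_device = -1;
  CHECK(cudaGetDevice(&cur_device));  // the query the diff now wraps via c10
  cudaDeviceProp prop;
  CHECK(cudaGetDeviceProperties(&prop, cur_device));
  *block = dim3(256);
  long long blocks = (numel + 255) / 256;
  assert(blocks <= prop.maxGridSize[0] && "could not get grid size for pointwise apply");
  *grid = dim3(static_cast<unsigned int>(blocks));
}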
@@ -154,16 +154,8 @@ void SpatialSoftMax_getLaunchSizes(
   uint32_t block_threads = block.x * block.y;
   smem_size = block.x == 1 ? 0 : block_threads * sizeof(accscalar_t);
   int max_active_blocks;
-#if defined(USE_ROCM) && TORCH_HIP_VERSION < 305
-  // HIP function signature is not compatible yet.
-  uint32_t max_blocks;
-  cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks,
-                                                k, block_threads, smem_size);
-  max_active_blocks = max_blocks;
-#else
-  cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks,
-                                                k, block_threads, smem_size);
-#endif
+  AT_CUDA_CHECK(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks,
+                                                              k, block_threads, smem_size));
   max_active_blocks *= at::cuda::getCurrentDeviceProperties()->multiProcessorCount;
   grid = SpatialSoftMax_getGridSize(block, max_active_blocks, outer_size, inner_size);
 }
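The removed ROCm branch existed because old HIP versions (TORCH_HIP_VERSION < 305) declared the occupancy query with a uint32_t out-parameter; with that workaround gone, both platforms share one call whose cudaError_t result is now checked. The query itself sizes the grid by how many blocks of the kernel fit per SM, scaled by the SM count. A brief sketch of that pattern, with a placeholder kernel and CHECK as above:

#include <cuda_runtime.h>
#include <cassert>

#define CHECK(expr) do { cudaError_t e_ = (expr); assert(e_ == cudaSuccess); } while (0)

// Placeholder kernel; only its resource usage matters for the occupancy query.
__global__ void softmax_like_kernel(const float* in, float* out, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = in[i];
}

// How many blocks of block_threads threads (with smem_size bytes of dynamic
// shared memory) can be resident across the whole device at once.
int max_resident_blocks(int block_threads, size_t smem_size) {
  int per_sm = 0;
  CHECK(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
      &per_sm, softmax_like_kernel, block_threads, smem_size));
  int dev = 0;
  CHECK(cudaGetDevice(&dev));
  cudaDeviceProp prop;
  CHECK(cudaGetDeviceProperties(&prop, dev));
  return per_sm * prop.multiProcessorCount;
}

The device-wide total is then used to cap the grid, as SpatialSoftMax_getGridSize does in the hunk above.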