diff --git a/aten/src/ATen/native/cuda/CuFFTPlanCache.h b/aten/src/ATen/native/cuda/CuFFTPlanCache.h
index fdc95e54600..37c8f3a364f 100644
--- a/aten/src/ATen/native/cuda/CuFFTPlanCache.h
+++ b/aten/src/ATen/native/cuda/CuFFTPlanCache.h
@@ -90,8 +90,13 @@ public:
             IntList output_sizes) {

     // signal sizes
+#ifdef __HIP_PLATFORM_HCC__
+    std::vector<int> signal_sizes(checked_signal_sizes.begin(),
+                                  checked_signal_sizes.end());
+#else
     std::vector<long long int> signal_sizes(checked_signal_sizes.begin(),
                                             checked_signal_sizes.end());
+#endif

     // input batch size
     long long int batch = input.size(0);
@@ -149,7 +154,11 @@ public:
     // TODO: Figure out why windows fails to compile
     //   at::optional<std::vector<long long int>> inembed_opt = at::nullopt;
     // Then move the following to a helper function.
+#ifdef __HIP_PLATFORM_HCC__
+    std::vector<int> inembed(signal_ndim);
+#else
     std::vector<long long int> inembed(signal_ndim);
+#endif
     if (!clone_input) {
       auto istrides = input.strides();
       auto last_istride = istrides[signal_ndim];
@@ -192,6 +201,37 @@ public:
                 inembed.begin());             // begin of output
     }

+#ifdef __HIP_PLATFORM_HCC__
+
+    hipfftType exec_type;
+    if (input.type().scalarType() == ScalarType::Float) {
+      if (complex_input && complex_output) {
+        exec_type = HIPFFT_C2C;
+      } else if (complex_input && !complex_output) {
+        exec_type = HIPFFT_C2R;
+      } else if (!complex_input && complex_output) {
+        exec_type = HIPFFT_R2C;
+      } else {
+        throw std::runtime_error("hipFFT doesn't support r2r (float)");
+      }
+    } else if (input.type().scalarType() == ScalarType::Double) {
+      if (complex_input && complex_output) {
+        exec_type = HIPFFT_Z2Z;
+      } else if (complex_input && !complex_output) {
+        exec_type = HIPFFT_Z2D;
+      } else if (!complex_input && complex_output) {
+        exec_type = HIPFFT_D2Z;
+      } else {
+        throw std::runtime_error("hipFFT doesn't support r2r (double)");
+      }
+    } else {
+      std::ostringstream ss;
+      ss << "hipFFT doesn't support tensor of type: "
+         << at::toString(input.type().scalarType());
+      throw std::runtime_error(ss.str());
+    }
+
+#else
     cudaDataType itype, otype, exec_type;
     if (input.type().scalarType() == ScalarType::Float) {
       itype = complex_input ? CUDA_C_32F : CUDA_R_32F;
@@ -211,6 +251,7 @@ public:
          << at::toString(input.type().scalarType());
       throw std::runtime_error(ss.str());
     }
+#endif

     // create plan
     auto raw_plan_ptr = new cufftHandle();
@@ -229,10 +270,17 @@ public:
       //   by assuming base_istride = base_ostride = 1.
       //
       // See NOTE [ cuFFT Embedded Strides ] in native/cuda/SpectralOps.cu.
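+      //
+      // hipfftMakePlanMany takes int-typed sizes/embeddings and a single hipfftType,
+      // whereas cufftXtMakePlanMany takes long long int sizes plus separate
+      // input/output/execution cudaDataTypes; that is why the signal_sizes/inembed
+      // vectors above and the two plan calls below are #ifdef'd per platform.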
+#ifdef __HIP_PLATFORM_HCC__
+      CUFFT_CHECK(hipfftMakePlanMany(plan(), signal_ndim, signal_sizes.data(),
+        /* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1,
+        /* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1,
+        exec_type, batch, &ws_size_t));
+#else
       CUFFT_CHECK(cufftXtMakePlanMany(plan(), signal_ndim, signal_sizes.data(),
         /* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1, itype,
         /* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1, otype,
         batch, &ws_size_t, exec_type));
+#endif
     } else {
       // set idist (stride at batch dim)
       // set base_istride (stride at innermost dim of signal)
@@ -254,6 +302,18 @@ public:
       }

       // set odist, onembed, base_ostride
+#ifdef __HIP_PLATFORM_HCC__
+      int odist = at::prod_intlist(output_sizes.slice(1, signal_ndim));
+      std::vector<int> onembed(output_sizes.data() + 1, output_sizes.data() + signal_ndim + 1);
+      int base_ostride = 1;
+
+      int istride = base_istride;
+      int iidist = idist;
+      CUFFT_CHECK(hipfftMakePlanMany(plan(), signal_ndim, signal_sizes.data(),
+        inembed.data(), istride, iidist,
+        onembed.data(), base_ostride, odist,
+        exec_type, batch, &ws_size_t));
+#else
       long long int odist = at::prod_intlist(output_sizes.slice(1, signal_ndim));
       std::vector<long long int> onembed(output_sizes.data() + 1, output_sizes.data() + signal_ndim + 1);
       long long int base_ostride = 1;
@@ -262,11 +322,16 @@ public:
         inembed.data(), base_istride, idist, itype,
         onembed.data(), base_ostride, odist, otype,
         batch, &ws_size_t, exec_type));
-    }
+#endif
+    }
     ws_size = static_cast<int64_t>(ws_size_t);
   }

+#ifdef __HIP_PLATFORM_HCC__
+  cufftHandle &plan() const { return *plan_ptr.get(); }
+#else
   const cufftHandle &plan() const { return *plan_ptr.get(); }
+#endif

   bool should_clone_input() const { return clone_input; }

diff --git a/aten/src/ATen/native/cuda/CuFFTUtils.h b/aten/src/ATen/native/cuda/CuFFTUtils.h
index 5edfcbc1354..8046602eb59 100644
--- a/aten/src/ATen/native/cuda/CuFFTUtils.h
+++ b/aten/src/ATen/native/cuda/CuFFTUtils.h
@@ -49,8 +49,10 @@ static inline std::string _cudaGetErrorEnum(cufftResult error)
       return "CUFFT_NO_WORKSPACE";
     case CUFFT_NOT_IMPLEMENTED:
       return "CUFFT_NOT_IMPLEMENTED";
+#ifndef __HIP_PLATFORM_HCC__
     case CUFFT_LICENSE_ERROR:
       return "CUFFT_LICENSE_ERROR";
+#endif
     case CUFFT_NOT_SUPPORTED:
       return "CUFFT_NOT_SUPPORTED";
     default:
diff --git a/aten/src/ATen/native/cuda/SpectralOps.cu b/aten/src/ATen/native/cuda/SpectralOps.cu
index 11c648fd6ae..00057d403dc 100644
--- a/aten/src/ATen/native/cuda/SpectralOps.cu
+++ b/aten/src/ATen/native/cuda/SpectralOps.cu
@@ -189,8 +189,45 @@ static inline Tensor _run_cufft(
   CUFFT_CHECK(cufftSetWorkArea(plan, ws.data_ptr()));

   // run
+#ifdef __HIP_PLATFORM_HCC__
+  if (input.type().scalarType() == ScalarType::Float) {
+    if (complex_input && complex_output) {
+      CUFFT_CHECK(hipfftExecC2C(plan, static_cast<hipfftComplex*>(input.data_ptr()),
+        static_cast<hipfftComplex*>(output.data_ptr()),
+        inverse ? HIPFFT_BACKWARD : HIPFFT_FORWARD));
+    } else if (complex_input && !complex_output) {
+      CUFFT_CHECK(hipfftExecC2R(plan, static_cast<hipfftComplex*>(input.data_ptr()),
+        static_cast<hipfftReal*>(output.data_ptr())));
+    } else if (!complex_input && complex_output) {
+      CUFFT_CHECK(hipfftExecR2C(plan, static_cast<hipfftReal*>(input.data_ptr()),
+        static_cast<hipfftComplex*>(output.data_ptr())));
+    } else {
+      throw std::runtime_error("hipFFT doesn't support r2r (float)");
+    }
+  } else if (input.type().scalarType() == ScalarType::Double) {
+    if (complex_input && complex_output) {
+      CUFFT_CHECK(hipfftExecZ2Z(plan, static_cast<hipfftDoubleComplex*>(input.data_ptr()),
+        static_cast<hipfftDoubleComplex*>(output.data_ptr()),
+        inverse ? HIPFFT_BACKWARD : HIPFFT_FORWARD));
+    } else if (complex_input && !complex_output) {
+      CUFFT_CHECK(hipfftExecZ2D(plan, static_cast<hipfftDoubleComplex*>(input.data_ptr()),
+        static_cast<hipfftDoubleReal*>(output.data_ptr())));
+    } else if (!complex_input && complex_output) {
+      CUFFT_CHECK(hipfftExecD2Z(plan, static_cast<hipfftDoubleReal*>(input.data_ptr()),
+        static_cast<hipfftDoubleComplex*>(output.data_ptr())));
+    } else {
+      throw std::runtime_error("hipFFT doesn't support r2r (double)");
+    }
+  } else {
+    std::ostringstream ss;
+    ss << "hipFFT doesn't support tensor of type: "
+       << at::toString(input.type().scalarType());
+    throw std::runtime_error(ss.str());
+  }
+#else
   CUFFT_CHECK(cufftXtExec(plan, input.data_ptr(), output.data_ptr(),
     inverse ? CUFFT_INVERSE : CUFFT_FORWARD));
+#endif

   // rescale if needed by normalized flag or inverse transform
   auto size_last_signal_dim = checked_signal_sizes[signal_ndim - 1];
diff --git a/aten/src/THC/THCAtomics.cuh b/aten/src/THC/THCAtomics.cuh
index 485b5744f15..8fec96dde6f 100644
--- a/aten/src/THC/THCAtomics.cuh
+++ b/aten/src/THC/THCAtomics.cuh
@@ -138,8 +138,10 @@ static inline __device__ void atomicAdd(double *address, double val) {
   } while (assumed != old);
 }
 #elif !defined(__CUDA_ARCH__) && (CUDA_VERSION < 8000) || defined(__HIP_PLATFORM_HCC__)
+#if defined(__HIP_PLATFORM_HCC__) && __hcc_workweek__ < 18312
 // This needs to be defined for the host side pass
 static inline __device__ void atomicAdd(double *address, double val) { }
 #endif
+#endif

 #endif // THC_ATOMICS_INC
diff --git a/aten/src/THC/THCScanUtils.cuh b/aten/src/THC/THCScanUtils.cuh
index ef7c297f41c..d5542383560 100644
--- a/aten/src/THC/THCScanUtils.cuh
+++ b/aten/src/THC/THCScanUtils.cuh
@@ -4,6 +4,12 @@
 #include "THCAsmUtils.cuh"
 #include "THCDeviceUtils.cuh"

+#if defined(__HIP_PLATFORM_HCC__)
+#define SCAN_UTILS_WARP_SIZE 64
+#else
+#define SCAN_UTILS_WARP_SIZE 32
+#endif
+
 // Collection of in-kernel scan / prefix sum utilities

 // Inclusive Scan via an upsweep/downsweep mechanism. Assumes:
@@ -157,7 +163,7 @@ __device__ void inclusiveBinaryPrefixScan(T* smem, bool in, T* out, BinaryFuncti
   T index = __popc(getLaneMaskLe() & vote);
   T carry = __popc(vote);

-  int warp = threadIdx.x / 32;
+  int warp = threadIdx.x / SCAN_UTILS_WARP_SIZE;

   // Per each warp, write out a value
   if (getLaneId() == 0) {
@@ -170,7 +176,7 @@ __device__ void inclusiveBinaryPrefixScan(T* smem, bool in, T* out, BinaryFuncti
   // warp shuffle scan for CC 3.0+
   if (threadIdx.x == 0) {
     int current = 0;
-    for (int i = 0; i < blockDim.x / 32; ++i) {
+    for (int i = 0; i < blockDim.x / SCAN_UTILS_WARP_SIZE; ++i) {
       T v = smem[i];
       smem[i] = binop(smem[i], current);
       current = binop(current, v);
@@ -201,11 +207,13 @@ __device__ void exclusiveBinaryPrefixScan(T* smem, bool in, T* out, T* carry, Bi
   *out -= (T) in;

   // The outgoing carry for all threads is the last warp's sum
-  *carry = smem[(blockDim.x / 32) - 1];
+  *carry = smem[(blockDim.x / SCAN_UTILS_WARP_SIZE) - 1];

   if (KillWARDependency) {
     __syncthreads();
   }
 }

+#undef SCAN_UTILS_WARP_SIZE
+
 #endif // THC_SCAN_UTILS_INC
diff --git a/aten/src/THC/THCTensorTopK.cuh b/aten/src/THC/THCTensorTopK.cuh
index 2ddff119c64..4f7a6b8c697 100644
--- a/aten/src/THC/THCTensorTopK.cuh
+++ b/aten/src/THC/THCTensorTopK.cuh
@@ -213,7 +213,11 @@ __device__ DataType findPattern(DataType* smem,
                                 IndexType withinSliceStride,
                                 BitDataType desired,
                                 BitDataType desiredMask) {
+#ifdef __HIP_PLATFORM_HCC__
+  if (threadIdx.x < 64) {
+#else
   if (threadIdx.x < 32) {
+#endif
     smem[threadIdx.x] = ScalarConvert<int, DataType>::to(0);
   }
   __syncthreads();
@@ -366,7 +370,11 @@ __global__ void gatherTopK(TensorInfo<T, IndexType> input,
                            IndexType indicesWithinSliceStride) {
   // Indices are limited to integer fp precision, so counts can fit in
   // int32, regardless of IndexType
+#ifdef __HIP_PLATFORM_HCC__
+  __shared__ int smem[64];
+#else
   __shared__ int smem[32]; // one per each warp, up to warp limit
+#endif

   IndexType slice = getLinearBlockId<IndexType>();
   if (slice >= numInputSlices) {
diff --git a/aten/src/THC/generic/THCTensorTopK.cu b/aten/src/THC/generic/THCTensorTopK.cu
index 284b9331230..9fdb13f1469 100644
--- a/aten/src/THC/generic/THCTensorTopK.cu
+++ b/aten/src/THC/generic/THCTensorTopK.cu
@@ -29,21 +29,24 @@ THC_API void THCTensor_(topk)(THCState* state,
   THCTensor_(resize)(state, topK, topKSize, {});
   THCudaLongTensor_resize(state, indices, topKSize, {});

+  // static_cast is required to ensure that the correct type (INDEX_T)
+  // is provided to the kernel for the arguments.
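+  // (The host-side sizes and strides are 64-bit values, whereas the kernel takes
+  // its INDEX_T template parameter, so the casts make the conversion explicit.)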
+
 #define RUN_K(INDEX_T, DIM, DIR)                                        \
   gatherTopK<real, INDEX_T, DIM, DIR>                                   \
     <<<grid, block, 0, THCState_getCurrentStream(state)>>>(             \
       inputInfo,                                                        \
-      sliceSize,                                                        \
-      k,                                                                \
-      inputSlices,                                                      \
+      static_cast<INDEX_T>(sliceSize),                                  \
+      static_cast<INDEX_T>(k),                                          \
+      static_cast<INDEX_T>(inputSlices),                                \
       /* The actual dimension that the k-selection is running in */     \
       /* may have changed from collapseDims() */                        \
-      inputInfo.strides[collapseInputDim],                              \
+      static_cast<INDEX_T>(inputInfo.strides[collapseInputDim]),        \
       topKInfo,                                                         \
-      topKSlices,                                                       \
-      topKInfo.strides[collapseTopKDim],                                \
+      static_cast<INDEX_T>(topKSlices),                                 \
+      static_cast<INDEX_T>(topKInfo.strides[collapseTopKDim]),          \
       indicesInfo,                                                      \
-      indicesInfo.strides[collapseIndicesDim])
+      static_cast<INDEX_T>(indicesInfo.strides[collapseIndicesDim]))

 #define RUN_DIR(INDEX_T, DIM)                                           \
   if (dir) {                                                            \
@@ -63,6 +66,12 @@ THC_API void THCTensor_(topk)(THCState* state,
     RUN_DIR(INDEX_T, -1);                                               \
   }

+#ifdef __HIP_PLATFORM_HCC__
+#define TOPK_WARP_SIZE 64
+#else
+#define TOPK_WARP_SIZE 32
+#endif
+
 #define RUN_T(INDEX_T)                                                  \
   TensorInfo<real, INDEX_T> inputInfo =                                 \
     getTensorInfo<real, THCTensor, INDEX_T>(state, input);              \
@@ -96,7 +105,7 @@ THC_API void THCTensor_(topk)(THCState* state,
     THError("Slice to sort is too large");                              \
   }                                                                     \
                                                                         \
-  dim3 block(std::min(THCRoundUp(sliceSize, (int64_t) 32), (int64_t) 1024)); \
+  dim3 block(std::min(THCRoundUp(sliceSize, (int64_t) TOPK_WARP_SIZE), (int64_t) 1024)); \
                                                                         \
   /* This is used as a template parameter to calculate indices. */      \
   /* We only specialize it if all collapsed dim sizes are the */        \
@@ -124,6 +133,7 @@ THC_API void THCTensor_(topk)(THCState* state,
 #undef RUN_DIM
 #undef RUN_DIR
 #undef RUN_K
+#undef TOPK_WARP_SIZE

   // Sort the results if the user wants them sorted, since our
   // selection routine does not ensure sorting
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index 51858c802fe..7cf91fad7a9 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -558,6 +558,7 @@ endif()
 if(USE_ROCM)
   include_directories(SYSTEM ${HIP_PATH}/include)
   include_directories(SYSTEM ${ROCBLAS_PATH}/include)
+  include_directories(SYSTEM ${ROCFFT_PATH}/include)
   include_directories(SYSTEM ${HIPSPARSE_PATH}/include)
   include_directories(SYSTEM ${HIPRAND_PATH}/include)
   include_directories(SYSTEM ${ROCRAND_PATH}/include)
diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake
index 037c92d2fe3..72e6dd67a7b 100644
--- a/cmake/public/LoadHIP.cmake
+++ b/cmake/public/LoadHIP.cmake
@@ -38,6 +38,13 @@ ELSE()
   SET(ROCBLAS_PATH $ENV{ROCBLAS_PATH})
 ENDIF()

+# ROCFFT_PATH
+IF(NOT DEFINED ENV{ROCFFT_PATH})
+  SET(ROCFFT_PATH ${ROCM_PATH}/rocfft)
+ELSE()
+  SET(ROCFFT_PATH $ENV{ROCFFT_PATH})
+ENDIF()
+
 # HIPSPARSE_PATH
 IF(NOT DEFINED ENV{HIPSPARSE_PATH})
   SET(HIPSPARSE_PATH ${ROCM_PATH}/hcsparse)
@@ -106,11 +113,13 @@ IF(HIP_FOUND)
   set(rocblas_DIR ${ROCBLAS_PATH}/lib/cmake/rocblas)
   set(miopen_DIR ${MIOPEN_PATH}/lib/cmake/miopen)
   set(rocblas_DIR ${ROCBLAS_PATH}/lib/cmake/rocblas)
+  set(rocfft_DIR ${ROCFFT_PATH}/lib/cmake/rocfft)
   set(hipsparse_DIR ${HIPSPARSE_PATH}/lib/cmake/hipsparse)

   find_package(rocrand REQUIRED)
   find_package(hiprand REQUIRED)
   find_package(rocblas REQUIRED)
+  find_package(rocfft REQUIRED)
   find_package(miopen REQUIRED)
   #find_package(hipsparse REQUIRED)
diff --git a/docker/caffe2/jenkins/common/install_rocm.sh b/docker/caffe2/jenkins/common/install_rocm.sh
index e93a5bc9557..120db9d7452 100644
--- a/docker/caffe2/jenkins/common/install_rocm.sh
+++ b/docker/caffe2/jenkins/common/install_rocm.sh
@@ -5,6 +5,7 @@ set -ex
 install_ubuntu() {
   apt-get update
   apt-get install -y wget
+  apt-get install -y libopenblas-dev
   DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/debian
   # Add rocm repository
@@
-63,6 +64,15 @@ install_rocrand() { dpkg -i /opt/rocm/debians/rocrand.deb } +# Install rocSPARSE/hipSPARSE that will be released soon - can co-exist w/ hcSPARSE which will be removed soon +install_hipsparse() { + mkdir -p /opt/rocm/debians + curl https://s3.amazonaws.com/ossci-linux/rocsparse-0.1.1.0.deb -o /opt/rocm/debians/rocsparse.deb + curl https://s3.amazonaws.com/ossci-linux/hipsparse-0.1.1.0.deb -o /opt/rocm/debians/hipsparse.deb + dpkg -i /opt/rocm/debians/rocsparse.deb + dpkg -i /opt/rocm/debians/hipsparse.deb +} + # Install Python packages depending on the base OS if [ -f /etc/lsb-release ]; then install_ubuntu @@ -76,3 +86,4 @@ fi install_hip_thrust install_rocrand install_hcsparse +install_hipsparse diff --git a/setup.py b/setup.py index f560ee28b81..75f1c33208c 100644 --- a/setup.py +++ b/setup.py @@ -929,6 +929,7 @@ if USE_ROCM: rocm_include_path = '/opt/rocm/include' hcc_include_path = '/opt/rocm/hcc/include' rocblas_include_path = '/opt/rocm/rocblas/include' + rocfft_include_path = '/opt/rocm/rocfft/include' hipsparse_include_path = '/opt/rocm/hcsparse/include' hiprand_include_path = '/opt/rocm/hiprand/include' rocrand_include_path = '/opt/rocm/rocrand/include' @@ -937,6 +938,7 @@ if USE_ROCM: include_dirs.append(rocm_include_path) include_dirs.append(hcc_include_path) include_dirs.append(rocblas_include_path) + include_dirs.append(rocfft_include_path) include_dirs.append(hipsparse_include_path) include_dirs.append(hiprand_include_path) include_dirs.append(rocrand_include_path) diff --git a/test/common_cuda.py b/test/common_cuda.py index 14554962728..60c28b2818c 100644 --- a/test/common_cuda.py +++ b/test/common_cuda.py @@ -2,12 +2,14 @@ r"""This file is allowed to initialize CUDA context when imported.""" import torch import torch.cuda +from common import TEST_WITH_ROCM TEST_CUDA = torch.cuda.is_available() TEST_MULTIGPU = TEST_CUDA and torch.cuda.device_count() >= 2 CUDA_DEVICE = TEST_CUDA and torch.device("cuda:0") -TEST_CUDNN = TEST_CUDA and torch.backends.cudnn.is_acceptable(torch.tensor(1., device=CUDA_DEVICE)) +# note: if ROCm is targeted, TEST_CUDNN is code for TEST_MIOPEN +TEST_CUDNN = TEST_CUDA and (TEST_WITH_ROCM or torch.backends.cudnn.is_acceptable(torch.tensor(1., device=CUDA_DEVICE))) TEST_CUDNN_VERSION = TEST_CUDNN and torch.backends.cudnn.version() diff --git a/test/common_nn.py b/test/common_nn.py index 0444ba4eb6a..f159fe65967 100644 --- a/test/common_nn.py +++ b/test/common_nn.py @@ -7,7 +7,7 @@ from itertools import product import torch import torch.cuda from torch.nn.functional import _Reduction -from common import TestCase, to_gpu, freeze_rng_state, is_iterable +from common import TestCase, to_gpu, freeze_rng_state, is_iterable, TEST_WITH_ROCM from common_cuda import TEST_CUDA from torch.autograd.gradcheck import get_numerical_jacobian, iter_tensors import torch.backends.cudnn @@ -40,7 +40,8 @@ module_tests = [ module_name='Linear', constructor_args=(10, 8), input_size=(4, 10), - reference_fn=lambda i, p: torch.mm(i, p[0].t()) + p[1].view(1, -1).expand(4, 8) + reference_fn=lambda i, p: torch.mm(i, p[0].t()) + p[1].view(1, -1).expand(4, 8), + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='Linear', @@ -102,17 +103,20 @@ module_tests = [ constructor_args=(1,), input_size=(10, 20), reference_fn=lambda i, _: torch.exp(i).div(torch.exp(i).sum(1, True).expand(10, 20)), + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='Softmax2d', input_size=(1, 3, 10, 20), reference_fn=lambda i, _: torch.exp(i).div(torch.exp(i).sum(1, False)), + test_cuda=(not 
TEST_WITH_ROCM) ), dict( module_name='LogSoftmax', constructor_args=(1,), input_size=(10, 20), reference_fn=lambda i, _: torch.exp(i).div_(torch.exp(i).sum(1, True).expand(10, 20)).log_(), + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='LogSoftmax', @@ -120,12 +124,14 @@ module_tests = [ input_size=(1, 3, 10, 20), reference_fn=lambda i, _: torch.exp(i).div_(torch.exp(i).sum(1, False)).log_(), desc='multiparam', + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='ELU', constructor_args=(2.,), input_size=(3, 2, 5), - reference_fn=lambda x, _: torch.where(x >= 0, x, 2 * (x.exp() - 1)) + reference_fn=lambda x, _: torch.where(x >= 0, x, 2 * (x.exp() - 1)), + test_cuda=(not TEST_WITH_ROCM), ), # TODO: reference function dict( @@ -198,6 +204,7 @@ module_tests = [ input_size=(2, 3, 4), desc='1d_multiparam', reference_fn=lambda i, p: torch.clamp(i, min=0) + torch.clamp(i, max=0) * p[0][0], + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='PReLU', @@ -211,6 +218,7 @@ module_tests = [ input_size=(2, 3, 4, 5), desc='2d_multiparam', reference_fn=lambda i, p: torch.clamp(i, min=0) + torch.clamp(i, max=0) * p[0][0], + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='PReLU', @@ -224,26 +232,31 @@ module_tests = [ input_size=(2, 3, 4, 5, 6), desc='3d_multiparam', reference_fn=lambda i, p: torch.clamp(i, min=0) + torch.clamp(i, max=0) * p[0][0], + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='Softsign', input_size=(3, 2, 5), reference_fn=lambda i, _: i.div(1 + torch.abs(i)), + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='Softmin', constructor_args=(1,), input_size=(10, 20), + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='Softmin', constructor_args=(1,), input_size=(2, 3, 5, 10), desc='multidim', + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='Tanhshrink', - input_size=(2, 3, 4, 5) + input_size=(2, 3, 4, 5), + test_cuda=(not TEST_WITH_ROCM) ), ] @@ -560,6 +573,7 @@ criterion_tests = [ reference_fn=lambda i, t, m: kldivloss_reference(i, t, get_reduction(m)), check_sum_reduction=True, + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='MSELoss', @@ -576,6 +590,7 @@ criterion_tests = [ reference_fn=lambda i, t, m: -(t * i.log() + (1 - t) * (1 - i).log()).sum() / (i.numel() if get_reduction(m) else 1), check_gradgrad=False, + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='BCELoss', @@ -586,6 +601,7 @@ criterion_tests = [ (i.numel() if get_reduction(m) else 1), desc='weights', check_gradgrad=False, + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='CrossEntropyLoss', @@ -606,6 +622,7 @@ criterion_tests = [ reference_fn=lambda i, t, m: hingeembeddingloss_reference(i, t, reduction=get_reduction(m)), check_sum_reduction=True, + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='HingeEmbeddingLoss', @@ -616,6 +633,7 @@ criterion_tests = [ hingeembeddingloss_reference(i, t, margin=0.5, reduction=get_reduction(m)), desc='margin', check_sum_reduction=True, + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='MultiLabelMarginLoss', @@ -642,6 +660,7 @@ criterion_tests = [ target_fn=lambda: torch.rand(5, 10).mul(2).floor(), reference_fn=lambda i, t, m: -(t * i.sigmoid().log() + (1 - t) * (-i).sigmoid().log()).sum() / i.numel(), check_gradgrad=False, + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='MultiMarginLoss', @@ -720,6 +739,7 @@ criterion_tests = [ reference_fn=lambda i, t, m: cosineembeddingloss_reference(i[0], i[1], t, reduction=get_reduction(m)), check_sum_reduction=True, + test_cuda=(not TEST_WITH_ROCM) ), dict( 
module_name='CosineEmbeddingLoss', @@ -730,6 +750,7 @@ criterion_tests = [ cosineembeddingloss_reference(i[0], i[1], t, margin=0.7, reduction=get_reduction(m)), desc='margin', check_sum_reduction=True, + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='MarginRankingLoss', @@ -738,6 +759,7 @@ criterion_tests = [ reference_fn=lambda i, t, m: marginrankingloss_reference(i[0], i[1], t, reduction=get_reduction(m)), check_sum_reduction=True, + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='MarginRankingLoss', @@ -748,6 +770,7 @@ criterion_tests = [ marginrankingloss_reference(i[0], i[1], t, margin=0.5, reduction=get_reduction(m)), desc='margin', check_sum_reduction=True, + test_cuda=(not TEST_WITH_ROCM) ), ] diff --git a/test/run_test.py b/test/run_test.py index 8fd32b7e75c..71b96e78bc9 100644 --- a/test/run_test.py +++ b/test/run_test.py @@ -45,14 +45,10 @@ WINDOWS_BLACKLIST = [ ROCM_BLACKLIST = [ 'c10d', 'cpp_extensions', - 'cuda', 'distributed', 'distributions', - 'jit', - 'legacy_nn', 'multiprocessing', 'nccl', - 'nn', 'thd_distributed', 'utils', ] diff --git a/test/test_cuda.py b/test/test_cuda.py index badcc687945..27e21b8345e 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -16,7 +16,7 @@ from torch._six import inf, nan from test_torch import TestTorch from common import TestCase, get_gpu_type, to_gpu, freeze_rng_state, run_tests, \ - PY3, IS_WINDOWS, NO_MULTIPROCESSING_SPAWN + PY3, IS_WINDOWS, NO_MULTIPROCESSING_SPAWN, skipIfRocm, TEST_WITH_ROCM # We cannot import TEST_CUDA and TEST_MULTIGPU from common_cuda here, # because if we do that, the TEST_CUDNN line from common_cuda will be executed @@ -248,45 +248,62 @@ def new_t(*sizes): # - disable inplace test, if set to True, no inplace test will be done (default=False) # - decorator, e.g., unittest.skipIf (default is no decorator) tests = [ - ('add', small_3d, lambda t: [number(3.14, 3, t)]), + ('add', small_3d, lambda t: [number(3.14, 3, t)], '', types, False, + "skipIfRocm:ByteTensor,CharTensor,HalfTensor,ShortTensor"), ('add', small_3d, lambda t: [small_3d_positive(t)], 'tensor'), ('add', small_3d, lambda t: [number(0.2, 2, t), small_3d_positive(t)], 'scalar_tensor'), - ('sub', small_3d, lambda t: [number(3.14, 3, t)],), + ('sub', small_3d, lambda t: [number(3.14, 3, t)], '', types, False, + "skipIfRocm:ByteTensor,CharTensor,HalfTensor,ShortTensor"), ('sub', small_3d, lambda t: [small_3d_positive(t)], 'tensor'), - ('mul', small_3d, lambda t: [number(3.14, 3, t)],), + ('mul', small_3d, lambda t: [number(3.14, 3, t)], '', types, False, + "skipIfRocm:ByteTensor,CharTensor,HalfTensor,ShortTensor"), ('mul', small_3d, lambda t: [small_3d_positive(t)], 'tensor'), - ('div', small_3d, lambda t: [number(3.14, 3, t)],), + ('div', small_3d, lambda t: [number(3.14, 3, t)], '', types, False, + "skipIfRocm:ByteTensor,CharTensor,FloatTensor,HalfTensor,ShortTensor"), ('div', small_3d, lambda t: [small_3d_positive(t)], 'tensor'), - ('pow', small_3d, lambda t: [number(3.14, 3, t)], None, float_types), - ('pow', small_3d, lambda t: [number(1., 1, t)], 'pow1', types), - ('pow', small_3d, lambda t: [number(2., 2, t)], 'pow2', types), - ('pow', small_3d, lambda t: [number(3., 3, t)], 'pow3', types), - ('pow', small_3d, lambda t: [number(-1., -1, t)], 'pow-1', float_types), + ('pow', small_3d, lambda t: [number(3.14, 3, t)], None, float_types, False, "skipIfRocm:HalfTensor"), + ('pow', small_3d, lambda t: [number(1., 1, t)], 'pow1', types, False, "skipIfRocm:HalfTensor"), + ('pow', small_3d, lambda t: [number(2., 2, t)], 'pow2', types, 
False, "skipIfRocm:HalfTensor"), + ('pow', small_3d, lambda t: [number(3., 3, t)], 'pow3', types, False, "skipIfRocm:HalfTensor"), + ('pow', small_3d, lambda t: [number(-1., -1, t)], 'pow-1', float_types, False, "skipIfRocm:HalfTensor"), # HalfTensor gives bad result at pow-2 with data sampled from torch.randn - ('pow', small_3d, lambda t: [number(-2., -2, t)], 'pow-2', float_types_no_half), - ('pow', small_3d, lambda t: [tensor_abs_(small_3d(t))], 'tensor', float_types), - ('addbmm', small_2d, lambda t: [small_3d(t), small_3d(t)], None, float_types), - ('addbmm', small_2d, lambda t: [number(0.4, 2, t), small_3d(t), small_3d(t)], 'scalar'), - ('addbmm', small_2d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), small_3d(t), small_3d(t)], 'two_scalars'), - ('baddbmm', small_3d, lambda t: [small_3d(t), small_3d(t)],), - ('baddbmm', small_3d, lambda t: [number(0.4, 2, t), small_3d(t), small_3d(t)], 'scalar'), - ('baddbmm', small_3d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), small_3d(t), small_3d(t)], 'two_scalars'), - ('addcdiv', small_2d_lapack, lambda t: [tensor_mul(small_2d_lapack(t), 2), small_2d_lapack(t)],), - ('addcdiv', small_2d_lapack, lambda t: [number(2.8, 1, t), - tensor_mul(small_2d_lapack(t), 2), small_2d_lapack(t)], 'scalar'), - ('addcmul', small_3d, lambda t: [small_3d(t), small_3d(t)],), - ('addcmul', small_3d, lambda t: [number(0.4, 2, t), small_3d(t), small_3d(t)], 'scalar'), - ('addmm', medium_2d, lambda t: [medium_2d(t), medium_2d(t)],), - ('addmm', medium_2d, lambda t: [number(0.4, 2, t), medium_2d(t), medium_2d(t)], 'scalar'), - ('addmm', medium_2d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), medium_2d(t), medium_2d(t)], 'two_scalars'), - ('addmv', medium_1d, lambda t: [medium_2d(t), medium_1d(t)],), - ('addmv', medium_1d, lambda t: [number(0.4, 2, t), medium_2d(t), medium_1d(t)], 'scalar'), - ('addmv', medium_1d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), medium_2d(t), medium_1d(t)], 'two_scalars'), - ('addr', medium_2d, lambda t: [medium_1d(t), medium_1d(t)],), - ('addr', medium_2d, lambda t: [number(0.4, 2, t), medium_1d(t), medium_1d(t)], 'scalar'), - ('addr', medium_2d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), medium_1d(t), medium_1d(t)], 'two_scalars'), + ('pow', small_3d, lambda t: [number(-2., -2, t)], 'pow-2', float_types_no_half, + False, "skipIfRocm:HalfTensor,FloatTensor"), + ('pow', small_3d, lambda t: [tensor_abs_(small_3d(t))], 'tensor', float_types, False, "skipIfRocm:HalfTensor"), + ('addbmm', small_2d, lambda t: [small_3d(t), small_3d(t)], None, float_types, False, "skipIfRocm:HalfTensor"), + ('addbmm', small_2d, lambda t: [number(0.4, 2, t), small_3d(t), small_3d(t)], 'scalar', + types, False, "skipIfRocm:HalfTensor"), + ('addbmm', small_2d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), small_3d(t), small_3d(t)], 'two_scalars', + types, False, "skipIfRocm:HalfTensor"), + ('baddbmm', small_3d, lambda t: [small_3d(t), small_3d(t)], '', types, False, "skipIfRocm:HalfTensor"), + ('baddbmm', small_3d, lambda t: [number(0.4, 2, t), small_3d(t), small_3d(t)], 'scalar', + types, False, "skipIfRocm:HalfTensor"), + ('baddbmm', small_3d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), small_3d(t), small_3d(t)], 'two_scalars', + types, False, "skipIfRocm:HalfTensor"), + ('addcdiv', small_2d_lapack, lambda t: [tensor_mul(small_2d_lapack(t), 2), small_2d_lapack(t)], '', + types, False, "skipIfRocm:HalfTensor"), + ('addcdiv', small_2d_lapack, lambda t: [number(2.8, 1, t), tensor_mul(small_2d_lapack(t), 2), small_2d_lapack(t)], + 
'scalar', types, False, "skipIfRocm:HalfTensor"), + ('addcmul', small_3d, lambda t: [small_3d(t), small_3d(t)], '', types, False, "skipIfRocm:HalfTensor"), + ('addcmul', small_3d, lambda t: [number(0.4, 2, t), small_3d(t), small_3d(t)], 'scalar', + types, False, "skipIfRocm:HalfTensor"), + ('addmm', medium_2d, lambda t: [medium_2d(t), medium_2d(t)], '', types, False, "skipIfRocm:HalfTensor"), + ('addmm', medium_2d, lambda t: [number(0.4, 2, t), medium_2d(t), medium_2d(t)], 'scalar', + types, False, "skipIfRocm:HalfTensor"), + ('addmm', medium_2d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), medium_2d(t), medium_2d(t)], 'two_scalars', + types, False, "skipIfRocm:HalfTensor"), + ('addmv', medium_1d, lambda t: [medium_2d(t), medium_1d(t)], '', types, False, "skipIfRocm:HalfTensor"), + ('addmv', medium_1d, lambda t: [number(0.4, 2, t), medium_2d(t), medium_1d(t)], 'scalar', + types, False, "skipIfRocm:HalfTensor"), + ('addmv', medium_1d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), medium_2d(t), medium_1d(t)], 'two_scalars', + types, False, "skipIfRocm:HalfTensor"), + ('addr', medium_2d, lambda t: [medium_1d(t), medium_1d(t)], '', types, False, "skipIfRocm:HalfTensor"), + ('addr', medium_2d, lambda t: [number(0.4, 2, t), medium_1d(t), medium_1d(t)], 'scalar', + types, False, "skipIfRocm:HalfTensor"), + ('addr', medium_2d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), medium_1d(t), medium_1d(t)], 'two_scalars', + types, False, "skipIfRocm:HalfTensor"), ('atan2', medium_2d, lambda t: [medium_2d(t)], None, float_types + [torch.HalfTensor]), - ('fmod', small_3d, lambda t: [3], 'value'), + ('fmod', small_3d, lambda t: [3], 'value', types, False, "skipIfRocm:HalfTensor"), ('fmod', small_3d, lambda t: [small_3d_positive(t)], 'tensor'), ('chunk', medium_2d, lambda t: [4],), ('chunk', medium_2d, lambda t: [4, 1], 'dim'), @@ -296,15 +313,15 @@ tests = [ ('clone', medium_2d, lambda t: [],), ('contiguous', medium_2d, lambda t: [],), ('cross', new_t(M, 3, M), lambda t: [new_t(M, 3, M)(t)],), - ('cumprod', small_3d, lambda t: [1],), - ('cumprod', small_3d, lambda t: [-1], 'neg_dim'), - ('cumsum', small_3d, lambda t: [1],), - ('cumsum', small_3d, lambda t: [-1], 'neg_dim'), + ('cumprod', small_3d, lambda t: [1], '', types, False, "skipIfRocm:HalfTensor"), + ('cumprod', small_3d, lambda t: [-1], 'neg_dim', types, False, "skipIfRocm:HalfTensor"), + ('cumsum', small_3d, lambda t: [1], '', types, False, "skipIfRocm:HalfTensor"), + ('cumsum', small_3d, lambda t: [-1], 'neg_dim', types, False, "skipIfRocm:HalfTensor"), ('dim', small_3d, lambda t: [],), - ('dist', small_2d, lambda t: [small_2d(t)],), - ('dist', small_2d, lambda t: [small_2d(t), 3], '3_norm'), - ('dist', small_2d, lambda t: [small_2d(t), 2.5], '2_5_norm'), - ('dot', medium_1d, lambda t: [medium_1d(t)],), + ('dist', small_2d, lambda t: [small_2d(t)], '', types, False, "skipIfRocm:HalfTensor"), + ('dist', small_2d, lambda t: [small_2d(t), 3], '3_norm', types, False, "skipIfRocm:HalfTensor"), + ('dist', small_2d, lambda t: [small_2d(t), 2.5], '2_5_norm', types, False, "skipIfRocm:HalfTensor"), + ('dot', medium_1d, lambda t: [medium_1d(t)], '', types, False, "skipIfRocm:HalfTensor"), ('element_size', medium_1d, lambda t: [],), ('eq', small_3d_ones, lambda t: [small_3d(t)],), ('eq', small_3d_ones, lambda t: [small_3d_ones(t)], 'equal'), @@ -314,7 +331,7 @@ tests = [ ('equal', small_3d_ones, lambda t: [small_3d(t)],), ('expand', new_t(M, 1, M), lambda t: [M, 4, M],), ('expand_as', new_t(M, 1, M), lambda t: [new_t(M, 4, M)(t)],), - ('fill', 
medium_2d, lambda t: [number(3.14, 3, t)],), + ('fill', medium_2d, lambda t: [number(3.14, 3, t)], '', types, False, "skipIfRocm:HalfTensor"), ('ge', medium_2d, lambda t: [medium_2d(t)],), ('le', medium_2d, lambda t: [medium_2d(t)],), ('gt', medium_2d, lambda t: [medium_2d(t)],), @@ -328,31 +345,33 @@ tests = [ ('kthvalue', small_3d_unique, lambda t: [3],), ('kthvalue', small_3d_unique, lambda t: [3, 1], 'dim'), ('kthvalue', small_3d_unique, lambda t: [3, -1], 'neg_dim'), - ('lerp', small_3d, lambda t: [small_3d(t), 0.3],), - ('max', small_3d_unique, lambda t: [],), - ('max', small_3d_unique, lambda t: [1], 'dim'), - ('max', small_3d_unique, lambda t: [-1], 'neg_dim'), + ('lerp', small_3d, lambda t: [small_3d(t), 0.3], '', types, False, "skipIfRocm:HalfTensor"), + ('max', small_3d_unique, lambda t: [], '', types, False, "skipIfRocm:HalfTensor"), + ('max', small_3d_unique, lambda t: [1], 'dim', types, False, skipIfRocm), + ('max', small_3d_unique, lambda t: [-1], 'neg_dim', types, False, skipIfRocm), ('max', medium_2d, lambda t: [medium_2d(t)], 'elementwise'), - ('min', small_3d_unique, lambda t: [],), - ('min', small_3d_unique, lambda t: [1], 'dim'), - ('min', small_3d_unique, lambda t: [-1], 'neg_dim'), + ('min', small_3d_unique, lambda t: [], '', types, False, "skipIfRocm:HalfTensor"), + ('min', small_3d_unique, lambda t: [1], 'dim', types, False, skipIfRocm), + ('min', small_3d_unique, lambda t: [-1], 'neg_dim', types, False, skipIfRocm), ('min', medium_2d, lambda t: [medium_2d(t)], 'elementwise'), - ('mean', small_3d, lambda t: [],), - ('mean', small_3d, lambda t: [-1], 'neg_dim'), - ('mean', small_3d, lambda t: [1], 'dim'), - ('mode', small_3d, lambda t: [],), - ('mode', small_3d, lambda t: [1], 'dim'), - ('mode', small_3d, lambda t: [-1], 'neg_dim'), - ('mvlgamma', lambda t: tensor_clamp(small_2d(t), 0.1, 10), lambda t: [1], '2d_p=1', float_types_no_half), - ('mvlgamma', lambda t: tensor_clamp(small_2d(t), 0.6, 10), lambda t: [2], '2d_p=2', float_types_no_half), - ('remainder', small_3d, lambda t: [3], 'value'), + ('mean', small_3d, lambda t: [], '', types, False, "skipIfRocm:HalfTensor"), + ('mean', small_3d, lambda t: [-1], 'neg_dim', types, False, "skipIfRocm:DoubleTensor,FloatTensor,HalfTensor"), + ('mean', small_3d, lambda t: [1], 'dim', types, False, "skipIfRocm:DoubleTensor,FloatTensor,HalfTensor"), + ('mode', small_3d, lambda t: [], '', types, False, skipIfRocm), + ('mode', small_3d, lambda t: [1], 'dim', types, False, skipIfRocm), + ('mode', small_3d, lambda t: [-1], 'neg_dim', types, False, skipIfRocm), + ('mvlgamma', lambda t: tensor_clamp(small_2d(t), 0.1, 10), lambda t: [1], '2d_p=1', float_types_no_half, + False, "skipIfRocm:DoubleTensor,FloatTensor"), + ('mvlgamma', lambda t: tensor_clamp(small_2d(t), 0.6, 10), lambda t: [2], '2d_p=2', float_types_no_half, + False, "skipIfRocm:DoubleTensor,FloatTensor"), + ('remainder', small_3d, lambda t: [3], 'value', types, False, "skipIfRocm:HalfTensor"), ('remainder', small_3d, lambda t: [-3], 'negative_value', signed_types), ('remainder', small_3d, lambda t: [small_3d_positive(t)], 'tensor'), ('remainder', small_3d, lambda t: [constant_tensor_sub(0, small_3d_positive(t))], 'negative_tensor', signed_types), - ('std', small_3d, lambda t: [],), + ('std', small_3d, lambda t: [], '', types, False, "skipIfRocm:HalfTensor"), ('std', small_3d, lambda t: [1], 'dim'), ('std', small_3d, lambda t: [-1], 'neg_dim'), - ('var', small_3d, lambda t: [],), + ('var', small_3d, lambda t: [], '', types, False, "skipIfRocm:HalfTensor"), ('var', 
small_3d, lambda t: [1], 'dim'), ('var', small_3d, lambda t: [-1], 'neg_dim'), ('ndimension', small_3d, lambda t: [],), @@ -360,34 +379,37 @@ tests = [ ('numel', small_3d, lambda t: [],), ('narrow', small_3d, lambda t: [1, 3, 2],), ('narrow', small_3d, lambda t: [-1, 3, 2], 'neg_dim'), - ('nonzero', small_3d, lambda t: [],), - ('norm', small_3d, lambda t: [],), - ('norm', small_3d, lambda t: [3], '3_norm'), - ('norm', small_3d, lambda t: [3, 0], '3_norm_dim'), - ('norm', small_3d, lambda t: [3, -2], '3_norm_neg_dim'), + ('nonzero', small_3d, lambda t: [], '', types, False, skipIfRocm), + ('norm', small_3d, lambda t: [], '', types, False, "skipIfRocm:HalfTensor"), + ('norm', small_3d, lambda t: [3], '3_norm', types, False, "skipIfRocm:HalfTensor"), + ('norm', small_3d, lambda t: [3, 0], '3_norm_dim', types, False, "skipIfRocm:HalfTensor,DoubleTensor,FloatTensor"), + ('norm', small_3d, lambda t: [3, -2], '3_norm_neg_dim', types, + False, "skipIfRocm:HalfTensor,DoubleTensor,FloatTensor"), ('ones', small_3d, lambda t: [1, 2, 3, 4, 5],), ('permute', new_t(1, 2, 3, 4), lambda t: [2, 1, 3, 0],), - ('put_', new_t(2, 5, 3), lambda t: [long_type(t)([[0], [-2]]), t([[3], [4]])],), + ('put_', new_t(2, 5, 3), lambda t: [long_type(t)([[0], [-2]]), t([[3], [4]])], '', types, False, skipIfRocm), ('put_', new_t(2, 3), lambda t: [long_type(t)([]), t([])], 'empty'), ('put_', new_t(2, 2), lambda t: [long_type(t)([[1], [-3]]), t([[1], [2]]), True], 'accumulate'), - ('prod', small_2d_oneish, lambda t: [],), - ('prod', small_3d, lambda t: [1], 'dim'), - ('prod', small_3d, lambda t: [-1], 'neg_dim'), - ('sum', small_2d, lambda t: [],), - ('sum', small_3d, lambda t: [1], 'dim'), - ('sum', small_3d, lambda t: [-1], 'neg_dim'), - ('renorm', small_3d, lambda t: [2, 1, 1], '2_norm'), - ('renorm', small_3d, lambda t: [2, -1, 1], '2_norm_neg_dim'), - ('renorm', small_3d, lambda t: [1.5, 1, 1], '1_5_norm'), + ('prod', small_2d_oneish, lambda t: [], '', types, False, "skipIfRocm:HalfTensor"), + ('prod', small_3d, lambda t: [1], 'dim', types, False, skipIfRocm), + ('prod', small_3d, lambda t: [-1], 'neg_dim', types, False, skipIfRocm), + ('sum', small_2d, lambda t: [], '', types, False, "skipIfRocm:HalfTensor"), + ('sum', small_3d, lambda t: [1], 'dim', types, False, skipIfRocm), + ('sum', small_3d, lambda t: [-1], 'neg_dim', types, False, skipIfRocm), + ('renorm', small_3d, lambda t: [2, 1, 1], '2_norm', types, False, "skipIfRocm:HalfTensor,DoubleTensor,FloatTensor"), + ('renorm', small_3d, lambda t: [2, -1, 1], '2_norm_neg_dim', types, + False, "skipIfRocm:HalfTensor,DoubleTensor,FloatTensor"), + ('renorm', small_3d, lambda t: [1.5, 1, 1], '1_5_norm', types, + False, "skipIfRocm:HalfTensor,DoubleTensor,FloatTensor"), ('repeat', small_2d, lambda t: [2, 2, 2],), ('size', new_t(1, 2, 3, 4), lambda t: [],), ('size', new_t(1, 2, 3, 4), lambda t: [1], 'dim'), ('size', new_t(1, 2, 3, 4), lambda t: [-2], 'neg_dim'), - ('sort', small_3d_unique, lambda t: [],), - ('sort', small_3d_unique, lambda t: [1], 'dim'), - ('sort', small_3d_unique, lambda t: [-1], 'neg_dim'), - ('sort', small_3d_unique, lambda t: [1, True], 'dim_descending'), - ('sort', small_3d_unique, lambda t: [-1, True], 'neg_dim_descending'), + ('sort', small_3d_unique, lambda t: [], '', types, False, skipIfRocm), + ('sort', small_3d_unique, lambda t: [1], 'dim', types, False, skipIfRocm), + ('sort', small_3d_unique, lambda t: [-1], 'neg_dim', types, False, skipIfRocm), + ('sort', small_3d_unique, lambda t: [1, True], 'dim_descending', types, False, skipIfRocm), + 
('sort', small_3d_unique, lambda t: [-1, True], 'neg_dim_descending', types, False, skipIfRocm), ('split', small_3d, lambda t: [2],), ('split', small_3d, lambda t: [2, 1], 'dim'), ('split', small_3d, lambda t: [2, -3], 'neg_dim'), @@ -395,14 +417,14 @@ tests = [ ('squeeze', new_t(1, 2, 1, 4), lambda t: [2], 'dim'), ('squeeze', new_t(1, 2, 1, 4), lambda t: [-2], 'neg_dim'), ('t', new_t(1, 2), lambda t: [],), - ('take', new_t(3, 4), lambda t: [long_type(t)([[0], [-2]])],), + ('take', new_t(3, 4), lambda t: [long_type(t)([[0], [-2]])], '', types, False, skipIfRocm), ('transpose', new_t(1, 2, 3, 4), lambda t: [1, 2],), ('transpose', new_t(1, 2, 3, 4), lambda t: [-1, -2], 'neg_dim'), ('to_list', small_3d, lambda t: [],), - ('topk', small_3d_unique, lambda t: [2, 1, False, True], 'dim_sort'), - ('topk', small_3d_unique, lambda t: [2, -1, False, True], 'neg_dim_sort'), - ('topk', small_3d_unique, lambda t: [2, 1, True, True], 'dim_desc_sort'), - ('trace', medium_2d, lambda t: [],), + ('topk', small_3d_unique, lambda t: [2, 1, False, True], 'dim_sort', types, False, skipIfRocm), + ('topk', small_3d_unique, lambda t: [2, -1, False, True], 'neg_dim_sort', types, False, skipIfRocm), + ('topk', small_3d_unique, lambda t: [2, 1, True, True], 'dim_desc_sort', types, False, skipIfRocm), + ('trace', medium_2d, lambda t: [], '', types, False, "skipIfRocm:HalfTensor"), ('tril', medium_2d, lambda t: [],), ('tril', medium_2d_expanded, lambda t: [], 'zero_stride', types, True), ('tril', medium_2d, lambda t: [2], 'positive'), @@ -443,7 +465,7 @@ tests = [ unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")), ('qr', large_2d_lapack, lambda t: [], 'big', float_types, False, unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")), - ('inverse', new_t(20, 20), lambda t: [], None, float_types, False), + ('inverse', new_t(20, 20), lambda t: [], None, float_types, False, "skipIfRocm:DoubleTensor,FloatTensor"), ('geqrf', new_t(20, 20), lambda t: [], None, float_types, False, unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")), ('svd', new_t(10, 10), lambda t: [], 'square', float_types_no_half, False, @@ -760,6 +782,7 @@ class TestCuda(TestCase): pass @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") + @skipIfRocm def test_memory_stats_multigpu(self): # advance a generator with a end flag def advance(gen, end): @@ -797,6 +820,7 @@ class TestCuda(TestCase): t += 1 @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") + @skipIfRocm def test_autogpu(self): x = torch.randn(5, 5).cuda() y = torch.randn(5, 5).cuda() @@ -814,6 +838,7 @@ class TestCuda(TestCase): self.assertEqual(z.get_device(), 0) @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") + @skipIfRocm def test_new(self): x = torch.randn(3, 3).cuda() self.assertEqual(x.new([0, 1, 2]).get_device(), 0) @@ -824,6 +849,7 @@ class TestCuda(TestCase): self.assertEqual(x.new([0, 1, 2], device=1).get_device(), 1) @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") + @skipIfRocm def test_copy_device(self): x = torch.randn(5, 5).cuda() with torch.cuda.device(1): @@ -877,6 +903,7 @@ class TestCuda(TestCase): self.assertIsInstance(y.cuda().float().cpu().int(), torch.IntStorage) @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") + @skipIfRocm def test_type_conversions_same_gpu(self): x = torch.randn(5, 5).cuda(1) self.assertEqual(x.int().get_device(), 1) @@ -896,12 +923,15 @@ class TestCuda(TestCase): if input.is_cuda and input.get_device() == i: self.assertEqual(t.data_ptr(), input.data_ptr()) + @skipIfRocm 
def test_broadcast_cpu(self): self._test_broadcast(torch.randn(5, 5)) + @skipIfRocm def test_broadcast_gpu(self): self._test_broadcast(torch.randn(5, 5).cuda()) + @skipIfRocm def test_min_max_nan(self): tests = [(lambda x: x.min(), 'min'), (lambda x: x.max(), 'max'), @@ -932,6 +962,7 @@ class TestCuda(TestCase): self.assertIsInstance(bct, type(bt)) @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") + @skipIfRocm def test_broadcast_coalesced(self): numel = 5 num_bytes = numel * 8 @@ -952,6 +983,7 @@ class TestCuda(TestCase): self._test_broadcast_coalesced(self, tensors, num_bytes * 5 // 2) @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") + @skipIfRocm def test_broadcast_coalesced_dense_only(self): numel = 5 num_bytes = numel * 8 @@ -966,6 +998,7 @@ class TestCuda(TestCase): self._test_broadcast_coalesced(self, tensors, num_bytes * 5 // 2) @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") + @skipIfRocm def test_reduce_add(self): x = torch.randn(5, 5) y = torch.randn(5, 5) @@ -992,6 +1025,7 @@ class TestCuda(TestCase): self.assertEqual(rc.type(), r.type()) @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") + @skipIfRocm def test_reduce_add_coalesced(self): numel = 5 num_bytes = numel * 8 @@ -1012,6 +1046,7 @@ class TestCuda(TestCase): self._test_reduce_add_coalesced(self, tensors, num_bytes * 5 // 2) @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") + @skipIfRocm def test_reduce_add_coalesced_dense_only(self): numel = 5 num_bytes = numel * 8 @@ -1083,12 +1118,15 @@ class TestCuda(TestCase): index[dim] = slice(x.size(dim), x.size(dim) + y.size(dim)) self.assertEqual(result[tuple(index)], y) + @skipIfRocm def test_gather(self): self._test_gather(0) + @skipIfRocm def test_gather_dim(self): self._test_gather(1) + @skipIfRocm def test_from_sequence(self): seq = [list(range(i * 4, i * 4 + 4)) for i in range(5)] reference = torch.arange(0, 20).resize_(5, 4) @@ -1119,6 +1157,7 @@ class TestCuda(TestCase): self.assertEqual(torch.cuda.initial_seed(), 2) @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") + @skipIfRocm def test_cat_autogpu(self): x = torch.randn(4, 4).cuda(1) y = torch.randn(4, 4).cuda(1) @@ -1146,9 +1185,11 @@ class TestCuda(TestCase): z = torch.cat([x, y]) self.assertEqual(z.size(), (21, SIZE, SIZE)) + @skipIfRocm def test_cat_empty_legacy(self): TestTorch._test_cat_empty_legacy(self, use_cuda=True) + @skipIfRocm def test_cat_empty(self): TestTorch._test_cat_empty(self, use_cuda=True) @@ -1203,6 +1244,7 @@ class TestCuda(TestCase): self.assertEqual(copy.get_device(), original.get_device()) @unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU") + @skipIfRocm def test_multigpu_serialization(self): x = [torch.randn(4, 4).cuda(0), torch.randn(4, 4).cuda(1)] with tempfile.NamedTemporaryFile() as f: @@ -1215,6 +1257,7 @@ class TestCuda(TestCase): self.assertEqual(copy.get_device(), original.get_device()) @unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU") + @skipIfRocm def test_multigpu_serialization_remap(self): x = [torch.randn(4, 4).cuda(0), torch.randn(4, 4).cuda(1)] @@ -1233,6 +1276,7 @@ class TestCuda(TestCase): self.assertEqual(copy.get_device(), 0) @unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU") + @skipIfRocm def test_multigpu_serialization_remap_dict(self): x = [torch.randn(4, 4).cuda(0), torch.randn(4, 4).cuda(1)] with tempfile.NamedTemporaryFile() as f: @@ -1245,6 +1289,7 @@ class TestCuda(TestCase): self.assertEqual(copy.get_device(), 0) @unittest.skipIf(not TEST_MULTIGPU, "detected only 
one GPU") + @skipIfRocm def test_cuda_set_device(self): x = torch.randn(5, 5) with torch.cuda.device(1): @@ -1266,6 +1311,7 @@ class TestCuda(TestCase): def test_cuda_synchronize(self): torch.cuda.synchronize() + @skipIfRocm def test_streams(self): default_stream = torch.cuda.current_stream() user_stream = torch.cuda.Stream() @@ -1284,6 +1330,7 @@ class TestCuda(TestCase): self.assertTrue(default_stream.query()) @unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU") + @skipIfRocm def test_streams_multi_gpu(self): default_stream = torch.cuda.current_stream() self.assertEqual(default_stream.device, 0) @@ -1294,6 +1341,7 @@ class TestCuda(TestCase): self.assertNotEqual(torch.cuda.current_stream(), default_stream) @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") + @skipIfRocm def test_tensor_device(self): self.assertEqual(torch.cuda.FloatTensor(1).get_device(), 0) self.assertEqual(torch.cuda.FloatTensor(1, device=1).get_device(), 1) @@ -1302,6 +1350,7 @@ class TestCuda(TestCase): self.assertEqual(torch.cuda.FloatTensor(1, device=0).get_device(), 0) self.assertEqual(torch.cuda.FloatTensor(1, device=None).get_device(), 1) + @skipIfRocm def test_events(self): stream = torch.cuda.current_stream() event = torch.cuda.Event(enable_timing=True) @@ -1315,6 +1364,7 @@ class TestCuda(TestCase): self.assertTrue(event.query()) self.assertGreater(start_event.elapsed_time(event), 0) + @skipIfRocm def test_record_stream(self): cycles_per_ms = get_cycles_per_ms() @@ -1352,6 +1402,7 @@ class TestCuda(TestCase): x = torch.arange(0, 10).view((2, 5)) self.assertEqual(x.t(), x.t().pin_memory()) + @skipIfRocm def test_caching_pinned_memory(self): cycles_per_ms = get_cycles_per_ms() @@ -1372,6 +1423,7 @@ class TestCuda(TestCase): self.assertEqual(list(gpu_tensor), [1]) @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") + @skipIfRocm def test_caching_pinned_memory_multi_gpu(self): # checks that the events preventing pinned memory from being re-used # too early are recorded on the correct GPU @@ -1432,6 +1484,7 @@ class TestCuda(TestCase): def test_signal_window_functions(self): TestTorch._test_signal_window_functions(self, device=torch.device('cuda')) + @skipIfRocm def test_fft_ifft_rfft_irfft(self): TestTorch._test_fft_ifft_rfft_irfft(self, device=torch.device('cuda')) @@ -1463,6 +1516,7 @@ class TestCuda(TestCase): def test_stft(self): TestTorch._test_stft(self, device=torch.device('cuda')) + @skipIfRocm def test_multinomial(self): TestTorch._test_multinomial(self, torch.cuda.FloatTensor) @@ -1528,6 +1582,7 @@ class TestCuda(TestCase): self._spawn_method(test_method, torch.Tensor([0, -inf])) self._spawn_method(test_method, torch.Tensor([0, nan])) + @skipIfRocm def test_broadcast(self): TestTorch._test_broadcast(self, lambda t: t.cuda()) @@ -1540,12 +1595,15 @@ class TestCuda(TestCase): def test_broadcast_batched_matmul(self): TestTorch._test_broadcast_batched_matmul(self, lambda t: t.cuda()) + @skipIfRocm def test_index(self): TestTorch._test_index(self, lambda t: t.cuda()) + @skipIfRocm def test_advancedindex(self): TestTorch._test_advancedindex(self, lambda t: t.cuda()) + @skipIfRocm def test_advancedindex_mixed_cpu_cuda(self): def test(x, ia, ib): # test getitem @@ -1594,30 +1652,37 @@ class TestCuda(TestCase): ib = ib.to(other_device) test(x, ia, ib) + @skipIfRocm def test_advancedindex_big(self): TestTorch._test_advancedindex_big(self, lambda t: t.cuda()) + @skipIfRocm def test_btrifact(self): TestTorch._test_btrifact(self, lambda t: t.cuda()) + @skipIfRocm def 
test_btrisolve(self): TestTorch._test_btrisolve(self, lambda t: t.cuda()) + @skipIfRocm def test_dim_reduction(self): TestTorch._test_dim_reduction(self, lambda t: t.cuda()) + @skipIfRocm def test_tensor_gather(self): TestTorch._test_gather(self, lambda t: t.cuda(), False) def test_tensor_scatter(self): TestTorch._test_scatter_base(self, lambda t: t.cuda(), 'scatter_', test_bounds=False) + @skipIfRocm def test_tensor_scatterAdd(self): TestTorch._test_scatter_base(self, lambda t: t.cuda(), 'scatter_add_', test_bounds=False) def test_tensor_scatterFill(self): TestTorch._test_scatter_base(self, lambda t: t.cuda(), 'scatter_', True, test_bounds=False) + @skipIfRocm def test_min_max_inits(self): # Testing if THC_reduceAll received the correct index initialization. # This affects the result of THC_reduceAll operations at extreme values @@ -1692,6 +1757,7 @@ class TestCuda(TestCase): tensor = tensor.unsqueeze(1) self.assertEqual(tensor.var(0), 0.03125) + @skipIfRocm def test_digamma(self): def test(use_double=False): cpu_tensor = torch.randn(10, 10, 10) @@ -1720,6 +1786,7 @@ class TestCuda(TestCase): norm_errors = (gpu_out - cpu_out.cuda()) / gpu_out self.assertEqual(norm_errors, expected_errors) + @skipIfRocm def test_polygamma(self): def test(use_double=False): cpu_tensor = torch.randn(10, 10, 10) @@ -1771,6 +1838,7 @@ class TestCuda(TestCase): TestTorch._test_trtrs(self, lambda t: t.cuda()) @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") + @skipIfRocm def test_get_set_rng_state_all(self): states = torch.cuda.get_rng_state_all() before0 = torch.cuda.FloatTensor(100, device=0).normal_() @@ -1781,12 +1849,14 @@ class TestCuda(TestCase): self.assertEqual(before0, after0, 0) self.assertEqual(before1, after1, 0) + @skipIfRocm def test_nvtx(self): # Just making sure we can see the symbols torch.cuda.nvtx.range_push("foo") torch.cuda.nvtx.mark("bar") torch.cuda.nvtx.range_pop() + @skipIfRocm def test_randperm_cuda(self): cuda = torch.device('cuda:0') @@ -1825,6 +1895,7 @@ class TestCuda(TestCase): def test_random_neg_values(self): TestTorch._test_random_neg_values(self, use_cuda=True) + @skipIfRocm def test_bincount_cuda(self): TestTorch._test_bincount(self, device='cuda') # ensure CUDA code coverage @@ -1846,6 +1917,7 @@ class TestCuda(TestCase): self.assertEqual(t.cpu().bincount(), t.bincount()) self.assertEqual(t.cpu().bincount(w_cpu), t.bincount(w)) + @skipIfRocm def test_tiny_half_norm_(self): a = torch.arange(25).cuda().float() a /= 100000000 @@ -1909,6 +1981,25 @@ def generate_tests(): if t not in type_subset: continue + if TEST_WITH_ROCM and decorator is not None: + if (isinstance(decorator, str)): + tensor_type_name = str(t.__name__) + decorator_list = decorator.split(":") + skip_type_list = decorator_list[1].split(",") + if (("ByteTensor" in skip_type_list) and tensor_type_name == "ByteTensor") \ + or (("CharTensor" in skip_type_list) and tensor_type_name == "CharTensor") \ + or (("DoubleTensor" in skip_type_list) and tensor_type_name == "DoubleTensor") \ + or (("FloatTensor" in skip_type_list) and tensor_type_name == "FloatTensor") \ + or (("HalfTensor" in skip_type_list) and tensor_type_name == "HalfTensor") \ + or (("IntTensor" in skip_type_list) and tensor_type_name == "IntTensor") \ + or (("LongTensor" in skip_type_list) and tensor_type_name == "LongTensor") \ + or (("ShortTensor" in skip_type_list) and tensor_type_name == "ShortTensor"): + decorator = skipIfRocm + else: + decorator = None + elif ((not TEST_WITH_ROCM) and (decorator is not None)): + if 
(isinstance(decorator, str)): + decorator = None precision = custom_precision.get(name, TestCuda.precision) if is_half(t): diff --git a/test/test_jit.py b/test/test_jit.py index 4bda08d030c..81e8d1affa1 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -608,6 +608,7 @@ class TestJit(JitTestCase): @unittest.skipIf(IS_WINDOWS, "NYI: fuser support for Windows") @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") + @skipIfRocm def test_fusion_rand(self): class M(torch.jit.ScriptModule): __constants__ = ['d'] @@ -631,6 +632,7 @@ class TestJit(JitTestCase): @unittest.skipIf(IS_WINDOWS, "NYI: fuser support for Windows") @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") + @skipIfRocm def test_fusion_arg_configurations(self): # A smoke test to make sure we won't use the same kernel for contiguous # and non-contiguous arguments. @@ -846,6 +848,7 @@ class TestJit(JitTestCase): @unittest.skipIf(IS_WINDOWS, "NYI: fuser support for Windows") @unittest.skipIf(not RUN_CUDA_MULTI_GPU, "needs non-zero device") + @skipIfRocm def test_fuse_last_device(self): device = 'cuda:' + str(1) x = torch.tensor([0.4], dtype=torch.float, device=device) @@ -2521,6 +2524,7 @@ a") @unittest.skipIf(IS_WINDOWS, "NYI: fuser support for Windows") @unittest.skipIf(not RUN_CUDA, "No CUDA") + @skipIfRocm def test_chunk_fusion_cuda(self): def fn(x): a, b, c = x.chunk(3, 1) @@ -2536,6 +2540,7 @@ a") @unittest.skipIf(IS_WINDOWS, "NYI: fuser support for Windows") @unittest.skipIf(not RUN_CUDA, "No CUDA") + @skipIfRocm def test_chunk_multiple_fusion_cuda(self): # The arguments are intentionally used out of order as a test to see # if the fusion compiler adds extra args in the correct order @@ -2589,11 +2594,13 @@ a") self.checkScript(fn, [tensor]) @unittest.skipIf(IS_WINDOWS, "NYI: fuser support for Windows") + @skipIfRocm def test_chunk_fusion_correctness(self): return self._test_chunk_fusion_correctness(self, 'cpu') @unittest.skipIf(IS_WINDOWS, "NYI: fuser support for Windows") @unittest.skipIf(not RUN_CUDA, "No CUDA") + @skipIfRocm def test_chunk_fusion_correctness_cuda(self): return self._test_chunk_fusion_correctness(self, 'cuda') @@ -6457,6 +6464,7 @@ class TestEndToEndHybridFrontendModels(JitTestCase): self.checkTrace(Policy(), (torch.rand(1, 4),)) + @skipIfRocm def test_snli(self): # TODO: # 1) nn.LSTM is called as a Python function https://github.com/pytorch/pytorch/issues/8449 @@ -6549,6 +6557,7 @@ class TestEndToEndHybridFrontendModels(JitTestCase): self.checkTrace(SNLIClassifier(Config()), (premise, hypothesis), inputs_require_grads=False) + @skipIfRocm def test_super_resolution(self): import torch.nn.init as init @@ -6704,6 +6713,7 @@ class TestPytorchExportModes(JitTestCase): export_type=torch.onnx.ExportTypes.DIRECTORY) shutil.rmtree(d) + @skipIfRocm def test_aten_fallback(self): class ModelWithAtenNotONNXOp(nn.Module): def forward(self, x, y): diff --git a/test/test_legacy_nn.py b/test/test_legacy_nn.py index de65e6fc8ce..b446920c4fe 100644 --- a/test/test_legacy_nn.py +++ b/test/test_legacy_nn.py @@ -6,7 +6,7 @@ from copy import deepcopy import torch import torch.legacy.nn as nn -from common import to_gpu, freeze_rng_state, run_tests +from common import to_gpu, freeze_rng_state, run_tests, skipIfRocm, TEST_WITH_ROCM from common_nn import NNTestCase, ModuleTest, CriterionTest, iter_tensors, \ module_tests, criterion_tests, PRECISION from torch.autograd.gradcheck import get_numerical_jacobian @@ -66,33 +66,40 @@ tests = [ constructor_args=(3.5,), input_size=(3, 5, 4), reference_fn=lambda i, _: i + 3.5, - 
check_inplace=True), + check_inplace=True, + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.BatchNormalization, constructor_args=(10,), input_size=(4, 10), - desc='affine'), + desc='affine', + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.BatchNormalization, constructor_args=(10, 1e-3, 0.3, False), input_size=(4, 10), - desc='not_affine'), + desc='not_affine', + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.SpatialBatchNormalization, constructor_args=(3,), - input_size=(2, 3, 6, 6)), + input_size=(2, 3, 6, 6), + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.SpatialBatchNormalization, constructor_args=(3, 1e-3, 0.8), input_size=(2, 3, 6, 6), - desc='momentum'), + desc='momentum', + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.SpatialBatchNormalization, constructor_args=(3, 1e-3, 0.8, False), input_size=(2, 3, 6, 6), desc='no_affine'), OldModuleTest(nn.VolumetricBatchNormalization, constructor_args=(3,), - input_size=(2, 3, 4, 4, 4)), + input_size=(2, 3, 4, 4, 4), + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.VolumetricBatchNormalization, constructor_args=(3, 1e-3, 0.7), input_size=(2, 3, 4, 4, 4), - desc='momentum'), + desc='momentum', + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.VolumetricBatchNormalization, constructor_args=(3, 1e-3, 0.7, False), input_size=(2, 3, 4, 4, 4), @@ -100,52 +107,67 @@ tests = [ OldModuleTest(nn.CMul, constructor_args=(5, 6), input_size=(10, 5, 6), - desc='3D'), + desc='3D', + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.CMul, constructor_args=(50, 4), input_size=(1, 50, 4), - desc='3D_single_example'), + desc='3D_single_example', + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.CMul, constructor_args=(1, 5), input_fn=lambda: torch.randn(10, 3, 5)[:, 1], - desc='3D_noncontiguous'), + desc='3D_noncontiguous', + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.Exp, input_size=(2, 3, 4), - reference_fn=lambda i, _: i.exp()), + reference_fn=lambda i, _: i.exp(), + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.Log, input_fn=lambda: torch.rand(2, 3, 2) + 0.1, - reference_fn=lambda i, _: i.log()), + reference_fn=lambda i, _: i.log(), + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.Clamp, constructor_args=(-2., 5.), input_fn=lambda: torch.randn(3, 2, 50) * 6, reference_fn=lambda i, _: i.clamp(-2, 5)), OldModuleTest(nn.Abs, input_size=(3, 20, 5), - reference_fn=lambda i, _: i.abs()), + reference_fn=lambda i, _: i.abs(), + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.Bilinear, constructor_args=(2, 3, 10), - input_size=[(4, 2), (4, 3)]), + input_size=[(4, 2), (4, 3)], + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.Bilinear, constructor_args=(5, 4, 2), input_size=[(2, 5), (2, 4)], - desc='small_output'), + desc='small_output', + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.Euclidean, constructor_args=(5, 7), - input_size=(10, 5)), + input_size=(10, 5), + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.WeightedEuclidean, constructor_args=(5, 7), - input_size=(10, 5)), + input_size=(10, 5), + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.Cosine, constructor_args=(5, 7), - input_size=(10, 5)), + input_size=(10, 5), + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.CAddTable, - input_size=[(5, 7), (5, 7)]), + input_size=[(5, 7), (5, 7)], + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.CSubTable, - input_size=[(5, 7), (5, 7)]), + input_size=[(5, 7), (5, 7)], + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.CDivTable, - input_fn=lambda: [torch.randn(1, 7), torch.rand(1, 7) + 0.1]), + input_fn=lambda: 
[torch.randn(1, 7), torch.rand(1, 7) + 0.1], + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.CMulTable, - input_size=[(5, 7), (5, 7)]), + input_size=[(5, 7), (5, 7)], + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.Square, input_size=(10, 2, 4), reference_fn=lambda i, _: i.mul(i)), @@ -192,31 +214,37 @@ tests = [ OldModuleTest(nn.Sum, constructor_args=(1,), input_size=(2, 4, 5), - reference_fn=lambda i, _: i.sum(1, keepdim=False)), + reference_fn=lambda i, _: i.sum(1, keepdim=False), + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.Sum, constructor_args=(1, True), input_size=(2, 4, 5), reference_fn=lambda i, _: i.sum(1, keepdim=False).div(i.size(1)), - desc='sizeAverage'), + desc='sizeAverage', + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.Mean, constructor_args=(1,), input_size=(2, 4, 5), - reference_fn=lambda i, _: torch.mean(i, 1, keepdim=False)), + reference_fn=lambda i, _: torch.mean(i, 1, keepdim=False), + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(lambda: nn.Sequential().add(nn.GradientReversal()).add(nn.GradientReversal()), input_size=(4, 3, 2, 2), - fullname='GradientReversal'), + fullname='GradientReversal', + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.Identity, input_size=(4, 3, 2, 4), reference_fn=lambda i, _: i), OldModuleTest(nn.DotProduct, input_size=[(10, 4), (10, 4)], reference_fn=lambda i, _: torch.Tensor(list( - a.dot(b) for a, b in zip(i[0], i[1]))) + a.dot(b) for a, b in zip(i[0], i[1]))), + test_cuda=(not TEST_WITH_ROCM) ), OldModuleTest(nn.CosineDistance, input_size=[(10, 4), (10, 4)], reference_fn=lambda i, _: torch.Tensor(list( - a.dot(b) / (a.norm(2) * b.norm(2)) for a, b in zip(i[0], i[1]))) + a.dot(b) / (a.norm(2) * b.norm(2)) for a, b in zip(i[0], i[1]))), + test_cuda=(not TEST_WITH_ROCM) ), OldModuleTest(nn.JoinTable, constructor_args=(0,), @@ -256,19 +284,23 @@ tests = [ reference_fn=lambda i, _: torch.min(i, 1, False)[0], desc='with_dimension'), OldModuleTest(nn.MixtureTable, - input_size=[(5, 3), (5, 3, 6)]), + input_size=[(5, 3), (5, 3, 6)], + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.LookupTable, constructor_args=(4, 3), input_fn=lambda: torch.randperm(2).repeat(1, 2), - jacobian_input=False), + jacobian_input=False, + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.Mul, input_size=(2, 3, 4, 2), - reference_fn=lambda i, p: i * p[0][0]), + reference_fn=lambda i, p: i * p[0][0], + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.MulConstant, constructor_args=(4,), input_size=(2, 3, 4, 2), reference_fn=lambda i, _: i * 4, - check_inplace=True), + check_inplace=True, + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.Narrow, constructor_args=(0, 0), input_size=(2, 3, 4, 2), @@ -291,7 +323,8 @@ tests = [ OldModuleTest(nn.Replicate, constructor_args=(2, 1), input_size=(10, 3, 4, 5), - reference_fn=lambda i, _: i.view(10, 1, 3, 4, 5).expand(10, 2, 3, 4, 5)), + reference_fn=lambda i, _: i.view(10, 1, 3, 4, 5).expand(10, 2, 3, 4, 5), + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.Padding, constructor_args=(0, 2, -10), input_size=(2, 3, 4, 5)), @@ -305,17 +338,21 @@ tests = [ desc='negative_pad'), OldModuleTest(nn.PartialLinear, constructor_args=(5, 6), - input_size=(4, 5)), + input_size=(4, 5), + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(lambda: nn.PartialLinear(5, 6).setPartition(torch.Tensor((2, 4))), input_size=(4, 5), - fullname='PartialLinear_setPartition'), + fullname='PartialLinear_setPartition', + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.Power, constructor_args=(2,), - input_size=(2, 3, 4, 5)), + 
input_size=(2, 3, 4, 5), + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.Power, constructor_args=(1.5,), input_fn=lambda: torch.rand(3, 4, 5), - desc='fractional'), + desc='fractional', + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.Reshape, constructor_args=(4, 5), input_size=(3, 4 * 5), @@ -375,10 +412,12 @@ tests = [ desc='stride_pad'), OldModuleTest(nn.SpatialDivisiveNormalization, constructor_args=(3,), - input_size=(2, 3, 8, 8)), + input_size=(2, 3, 8, 8), + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.SpatialContrastiveNormalization, constructor_args=(3,), - input_size=(2, 3, 8, 8)), + input_size=(2, 3, 8, 8), + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.SpatialDilatedConvolution, constructor_args=(3, 2, 3, 3, 2, 2, 1, 1, 2, 2), input_size=(2, 3, 8, 8)), @@ -436,13 +475,15 @@ tests = [ input_size=(1, 3, 7, 7)), OldModuleTest(nn.SpatialLPPooling, constructor_args=(3, 2, 2, 2, 2, 2), - input_size=(1, 3, 7, 7)), + input_size=(1, 3, 7, 7), + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.SpatialSubSampling, constructor_args=(3, 3, 3, 2, 2), input_size=(1, 3, 7, 7)), OldModuleTest(nn.SpatialSubtractiveNormalization, constructor_args=(3,), - input_size=(1, 3, 7, 7)), + input_size=(1, 3, 7, 7), + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.SpatialSubtractiveNormalization, constructor_args=(3, torch.rand(3)), input_size=(1, 3, 7, 7), @@ -521,7 +562,8 @@ tests = [ CriterionTest(nn.WeightedMSECriterion, constructor_args_fn=lambda: (torch.rand(3, 4, 5),), input_size=(2, 3, 4, 5), - target_size=(2, 3, 4, 5)), + target_size=(2, 3, 4, 5), + test_cuda=(not TEST_WITH_ROCM)), CriterionTest(nn.MarginCriterion, input_size=(5, 10), target_fn=lambda: torch.randn(5, 10).sign()), @@ -544,14 +586,16 @@ for p in (1, 2, 1.5): input_size=(4, 5), # Eh, we need to use p as a default, so it's passed by value reference_fn=lambda i, _, p=p: i.div(i.norm(p, 1, True).expand_as(i)), - desc=str(p)), + desc=str(p), + test_cuda=(not TEST_WITH_ROCM)), ) for p in range(1, 4 + 1): tests.append( OldModuleTest(nn.PairwiseDistance, constructor_args=(p,), input_size=[(4, 10), (4, 10)], - desc=str(p)) + desc=str(p), + test_cuda=(not TEST_WITH_ROCM)) ) @@ -613,6 +657,10 @@ def prepare_tests(): 'KLDivLoss': 'DistKLDivCriterion', } for test in tests: + name = test.get_name() + if ((name == "test_Max" or name == "test_Min" or name == "test_Max_with_dimension" or + name == "test_Min_with_dimension") and TEST_WITH_ROCM): + continue add_test(test) for test_params in module_tests: test_params = deepcopy(test_params) diff --git a/test/test_nn.py b/test/test_nn.py index 994f0af89f9..209b2cd0efa 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -29,7 +29,7 @@ from torch.autograd import Variable, gradcheck from torch.autograd.gradcheck import gradgradcheck from torch.nn import Parameter from torch.nn.parallel._functions import Broadcast -from common import freeze_rng_state, run_tests, TestCase, skipIfNoLapack, \ +from common import freeze_rng_state, run_tests, TestCase, skipIfNoLapack, skipIfRocm, TEST_WITH_ROCM, \ TEST_NUMPY, TEST_SCIPY, IS_WINDOWS, download_file, PY3, PY34, to_gpu, \ get_function_arglist, skipCUDAMemoryLeakCheckIf from common_cuda import TEST_CUDA, TEST_MULTIGPU, TEST_CUDNN, \ @@ -47,8 +47,10 @@ if TEST_NUMPY: import numpy as np ALL_TENSORTYPES = [torch.float, - torch.double, - torch.half] + torch.double] + +if not TEST_WITH_ROCM: + ALL_TENSORTYPES.append(torch.half) NO_HALF_TENSORTYPES = [torch.float, torch.double] @@ -237,6 +239,7 @@ class NewModuleTest(InputVariableMixin, ModuleTest): 
self.cudnn = kwargs.get('cudnn', False) self.check_inplace = kwargs.get('check_inplace', False) self.check_gradgrad = kwargs.get('check_gradgrad', True) + self.skip_double = kwargs.get('skip_double', False) def _do_test(self, test_case, module, input): test_case.check_jacobian(module, input, self.jacobian_input) @@ -357,21 +360,23 @@ class NewModuleTest(InputVariableMixin, ModuleTest): test_case.assertIsInstance(p, torch.cuda.FloatTensor) test_case.assertEqual(p.get_device(), 1) - # test double() - input = input.double().cuda() - module.double().cuda() - module(input) - for p in module.parameters(): - test_case.assertIsInstance(p, torch.cuda.DoubleTensor) - test_case.assertEqual(p.get_device(), 0) + if not self.skip_double: + # test double() + input = input.double().cuda() + module.double().cuda() + module(input) + for p in module.parameters(): + test_case.assertIsInstance(p, torch.cuda.DoubleTensor) + test_case.assertEqual(p.get_device(), 0) - # test half() - input = input.half().cuda() - module.half().cuda() - module(input) - for p in module.parameters(): - test_case.assertIsInstance(p, torch.cuda.HalfTensor) - test_case.assertEqual(p.get_device(), 0) + if not TEST_WITH_ROCM: + # test half() + input = input.half().cuda() + module.half().cuda() + module(input) + for p in module.parameters(): + test_case.assertIsInstance(p, torch.cuda.HalfTensor) + test_case.assertEqual(p.get_device(), 0) def _get_target(self): return self._get_arg('target', False) @@ -1845,6 +1850,7 @@ class TestNN(NNTestCase): @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") @repeat_test_for_types(ALL_TENSORTYPES) + @skipIfRocm def test_embedding_max_norm_cuda(self, dtype=torch.float): embedding = nn.Embedding(22, 5, max_norm=1.0).to("cuda", dtype=dtype) # nn.Embedding only takes LongTensor as input @@ -1932,6 +1938,7 @@ class TestNN(NNTestCase): @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") @repeat_test_for_types(ALL_TENSORTYPES) + @skipIfRocm def test_gumbel_softmax_st_cuda(self, dtype=torch.float): self._test_gumbel_softmax_st(True, dtype=dtype) @@ -2099,6 +2106,7 @@ class TestNN(NNTestCase): y.backward(grad) @unittest.skipIf(not TEST_CUDNN, "needs cudnn") + @skipIfRocm def test_contig_wrong_stride_cudnn(self): # x has to have batch_size 1 to test contiguous checks x = torch.randn(1, 16, 5, 5, device="cuda") @@ -2120,6 +2128,7 @@ class TestNN(NNTestCase): @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") @repeat_test_for_types(ALL_TENSORTYPES) + @skipIfRocm def test_embedding_bag_cuda(self, dtype=torch.float): self._test_EmbeddingBag(True, 'sum', False, dtype) self._test_EmbeddingBag(True, 'mean', False, dtype) @@ -2169,11 +2178,13 @@ class TestNN(NNTestCase): self._test_dropout(nn.Dropout3d, False, input) @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") + @skipIfRocm def test_Dropout_cuda(self): input = torch.Tensor(1000) self._test_dropout(nn.Dropout, True, input) @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") + @skipIfRocm def test_Dropout2d_cuda(self): b = random.randint(1, 5) w = random.randint(1, 5) @@ -2183,6 +2194,7 @@ class TestNN(NNTestCase): self._test_dropout(nn.Dropout2d, True, input) @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") + @skipIfRocm def test_Dropout3d_cuda(self): b = random.randint(1, 5) w = random.randint(1, 5) @@ -2288,6 +2300,7 @@ class TestNN(NNTestCase): self._test_InstanceNorm_general(nn.InstanceNorm1d, input, dtype=torch.float) @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") + @skipIfRocm def test_InstanceNorm1d_general_cuda(self): b = random.randint(3, 5) c 
= random.randint(3, 5) @@ -2307,6 +2320,7 @@ class TestNN(NNTestCase): self._test_InstanceNorm_general(nn.InstanceNorm2d, input, dtype=torch.float) @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") + @skipIfRocm def test_InstanceNorm2d_general_cuda(self): b = random.randint(3, 5) c = random.randint(3, 5) @@ -2328,6 +2342,7 @@ class TestNN(NNTestCase): self._test_InstanceNorm_general(nn.InstanceNorm3d, input, dtype=torch.float) @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") + @skipIfRocm def test_InstanceNorm3d_general_cuda(self): b = random.randint(3, 5) c = random.randint(2, 5) @@ -2392,6 +2407,7 @@ class TestNN(NNTestCase): self._test_LayerNorm_general() @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") + @skipIfRocm def test_LayerNorm_general_cuda(self): self._test_LayerNorm_general("cuda") self._test_LayerNorm_cuda_half() @@ -2456,6 +2472,7 @@ class TestNN(NNTestCase): self._test_GroupNorm_general(dtype=torch.float) @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") + @skipIfRocm def test_GroupNorm_general_cuda(self): self._test_GroupNorm_general("cuda", torch.float) self._test_GroupNorm_cuda_half() @@ -2571,6 +2588,7 @@ class TestNN(NNTestCase): @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") @repeat_test_for_types(ALL_TENSORTYPES) + @skipIfRocm def test_Conv2d_naive_groups_cuda(self, dtype=torch.float): self._test_Conv2d_naive_groups("cuda", dtype) @@ -2578,6 +2596,7 @@ class TestNN(NNTestCase): self._test_batchnorm_eval() @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") + @skipIfRocm def test_batchnorm_eval_cuda(self, dtype=torch.float): self._test_batchnorm_eval("cuda", dtype) @@ -2585,6 +2604,7 @@ class TestNN(NNTestCase): self._test_batchnorm_simple_average() @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") + @skipIfRocm def test_batchnorm_simple_average_cuda(self): self._test_batchnorm_simple_average(torch.cuda.FloatTensor) @@ -2609,6 +2629,7 @@ class TestNN(NNTestCase): @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") @repeat_test_for_types(ALL_TENSORTYPES) + @skipIfRocm def test_MaxPool3d_indices_cuda(self, dtype=torch.float): self._test_maxpool_indices(3, device="cuda", dtype=dtype) @@ -2617,6 +2638,7 @@ class TestNN(NNTestCase): @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") @repeat_test_for_types(ALL_TENSORTYPES) + @skipIfRocm def test_AdaptiveMaxPool1d_indices_cuda(self, dtype=torch.float): self._test_maxpool_indices(1, adaptive=True, device="cuda", dtype=dtype) @@ -2625,6 +2647,7 @@ class TestNN(NNTestCase): @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") @repeat_test_for_types(ALL_TENSORTYPES) + @skipIfRocm def test_AdaptiveMaxPool2d_indices_cuda(self, dtype=torch.float): self._test_maxpool_indices(2, adaptive=True, device="cuda", dtype=dtype) @@ -2632,6 +2655,7 @@ class TestNN(NNTestCase): self._test_maxpool_indices(3, adaptive=True) @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") + @skipIfRocm @repeat_test_for_types(ALL_TENSORTYPES) def test_AdaptiveMaxPool3d_indices_cuda(self, dtype=torch.float): self._test_maxpool_indices(3, adaptive=True, device="cuda", dtype=dtype) @@ -2669,10 +2693,12 @@ class TestNN(NNTestCase): _assertGradAndGradgradChecks(self, lambda y: dp.scatter(y, (0, 1)), (x,)) @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") + @skipIfRocm def test_scatter_cpu(self): self._test_scatter(torch.randn(4, 4)) @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") + @skipIfRocm def test_scatter_gpu(self): self._test_scatter(torch.randn(4, 4).cuda()) @@ -2719,14 +2745,17 @@ class TestNN(NNTestCase): 
_assertGradAndGradgradChecks(self, lambda x, y: dp.gather((x, y), output_device), inputs) @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") + @skipIfRocm def test_gather_cpu(self): self._test_gather(-1) @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") + @skipIfRocm def test_gather_gpu(self): self._test_gather(0) @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") + @skipIfRocm def test_gather_different_len_dicts(self): inputs = ( {'a': Variable(torch.randn(1, 2).cuda(0), requires_grad=True)}, @@ -2743,12 +2772,14 @@ class TestNN(NNTestCase): _assertGradAndGradgradChecks(self, lambda *i: Broadcast.apply((0, 1), *i), variables) @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") + @skipIfRocm def test_broadcast_double_backwards_gpu(self): self._test_broadcast_double_backwards(torch.randn(4, 4).cuda(), torch.randn(4, 4).cuda(), torch.randn(4, 4).cuda()) @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") + @skipIfRocm def test_broadcast_not_requiring_grad(self): variables = [ Variable(torch.randn(1, 2).cuda(), requires_grad=True), @@ -2763,6 +2794,7 @@ class TestNN(NNTestCase): self.assertEqual(input_var.requires_grad, broadcasted_var.requires_grad) @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") + @skipIfRocm def test_broadcast_no_grad(self): x = torch.randn(1, 2, dtype=torch.float32, requires_grad=True, device='cuda') with torch.no_grad(): @@ -2772,6 +2804,7 @@ class TestNN(NNTestCase): self.assertFalse(output.requires_grad) @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") + @skipIfRocm def test_replicate(self): module = nn.Linear(10, 5).float().cuda() input = Variable(torch.randn(2, 10).float().cuda()) @@ -2784,6 +2817,7 @@ class TestNN(NNTestCase): self.assertEqual(replica(replica_input).data, expected_output) @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") + @skipIfRocm def test_replicate_buffers(self): net = nn.Module() net.bn = nn.BatchNorm2d(10) @@ -2795,6 +2829,7 @@ class TestNN(NNTestCase): self.assertEqual(replica.bn.num_batches_tracked.get_device(), i, 'buffer on wrong device') @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") + @skipIfRocm def test_parallel_apply(self): l1 = nn.Linear(10, 5).to("cuda:0", torch.float) l2 = nn.Linear(10, 5).to("cuda:1", torch.float) @@ -2813,6 +2848,7 @@ class TestNN(NNTestCase): self.assertEqual(out.data, expected) @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") + @skipIfRocm def test_data_parallel_multiple_input(self): class TestModule(nn.Module): @@ -2879,6 +2915,7 @@ class TestNN(NNTestCase): local_test(out) @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") + @skipIfRocm def test_data_parallel_small_back(self): l = nn.Linear(10, 5).float().cuda() i = Variable(torch.randn(20, 10).float().cuda()) @@ -2886,6 +2923,7 @@ class TestNN(NNTestCase): self.assertEqual(out, l(i)) @unittest.skipIf(not TEST_MULTIGPU or not PY3, "multi-GPU not supported") + @skipIfRocm def test_data_parallel_model_no_refcycles(self): # Python 2.7 will create reference cycles with the following # Module on multiple GPUs, but Python 3 shouldn't unless @@ -2909,6 +2947,7 @@ class TestNN(NNTestCase): self.assertEqual(refcycles, 0) @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") + @skipIfRocm def test_data_parallel_no_grad(self): test = self @@ -2924,6 +2963,7 @@ class TestNN(NNTestCase): self.assertRaises(AssertionError, lambda: dp.data_parallel(l, i, (0, 1))) @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") 
+ @skipIfRocm def test_data_parallel(self): l = nn.Linear(10, 5).float().cuda() i = Variable(torch.randn(20, 10).float().cuda(1)) @@ -2952,6 +2992,7 @@ class TestNN(NNTestCase): out = dp.data_parallel(l, i) @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") + @skipIfRocm def test_data_parallel_sparse(self): l = nn.Embedding(10, 5, sparse=True).to("cuda:1") i = torch.randint(10, (20, 5), device="cuda:1", dtype=torch.long) @@ -2979,6 +3020,7 @@ class TestNN(NNTestCase): out = dp.data_parallel(l, i) @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") + @skipIfRocm def test_data_parallel_nested_output(self): def fn(input): return [ @@ -3010,6 +3052,7 @@ class TestNN(NNTestCase): self.assertIsInstance(output[3]['b'][0], torch.Tensor) @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") + @skipIfRocm def test_data_parallel_nested_input(self): def fn(input): return input[1][0] @@ -3358,6 +3401,7 @@ class TestNN(NNTestCase): @unittest.skipIf(not TEST_CUDA, 'CUDA not available') @unittest.skipIf(not TEST_CUDNN, 'CUDNN not available') + @skipIfRocm def test_Conv2d_inconsistent_types_on_GPU_with_cudnn(self): inputs = Variable(torch.randn(4, 1, 7, 7).float().cuda()) weights = Variable(torch.randn(1, 1, 3, 3).double().cuda()) @@ -3374,6 +3418,7 @@ class TestNN(NNTestCase): @unittest.skipIf(not TEST_CUDA, 'CUDA not available') @unittest.skipIf(not TEST_CUDNN, 'CUDNN not available') @repeat_test_for_types(ALL_TENSORTYPES) + @skipIfRocm def test_Conv2d_deterministic_cudnn(self, dtype=torch.float): inputs = torch.randn(2, 3, 5, 5, device="cuda", dtype=dtype, requires_grad=True) with cudnn.flags(enabled=True, benchmark=True, deterministic=True): @@ -3510,6 +3555,7 @@ class TestNN(NNTestCase): # For https://github.com/pytorch/pytorch/pull/1273 # Almost identical to the above `test_Conv2d_naive_groups` + @skipIfRocm def test_Conv2d_groups_nobias(self): dev_dtypes = [("cpu", torch.float)] if TEST_CUDA: @@ -3544,6 +3590,7 @@ class TestNN(NNTestCase): # Very similar to test_Conv2d_naive_groups but with special care to handle # the number of groups == number of input channels @unittest.skipIf(not TEST_CUDA, 'CUDA not available') + @skipIfRocm @repeat_test_for_types(ALL_TENSORTYPES) def test_Conv2d_depthwise_naive_groups_cuda(self, dtype=torch.float): for depth_multiplier in [1, 2]: @@ -3891,6 +3938,7 @@ class TestNN(NNTestCase): @unittest.skipIf(not TEST_CUDA, 'CUDA not available') @repeat_test_for_types(ALL_TENSORTYPES) + @skipIfRocm def test_variable_sequence_cuda(self, dtype=torch.float): self._test_variable_sequence("cuda", dtype) @@ -3908,6 +3956,7 @@ class TestNN(NNTestCase): (hx + cx).sum().backward() @unittest.skipIf(not (TEST_CUDNN and TEST_MULTIGPU), 'CUDNN or multi-gpu not available') + @skipIfRocm def test_cudnn_rnn_dropout_states_device(self): rnn = nn.RNN(10, 20, num_layers=2, dropout=.5) device = 1 @@ -3917,6 +3966,7 @@ class TestNN(NNTestCase): output = rnn(input, hx) @unittest.skipIf(not TEST_CUDNN, 'CUDNN not available') + @skipIfRocm def test_cudnn_weight_format(self): rnns = [ nn.LSTM(10, 20, batch_first=True), @@ -3964,6 +4014,7 @@ class TestNN(NNTestCase): self.assertEqual(weight_data, all_vars[4].data) @unittest.skipIf(not TEST_CUDNN, 'CUDNN not available') + @skipIfRocm def test_cudnn_weight_tying(self): rnns = [ nn.LSTM(10, 20, batch_first=True, bidirectional=True), @@ -3997,6 +4048,7 @@ class TestNN(NNTestCase): @unittest.skipIf(not TEST_CUDA, 'CUDA not available') @repeat_test_for_types(NO_HALF_TENSORTYPES) + @skipIfRocm def test_cuda_rnn_fused(self, 
dtype=torch.float): def copy_rnn(rnn1, rnn2): @@ -4161,6 +4213,7 @@ class TestNN(NNTestCase): @unittest.skipIf(not TEST_CUDA, 'CUDA not available') @repeat_test_for_types(ALL_TENSORTYPES) + @skipIfRocm def test_rnn_retain_variables_cuda(self, dtype=torch.float): with torch.backends.cudnn.flags(enabled=False): self._test_rnn_retain_variables("cuda", dtype) @@ -4308,6 +4361,7 @@ class TestNN(NNTestCase): @unittest.skipIf(not TEST_CUDNN, "needs cudnn") @default_tensor_type(torch.FloatTensor) # FIXME: just until torch.cuda.DoubleTensor.sum() implemented + @skipIfRocm def test_RNN_cpu_vs_cudnn_no_dropout(self): self._test_RNN_cpu_vs_cudnn(0) @@ -4462,6 +4516,7 @@ class TestNN(NNTestCase): @unittest.skipIf(not TEST_CUDA, 'CUDA not available') @repeat_test_for_types(ALL_TENSORTYPES) + @skipIfRocm def test_noncontig_conv_grad_cuda(self, dtype=torch.float): # FIXME: remove after adding non-contiguous grad tests for all modules module = nn.Conv2d(3, 5, kernel_size=3, padding=1).to("cuda", dtype) @@ -4661,6 +4716,7 @@ class TestNN(NNTestCase): gradgradcheck(func, [v]) @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") + @skipIfRocm def test_batchnorm_cudnn_half(self): # THNN input = torch.randint(1, 10, (2, 3, 2, 2), dtype=torch.half, device="cuda", requires_grad=True) @@ -4708,6 +4764,7 @@ class TestNN(NNTestCase): self._test_batchnorm_update_stats() @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") + @skipIfRocm def test_batchnorm_update_stats_cuda(self): self._test_batchnorm_update_stats("cuda", torch.float) @@ -5002,6 +5059,7 @@ class TestNN(NNTestCase): with self.assertRaisesRegex(RuntimeError, "expected input and grid to be on same device"): F.grid_sample(input.cuda(), grid) + @skipIfRocm def test_grid_sample(self): def test(N, C, H, W, mode, padding_mode): def test_shape(N, C, IH, IW, H, W, mode, padding_mode): @@ -5149,6 +5207,7 @@ class TestNN(NNTestCase): with cudnn.flags(enabled=False): test(N, C, H, W, mode, padding_mode) + @skipIfRocm def test_grid_sample_3d(self): def test(N, C, D, H, W, mode, padding_mode): def test_shape(N, C, ID, IH, IW, D, H, W, mode, padding_mode): @@ -5262,6 +5321,7 @@ class TestNN(NNTestCase): test(N, C, D, H, W, mode, padding_mode) + @skipIfRocm def test_affine_grid(self): # test known input on CPU input = torch.arange(1., 7).view(1, 2, 3) @@ -5760,6 +5820,7 @@ class TestNN(NNTestCase): self._test_conv_noncontig_weights(self, torch.device('cpu')) @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") + @skipIfRocm def test_conv_noncontig_weights_cuda(self): self._test_conv_noncontig_weights(self, torch.device('cuda')) @@ -5886,6 +5947,7 @@ class TestNN(NNTestCase): no_weight) @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") + @skipIfRocm def test_cudnn_noncontiguous_weight(self): # Noncontiguous weights must be contiguous() before being # passed to cuDNN @@ -6494,9 +6556,12 @@ def add_test(test, decorator=None): test=test, kwargs=kwargs: test.test_cuda(self, dtype=torch.float, **kwargs)) add(cuda_test_name + '_double', lambda self, test=test, kwargs=kwargs: test.test_cuda(self, dtype=torch.double, **kwargs)) + + @skipIfRocm + def test_half(self, test=test, kwargs=kwargs): + test.test_cuda(self, dtype=torch.half, **kwargs) if getattr(test, 'check_half', True): - add(cuda_test_name + '_half', lambda self, - test=test: test.test_cuda(self, dtype=torch.half, **kwargs)) + add(cuda_test_name + '_half', test_half) else: add(cuda_test_name, lambda self, test=test, kwargs=kwargs: test.test_cuda(self, **kwargs)) @@ -6512,14 +6577,16 @@ new_criterion_tests = [ dict( 
module_name='BCEWithLogitsLoss', input_fn=lambda: torch.rand(15, 10).clamp_(1e-2, 1 - 1e-2), - target_fn=lambda: torch.randn(15, 10).gt(0).double() + target_fn=lambda: torch.randn(15, 10).gt(0).double(), + decorator=skipIfRocm, ), dict( module_name='BCEWithLogitsLoss', constructor_args=(torch.rand(10),), input_fn=lambda: torch.rand(15, 10).clamp_(1e-2, 1 - 1e-2), target_fn=lambda: torch.randn(15, 10).gt(0).double(), - desc='weights' + desc='weights', + decorator=skipIfRocm, ), dict( module_name='BCEWithLogitsLoss', @@ -6535,7 +6602,8 @@ new_criterion_tests = [ reference_fn=lambda i, t, m: loss_reference_fns['NLLLossNd'](i, t, reduction=get_reduction(m)), check_sum_reduction=True, - desc='2d' + desc='2d', + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='NLLLoss', @@ -6545,6 +6613,7 @@ new_criterion_tests = [ reference_fn=lambda i, t, m: loss_reference_fns['NLLLossNd'](i, t, weight=get_weight(m)), desc='2d_weights', + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='NLLLoss', @@ -6554,6 +6623,7 @@ new_criterion_tests = [ reference_fn=lambda i, t, m: loss_reference_fns['NLLLossNd'](i, t, ignore_index=1), desc='2d_ignore_index', + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='NLLLoss', @@ -6562,7 +6632,8 @@ new_criterion_tests = [ reference_fn=lambda i, t, m: loss_reference_fns['NLLLossNd'](i, t, reduction=get_reduction(m)), check_sum_reduction=True, - desc='higher_dim' + desc='higher_dim', + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='NLLLoss', @@ -6571,13 +6642,15 @@ new_criterion_tests = [ reference_fn=lambda i, t, m: loss_reference_fns['NLLLossNd'](i, t, reduction=get_reduction(m)), check_sum_reduction=True, - desc='dim_is_3' + desc='dim_is_3', + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='PoissonNLLLoss', input_size=(2, 3, 4, 5), target_fn=lambda: torch.randn(2, 3, 4, 5).floor_().abs_(), desc='no_full_loss', # without sterling approx + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='PoissonNLLLoss', @@ -6585,6 +6658,7 @@ new_criterion_tests = [ input_fn=lambda: torch.randn(2, 3, 4, 5).abs_().add_(0.001), target_fn=lambda: torch.randn(2, 3, 4, 5).floor_().abs_(), desc='full_loss', # with sterling approx + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='L1Loss', @@ -6657,6 +6731,7 @@ new_criterion_tests = [ desc='weights', check_sum_reduction=True, check_gradgrad=False, + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='CTCLoss', @@ -6669,6 +6744,7 @@ new_criterion_tests = [ check_sum_reduction=True, check_gradgrad=False, check_half=False, + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='CTCLoss', @@ -6682,6 +6758,7 @@ new_criterion_tests = [ check_sum_reduction=True, check_gradgrad=False, check_half=False, + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='CTCLoss', @@ -6696,6 +6773,7 @@ new_criterion_tests = [ check_gradgrad=False, check_half=False, convert_target=False, + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='CTCLoss', @@ -6710,6 +6788,7 @@ new_criterion_tests = [ check_gradgrad=False, check_half=False, convert_target=False, + test_cuda=(not TEST_WITH_ROCM), ), ] @@ -6721,7 +6800,7 @@ def poissonnllloss_no_reduce_test(): constructor=wrap_functional( lambda i: F.poisson_nll_loss(i, t.type_as(i), reduction='none')), input_fn=lambda: torch.rand(10, 10), - pickle=False) + pickle=False, test_cuda=(not TEST_WITH_ROCM)) def bceloss_no_reduce_test(): @@ -6733,7 +6812,7 @@ def bceloss_no_reduce_test(): input_fn=lambda: torch.rand(15, 10).clamp_(2.8e-2, 1 - 2.8e-2), reference_fn=lambda i, m: -(t * i.log() + (1 - t) * (1 
- i).log()), check_gradgrad=False, - pickle=False) + pickle=False, test_cuda=(not TEST_WITH_ROCM)) def bceloss_no_reduce_scalar_test(): @@ -6759,7 +6838,7 @@ def bceloss_weights_no_reduce_test(): input_fn=lambda: torch.rand(15, 10).clamp_(2.8e-2, 1 - 2.8e-2), reference_fn=lambda i, m: -(t * i.log() + (1 - t) * (1 - i).log()) * weights, check_gradgrad=False, - pickle=False) + pickle=False, test_cuda=(not TEST_WITH_ROCM)) def bceloss_weights_no_reduce_scalar_test(): @@ -6786,7 +6865,8 @@ def bce_with_logistic_no_reduce_test(): input_fn=lambda: torch.rand(15, 10).clamp_(2.8e-2, 1 - 2.8e-2), reference_fn=lambda i, m: -(t * sigmoid(i).log() + (1 - t) * (1 - sigmoid(i)).log()), check_gradgrad=False, - pickle=False) + pickle=False, + decorator=skipIfRocm) def bce_with_logistic_no_reduce_scalar_test(): @@ -6799,7 +6879,8 @@ def bce_with_logistic_no_reduce_scalar_test(): input_fn=lambda: torch.rand(()).clamp_(2.8e-2, 1 - 2.8e-2), reference_fn=lambda i, m: -(t * sigmoid(i).log() + (1 - t) * (1 - sigmoid(i)).log()), check_gradgrad=False, - pickle=False) + pickle=False, + decorator=skipIfRocm) def kldivloss_with_target_no_reduce_test(): @@ -6811,7 +6892,7 @@ def kldivloss_with_target_no_reduce_test(): input_fn=lambda: torch.rand(10, 10), reference_fn=lambda t, _: loss_reference_fns['KLDivLoss'](i.type_as(t), t, reduction='none'), - pickle=False) + pickle=False, test_cuda=(not TEST_WITH_ROCM)) def kldivloss_no_reduce_test(): @@ -6823,7 +6904,8 @@ def kldivloss_no_reduce_test(): input_fn=lambda: torch.rand(10, 10).log(), reference_fn=lambda i, _: loss_reference_fns['KLDivLoss'](i, t.type_as(i), reduction='none'), - pickle=False) + pickle=False, + decorator=skipIfRocm) def kldivloss_no_reduce_scalar_test(): @@ -6846,7 +6928,7 @@ def l1loss_no_reduce_test(): lambda i: F.l1_loss(i, t.type_as(i), reduction='none')), input_fn=lambda: torch.randn(2, 3, 4), reference_fn=lambda i, m: (i - t.type_as(i)).abs(), - pickle=False) + pickle=False, test_cuda=(not TEST_WITH_ROCM)) def l1loss_no_reduce_scalar_test(): @@ -7058,7 +7140,7 @@ def smoothl1loss_no_reduce_test(): input_fn=lambda: torch.randn(2, 3, 4), reference_fn=lambda i, _: loss_reference_fns['SmoothL1Loss'](i, t.type_as(i), reduction='none'), - pickle=False) + pickle=False, test_cuda=(not TEST_WITH_ROCM)) def smoothl1loss_no_reduce_scalar_test(): @@ -7150,7 +7232,7 @@ def softmarginloss_no_reduce_test(): input_fn=lambda: torch.randn(5, 5), reference_fn=lambda i, _: loss_reference_fns['SoftMarginLoss'](i, t.type_as(i), reduction='none'), - pickle=False) + pickle=False, test_cuda=(not TEST_WITH_ROCM)) def multilabelsoftmarginloss_no_reduce_test(): @@ -7163,7 +7245,7 @@ def multilabelsoftmarginloss_no_reduce_test(): reference_fn=lambda i, m: (-(t * i.sigmoid().log() + (1 - t) * (-i).sigmoid().log())).sum(dim=1) / i.size(1), check_gradgrad=False, - pickle=False) + pickle=False, decorator=skipIfRocm) def multilabelsoftmarginloss_weights_no_reduce_test(): @@ -7179,7 +7261,7 @@ def multilabelsoftmarginloss_weights_no_reduce_test(): (-(t * i.sigmoid().log() + (1 - t) * (-i).sigmoid().log()) * weights).sum(dim=1) / i.size(1), check_sum_reduction=True, check_gradgrad=False, - pickle=False) + pickle=False, decorator=skipIfRocm) def multimarginloss_no_reduce_test(): @@ -7304,6 +7386,8 @@ new_module_tests = [ cudnn=True, check_eval=True, desc='affine', + skip_double=TEST_WITH_ROCM, + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='BatchNorm1d', @@ -7312,6 +7396,8 @@ new_module_tests = [ cudnn=True, check_eval=True, desc='3d_input', + skip_double=TEST_WITH_ROCM, + 
test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='BatchNorm1d', @@ -7320,6 +7406,8 @@ new_module_tests = [ cudnn=True, check_eval=True, desc='affine_simple_average', + skip_double=TEST_WITH_ROCM, + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='BatchNorm1d', @@ -7328,6 +7416,8 @@ new_module_tests = [ cudnn=True, check_eval=True, desc='not_affine', + skip_double=TEST_WITH_ROCM, + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='BatchNorm1d', @@ -7336,6 +7426,8 @@ new_module_tests = [ cudnn=True, check_eval=True, desc='not_tracking_stats', + skip_double=TEST_WITH_ROCM, + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='BatchNorm1d', @@ -7344,6 +7436,8 @@ new_module_tests = [ cudnn=True, check_eval=True, desc='3d_input_not_affine', + skip_double=TEST_WITH_ROCM, + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='BatchNorm2d', @@ -7351,6 +7445,8 @@ new_module_tests = [ input_size=(2, 3, 6, 6), cudnn=True, check_eval=True, + skip_double=TEST_WITH_ROCM, + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='BatchNorm2d', @@ -7359,6 +7455,8 @@ new_module_tests = [ cudnn=True, check_eval=True, desc='2d_simple_average', + skip_double=TEST_WITH_ROCM, + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='BatchNorm2d', @@ -7367,6 +7465,8 @@ new_module_tests = [ cudnn=True, check_eval=True, desc='momentum', + skip_double=TEST_WITH_ROCM, + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='BatchNorm2d', @@ -7375,6 +7475,8 @@ new_module_tests = [ cudnn=True, check_eval=True, desc='not_affine', + skip_double=TEST_WITH_ROCM, + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='BatchNorm2d', @@ -7383,6 +7485,8 @@ new_module_tests = [ cudnn=True, check_eval=True, desc='not_tracking_stats', + skip_double=TEST_WITH_ROCM, + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='BatchNorm3d', @@ -7390,6 +7494,7 @@ new_module_tests = [ input_size=(2, 3, 4, 4, 4), cudnn=True, check_eval=True, + decorator=skipIfRocm, ), dict( module_name='BatchNorm3d', @@ -7398,6 +7503,7 @@ new_module_tests = [ cudnn=True, check_eval=True, desc='3d_simple_average', + decorator=skipIfRocm, ), dict( module_name='BatchNorm3d', @@ -7406,6 +7512,7 @@ new_module_tests = [ cudnn=True, check_eval=True, desc='momentum', + decorator=skipIfRocm, ), dict( module_name='BatchNorm3d', @@ -7414,6 +7521,7 @@ new_module_tests = [ cudnn=True, check_eval=True, desc='not_affine', + decorator=skipIfRocm, ), dict( module_name='BatchNorm3d', @@ -7422,6 +7530,7 @@ new_module_tests = [ cudnn=True, check_eval=True, desc='not_tracking_stats', + decorator=skipIfRocm, ), dict( module_name='InstanceNorm1d', @@ -7429,6 +7538,7 @@ new_module_tests = [ input_size=(4, 3, 15), cudnn=True, check_eval=True, + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='InstanceNorm1d', @@ -7437,6 +7547,7 @@ new_module_tests = [ cudnn=True, check_eval=True, desc='tracking_stats', + decorator=skipIfRocm ), dict( module_name='InstanceNorm2d', @@ -7444,6 +7555,7 @@ new_module_tests = [ input_size=(2, 3, 6, 6), cudnn=True, check_eval=True, + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='InstanceNorm2d', @@ -7452,6 +7564,7 @@ new_module_tests = [ cudnn=True, check_eval=True, desc='tracking_stats', + decorator=skipIfRocm ), dict( module_name='InstanceNorm3d', @@ -7459,6 +7572,7 @@ new_module_tests = [ input_size=(2, 3, 4, 4, 4), cudnn=True, check_eval=True, + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='InstanceNorm3d', @@ -7467,6 +7581,7 @@ new_module_tests = [ cudnn=True, check_eval=True, desc='tracking_stats', + decorator=skipIfRocm 
), dict( module_name='LayerNorm', @@ -7475,6 +7590,7 @@ new_module_tests = [ cudnn=True, check_eval=True, desc='1d_elementwise_affine', + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='LayerNorm', @@ -7483,6 +7599,7 @@ new_module_tests = [ cudnn=True, check_eval=True, desc='1d_no_elementwise_affine', + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='LayerNorm', @@ -7491,6 +7608,7 @@ new_module_tests = [ cudnn=True, check_eval=True, desc='3d_elementwise_affine', + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='LayerNorm', @@ -7499,6 +7617,7 @@ new_module_tests = [ cudnn=True, check_eval=True, desc='3d_no_elementwise_affine', + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='GroupNorm', @@ -7507,6 +7626,7 @@ new_module_tests = [ cudnn=True, check_eval=True, desc='1d_affine', + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='GroupNorm', @@ -7514,7 +7634,8 @@ new_module_tests = [ input_size=(4, 5, 5), cudnn=True, check_eval=True, - desc='1d_no_affine_IN', # this setting is equivalent with InstanceNorm + desc='1d_no_affine_IN', # this setting is equivalent with InstanceNormi + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='GroupNorm', @@ -7523,6 +7644,7 @@ new_module_tests = [ cudnn=True, check_eval=True, desc='1d_no_affine_LN', # this setting is equivalent with LayerNorm + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='GroupNorm', @@ -7531,6 +7653,7 @@ new_module_tests = [ cudnn=True, check_eval=True, desc='2d_affine', + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='GroupNorm', @@ -7539,6 +7662,7 @@ new_module_tests = [ cudnn=True, check_eval=True, desc='2d_no_affine_IN', # this setting is equivalent with InstanceNorm + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='GroupNorm', @@ -7547,12 +7671,15 @@ new_module_tests = [ cudnn=True, check_eval=True, desc='2d_no_affine_LN', # this setting is equivalent with LayerNorm + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='Conv1d', constructor_args=(4, 5, 3), input_size=(2, 4, 10), cudnn=True, + skip_double=TEST_WITH_ROCM, + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='Conv1d', @@ -7560,51 +7687,66 @@ new_module_tests = [ input_size=(2, 4, 10), cudnn=True, desc='stride', + skip_double=TEST_WITH_ROCM, + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='Conv1d', constructor_args=(4, 5, 3, 1, 1), input_size=(2, 4, 10), cudnn=True, - desc='pad1' + desc='pad1', + skip_double=TEST_WITH_ROCM, + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='Conv1d', constructor_args=(4, 5, 5, 1, 2), input_size=(2, 4, 10), cudnn=True, - desc='pad2' + desc='pad2', + skip_double=TEST_WITH_ROCM, + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='Conv1d', constructor_args=(4, 4, 3, 1, 1), input_size=(1, 4, 1), cudnn=True, - desc='pad1size1' + desc='pad1size1', + skip_double=TEST_WITH_ROCM, + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='Conv1d', constructor_args=(4, 4, 5, 1, 2), input_size=(1, 4, 1), cudnn=True, - desc='pad2size1' + desc='pad2size1', + skip_double=TEST_WITH_ROCM, + test_cuda=(not TEST_WITH_ROCM), ), dict( fullname='Conv1d_dilated', constructor=lambda: nn.Conv1d(4, 5, kernel_size=3, dilation=2), input_size=(2, 4, 10), + skip_double=TEST_WITH_ROCM, + test_cuda=(not TEST_WITH_ROCM), ), dict( fullname='Conv1d_groups', constructor=lambda: nn.Conv1d(4, 6, kernel_size=3, groups=2), input_size=(2, 4, 6), cudnn=True, + skip_double=TEST_WITH_ROCM, + test_cuda=(not TEST_WITH_ROCM), ), dict( fullname='ConvTranspose1d', constructor=lambda: nn.ConvTranspose1d(3, 4, kernel_size=3, 
stride=(3,), padding=1, output_padding=(1,)), cudnn=True, input_size=(1, 3, 7), + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='ConvTranspose1d', @@ -7612,6 +7754,7 @@ new_module_tests = [ input_size=(1, 3, 6), cudnn=True, desc='no_bias', + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='ConvTranspose1d', @@ -7619,12 +7762,14 @@ new_module_tests = [ input_size=(1, 3, 6), cudnn=True, desc='dilated', + test_cuda=(not TEST_WITH_ROCM), ), dict( fullname='ConvTranspose1d_groups', constructor=lambda: nn.ConvTranspose1d(4, 6, 3, stride=(3,), padding=1, output_padding=(1,), groups=2), cudnn=True, input_size=(2, 4, 7), + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='MaxPool1d', @@ -7642,6 +7787,7 @@ new_module_tests = [ constructor_args=(3, 4, (3, 2)), input_size=(2, 3, 7, 5), cudnn=True, + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='Conv2d', @@ -7649,6 +7795,7 @@ new_module_tests = [ input_size=(2, 3, 6, 6), cudnn=True, desc='strided', + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='Conv2d', @@ -7656,6 +7803,7 @@ new_module_tests = [ input_size=(2, 3, 6, 6), cudnn=True, desc='padding', + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='Conv2d', @@ -7663,6 +7811,7 @@ new_module_tests = [ input_size=(2, 3, 8, 8), cudnn=True, desc='dilated', + decorator=skipIfRocm, ), dict( module_name='Conv2d', @@ -7670,23 +7819,27 @@ new_module_tests = [ input_size=(2, 3, 6, 5), cudnn=True, desc='no_bias', + test_cuda=(not TEST_WITH_ROCM), ), dict( fullname='Conv2d_groups', constructor=lambda: nn.Conv2d(4, 6, (3, 2), groups=2), input_size=(2, 4, 6, 5), cudnn=True, + decorator=skipIfRocm, ), dict( fullname='Conv2d_groups_thnn', constructor=lambda: nn.Conv2d(4, 6, (3, 2), groups=2), input_size=(2, 4, 6, 5), + decorator=skipIfRocm, ), dict( module_name='ConvTranspose2d', constructor_args=(3, 4, 3, (3, 2), 1, (1, 1)), cudnn=True, input_size=(1, 3, 7, 6), + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='ConvTranspose2d', @@ -7694,6 +7847,7 @@ new_module_tests = [ input_size=(1, 3, 6, 7), cudnn=True, desc='dilated', + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='ConvTranspose2d', @@ -7701,37 +7855,44 @@ new_module_tests = [ input_size=(1, 3, 6, 7), cudnn=True, desc='no_bias', + test_cuda=(not TEST_WITH_ROCM), ), dict( fullname='ConvTranspose2d_groups', constructor=lambda: nn.ConvTranspose2d(2, 4, (2, 3), groups=2), input_size=(1, 2, 4, 5), cudnn=True, + test_cuda=(not TEST_WITH_ROCM), ), dict( fullname='Conv2d_depthwise', constructor=lambda: nn.Conv2d(4, 4, (3, 3), groups=4), input_size=(2, 4, 6, 6), + test_cuda=(not TEST_WITH_ROCM), ), dict( fullname='Conv2d_depthwise_with_multiplier', constructor=lambda: nn.Conv2d(4, 8, (3, 3), groups=4), input_size=(2, 4, 6, 6), + test_cuda=(not TEST_WITH_ROCM), ), dict( fullname='Conv2d_depthwise_strided', constructor=lambda: nn.Conv2d(4, 4, (3, 3), stride=(2, 2), groups=4), input_size=(2, 4, 6, 6), + test_cuda=(not TEST_WITH_ROCM), ), dict( fullname='Conv2d_depthwise_padded', constructor=lambda: nn.Conv2d(4, 4, (3, 3), padding=(1, 1), groups=4), input_size=(2, 4, 6, 6), + test_cuda=(not TEST_WITH_ROCM), ), dict( fullname='Conv2d_depthwise_dilated', constructor=lambda: nn.Conv2d(4, 4, (2, 2), dilation=(2, 2), groups=4), input_size=(2, 4, 5, 5), + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='MaxPool2d', @@ -7798,19 +7959,22 @@ new_module_tests = [ module_name='LocalResponseNorm', constructor_args=(3, ), input_size=(1, 5, 7), - desc='1d' + desc='1d', + test_cuda=(not TEST_WITH_ROCM) ), dict( 
module_name='LocalResponseNorm', constructor_args=(2, ), input_size=(1, 5, 7, 7), - desc='2d_uneven_pad' + desc='2d_uneven_pad', + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='LocalResponseNorm', constructor_args=(1, 1, 0.5, 2), input_size=(1, 5, 7, 7, 7), - desc='3d_custom_params' + desc='3d_custom_params', + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='ReflectionPad1d', @@ -7863,6 +8027,7 @@ new_module_tests = [ constructor_args=(3, 4, (2, 3, 4)), input_size=(2, 3, 3, 4, 5), cudnn=True, + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='Conv3d', @@ -7870,6 +8035,7 @@ new_module_tests = [ input_size=(2, 3, 3, 4, 5), cudnn=True, desc='no_bias', + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='Conv3d', @@ -7877,6 +8043,7 @@ new_module_tests = [ input_size=(2, 3, 5, 5, 5), cudnn=True, desc='stride', + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='Conv3d', @@ -7884,28 +8051,33 @@ new_module_tests = [ input_size=(2, 3, 5, 5, 5), cudnn=True, desc='stride_padding', + test_cuda=(not TEST_WITH_ROCM), ), dict( fullname='Conv3d_groups', constructor=lambda: nn.Conv3d(4, 6, kernel_size=3, groups=2), input_size=(2, 4, 4, 5, 4), cudnn=True, + test_cuda=(not TEST_WITH_ROCM), ), dict( fullname='Conv3d_dilated', constructor=lambda: nn.Conv3d(3, 4, kernel_size=2, dilation=2), input_size=(2, 3, 5, 5, 5), + test_cuda=(not TEST_WITH_ROCM), ), dict( fullname='Conv3d_dilated_strided', constructor=lambda: nn.Conv3d(3, 4, kernel_size=2, dilation=2, stride=2), input_size=(2, 3, 5, 5, 5), + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='ConvTranspose3d', constructor_args=(2, 3, (2, 3, 2)), cudnn=True, input_size=(1, 2, 4, 5, 4), + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='ConvTranspose3d', @@ -7913,6 +8085,7 @@ new_module_tests = [ cudnn=True, input_size=(1, 2, 4, 5, 4), desc='dilated', + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='MaxPool3d', @@ -7991,6 +8164,8 @@ new_module_tests = [ jacobian_input=False, check_gradgrad=False, desc='mean', + test_cuda=(not TEST_WITH_ROCM), + decorator=skipIfRocm ), dict( module_name='EmbeddingBag', @@ -7999,6 +8174,8 @@ new_module_tests = [ jacobian_input=False, check_gradgrad=False, desc='sum', + test_cuda=(not TEST_WITH_ROCM), + decorator=skipIfRocm ), dict( module_name='EmbeddingBag', @@ -8014,6 +8191,8 @@ new_module_tests = [ input_fn=lambda: torch.randperm(2).repeat(1, 2), jacobian_input=False, check_gradgrad=False, + decorator=skipIfRocm, + test_cuda=(not TEST_WITH_ROCM) ), dict( constructor=lambda: nn.Embedding(4, 3, sparse=True), @@ -8021,6 +8200,7 @@ new_module_tests = [ jacobian_input=False, fullname='Embedding_sparse', check_gradgrad=False, + test_cuda=(not TEST_WITH_ROCM) ), dict( constructor=lambda: nn.FractionalMaxPool2d( @@ -8310,7 +8490,8 @@ new_module_tests = [ input_size=(3, 2, 5), constructor_args=(2.,), check_inplace=True, - reference_fn=lambda x, _: torch.where(x >= 0, x, 2. * ((.5 * x).exp() - 1)) + reference_fn=lambda x, _: torch.where(x >= 0, x, 2. 
* ((.5 * x).exp() - 1)), + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='CELU', @@ -8323,30 +8504,35 @@ new_module_tests = [ dict( module_name='GLU', input_size=(5, 6), + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='GLU', constructor_args=(1,), input_size=(5, 6, 7), - desc='dim' + desc='dim', + test_cuda=(not TEST_WITH_ROCM) ), dict( constructor=wrap_functional(F.softmax, dim=-1), input_size=(2, 128), # trigger the last-dim algo in CUDA fullname='softmax_lastdim', pickle=False, + test_cuda=(not TEST_WITH_ROCM) ), dict( constructor=wrap_functional(F.softmax, dim=1), input_size=(2, 128, 2, 2), # trigger special case of spatial CUDA algo fullname='softmax_spatial_special', pickle=False, + test_cuda=(not TEST_WITH_ROCM) ), dict( constructor=wrap_functional(F.softmax, dim=1), input_size=(2, 2, 4, 4), # regular spatial algorithm fullname='softmax_spatial', pickle=False, + test_cuda=(not TEST_WITH_ROCM) ), dict( constructor=wrap_functional(F.softmax, dim=0), @@ -8374,30 +8560,35 @@ new_module_tests = [ input_size=(2, 128), # trigger the last-dim algo in CUDA fullname='log_softmax_lastdim', pickle=False, + test_cuda=(not TEST_WITH_ROCM) ), dict( constructor=wrap_functional(F.log_softmax, dim=1), input_size=(2, 128, 2, 2), # trigger special case of spatial CUDA algo fullname='log_softmax_spatial_special', pickle=False, + test_cuda=(not TEST_WITH_ROCM) ), dict( constructor=wrap_functional(F.log_softmax, dim=1), input_size=(2, 2, 4, 4), # regular spatial algorithm fullname='log_softmax_spatial', pickle=False, + test_cuda=(not TEST_WITH_ROCM) ), dict( constructor=wrap_functional(F.log_softmax, dim=0), input_size=(2, 3, 4, 5), fullname='log_softmax_dim0', pickle=False, + test_cuda=(not TEST_WITH_ROCM) ), dict( constructor=wrap_functional(F.log_softmax, dim=3), input_size=(2, 3, 4, 5), fullname='log_softmax_dim3', pickle=False, + test_cuda=(not TEST_WITH_ROCM) ), dict( constructor=wrap_functional(F.log_softmax, dim=0), @@ -8489,6 +8680,7 @@ new_module_tests = [ input_size=(), reference_fn=lambda i, _: torch.exp(i).div_(torch.exp(i).sum(0, False)).log_(), desc='multiparam_scalar', + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='ELU', @@ -8638,11 +8830,10 @@ class _AdaptiveLogSoftmaxWithLoss(nn.AdaptiveLogSoftmaxWithLoss): t = torch.tensor([0, 1, 4, 8]).to(input.device) return nn.AdaptiveLogSoftmaxWithLoss.__call__(self, input, t).output - add_test(NewModuleTest( constructor=lambda: _AdaptiveLogSoftmaxWithLoss(16, 10, [2, 6]), input_size=(4, 16), - fullname='AdaptiveLogSoftmax')) + fullname='AdaptiveLogSoftmax'), decorator=skipIfRocm) # The following are helpers for TestNN.test_affine_* diff --git a/test/test_torch.py b/test/test_torch.py index 8592e0b03e3..5747407f86a 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -3444,7 +3444,6 @@ class TestTorch(TestCase): self.assertRaises(TypeError, lambda: q.topk(4, True)) @unittest.skipIf(not torch.cuda.is_available(), 'no CUDA') - @skipIfRocm def test_topk_noncontiguous_gpu(self): t = torch.randn(20, device="cuda")[::2] top1, idx1 = t.topk(5) diff --git a/tools/amd_build/disabled_features.yaml b/tools/amd_build/disabled_features.yaml index 55f327243f0..74eb2669300 100644 --- a/tools/amd_build/disabled_features.yaml +++ b/tools/amd_build/disabled_features.yaml @@ -75,6 +75,7 @@ "struct curandStateMtgp32*": "curandStateMtgp32*", "__host__ void THCRandom_getRNGState": "extern \"C\" __host__ void THCRandom_getRNGState", "__host__ void THCRandom_setRNGState": "extern \"C\" __host__ void THCRandom_setRNGState", + 
"state[threadIdx.x].k = kernel;" : "state[threadIdx.x].set_params(kernel);" } }, { @@ -96,27 +97,6 @@ "struct mtgp32_kernel_params": "mtgp32_kernel_params" } }, - { - "path": "aten/src/ATen/native/cuda/CuFFTUtils.h", - "s_constants": { - "#include ": "", - "#include ": "" - } - }, - { - "path": "aten/src/ATen/native/cuda/CuFFTPlanCache.h", - "s_constants": { - "#include ": "", - "#include ": "" - } - }, - { - "path": "aten/src/ATen/native/cuda/SpectralOps.cu", - "s_constants": { - "#include ": "", - "#include ": "" - } - }, { "path": "aten/src/ATen/native/cuda/RoiPooling.cu", "s_constants": { @@ -141,9 +121,6 @@ } ], "disabled_modules": [ - "aten/src/ATen/native/cuda/CuFFTUtils.h", - "aten/src/ATen/native/cuda/CuFFTPlanCache.h", - "aten/src/ATen/native/cuda/SpectralOps.cu", ], "disabled_functions": [ { @@ -205,13 +182,6 @@ "functions": [ "THCTensor_(getTextureObject)" ] - }, - { - "path": "aten/src/THC/THCTensorRandom.cu", - "functions": [ - "THCRandom_setRNGState", - "set_rngstate_kernel" - ] } ] } diff --git a/tools/amd_build/pyHIPIFY/constants.py b/tools/amd_build/pyHIPIFY/constants.py index 092de16cff7..9dc4b5ee348 100644 --- a/tools/amd_build/pyHIPIFY/constants.py +++ b/tools/amd_build/pyHIPIFY/constants.py @@ -50,7 +50,8 @@ API_BLAS = 39 API_SPARSE = 40 API_RAND = 41 API_LAST = 42 +API_FFT = 43 HIP_UNSUPPORTED = 43 API_PYTORCH = 1337 -API_CAFFE2 = 1338 \ No newline at end of file +API_CAFFE2 = 1338 diff --git a/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py b/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py index b1ace1b44d1..6bf931c5893 100644 --- a/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py +++ b/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py @@ -271,8 +271,8 @@ CUDA_INCLUDE_MAP = { "curand_precalc.h": ("hiprand_kernel.h", CONV_INCLUDE, API_RAND), "curand_uniform.h": ("hiprand_kernel.h", CONV_INCLUDE, API_RAND), "cusparse.h": ("hipsparse.h", CONV_INCLUDE, API_RAND), - "#include ": ("", CONV_INCLUDE, API_RAND, HIP_UNSUPPORTED), - "#include ": ("", CONV_INCLUDE, API_RAND, HIP_UNSUPPORTED), + "cufft.h": ("hipfft.h", CONV_INCLUDE, API_BLAS), + "cufftXt.h": ("hipfft.h", CONV_INCLUDE, API_BLAS), "#include ": ("", CONV_INCLUDE, API_RAND, HIP_UNSUPPORTED), } @@ -2095,7 +2095,77 @@ CUDA_IDENTIFIER_MAP = { "curand_poisson": ("hiprand_poisson", CONV_DEVICE_FUNC, API_RAND), "curand_poisson4": ("hiprand_poisson4", CONV_DEVICE_FUNC, API_RAND), "curand_Philox4x32_10": ("hiprand_Philox4x32_10", CONV_DEVICE_FUNC, API_RAND, HIP_UNSUPPORTED), - "mtgp32_kernel_params": ("mtgp32_kernel_params_t", CONV_MATH_FUNC, API_RAND) + "mtgp32_kernel_params": ("mtgp32_kernel_params_t", CONV_MATH_FUNC, API_RAND), + "CUFFT_FORWARD": ("HIPFFT_FORWARD", CONV_NUMERIC_LITERAL, API_BLAS), + "CUFFT_INVERSE": ("HIPFFT_BACKWARD", CONV_NUMERIC_LITERAL, API_BLAS), + "CUFFT_COMPATIBILITY_DEFAULT": ("HIPFFT_COMPATIBILITY_DEFAULT", CONV_NUMERIC_LITERAL, API_BLAS, HIP_UNSUPPORTED), + "cufftResult_t": ("hipfftResult_t", CONV_TYPE, API_FFT), + "cufftResult": ("hipfftResult", CONV_TYPE, API_FFT), + "CUFFT_SUCCESS": ("HIPFFT_SUCCESS", CONV_NUMERIC_LITERAL, API_FFT), + "CUFFT_INVALID_PLAN": ("HIPFFT_INVALID_PLAN", CONV_NUMERIC_LITERAL, API_FFT), + "CUFFT_ALLOC_FAILED": ("HIPFFT_ALLOC_FAILED", CONV_NUMERIC_LITERAL, API_FFT), + "CUFFT_INVALID_TYPE": ("HIPFFT_INVALID_TYPE", CONV_NUMERIC_LITERAL, API_FFT), + "CUFFT_INVALID_VALUE": ("HIPFFT_INVALID_VALUE", CONV_NUMERIC_LITERAL, API_FFT), + "CUFFT_INTERNAL_ERROR": ("HIPFFT_INTERNAL_ERROR", CONV_NUMERIC_LITERAL, API_FFT), + "CUFFT_EXEC_FAILED": ("HIPFFT_EXEC_FAILED", 
CONV_NUMERIC_LITERAL, API_FFT), + "CUFFT_SETUP_FAILED": ("HIPFFT_SETUP_FAILED", CONV_NUMERIC_LITERAL, API_FFT), + "CUFFT_INVALID_SIZE": ("HIPFFT_INVALID_SIZE", CONV_NUMERIC_LITERAL, API_FFT), + "CUFFT_UNALIGNED_DATA": ("HIPFFT_UNALIGNED_DATA", CONV_NUMERIC_LITERAL, API_FFT), + "CUFFT_INCOMPLETE_PARAMETER_LIST": ("HIPFFT_INCOMPLETE_PARAMETER_LIST", CONV_NUMERIC_LITERAL, API_FFT), + "CUFFT_INVALID_DEVICE": ("HIPFFT_INVALID_DEVICE", CONV_NUMERIC_LITERAL, API_FFT), + "CUFFT_PARSE_ERROR": ("HIPFFT_PARSE_ERROR", CONV_NUMERIC_LITERAL, API_FFT), + "CUFFT_NO_WORKSPACE": ("HIPFFT_NO_WORKSPACE", CONV_NUMERIC_LITERAL, API_FFT), + "CUFFT_NOT_IMPLEMENTED": ("HIPFFT_NOT_IMPLEMENTED", CONV_NUMERIC_LITERAL, API_FFT), + "CUFFT_LICENSE_ERROR": ("HIPFFT_LICENSE_ERROR", CONV_NUMERIC_LITERAL, API_FFT, HIP_UNSUPPORTED), + "CUFFT_NOT_SUPPORTED": ("HIPFFT_NOT_SUPPORTED", CONV_NUMERIC_LITERAL, API_FFT), + "cufftType_t": ("hipfftType_t", CONV_TYPE, API_FFT), + "cufftType": ("hipfftType", CONV_TYPE, API_FFT), + "CUFFT_R2C": ("HIPFFT_R2C", CONV_NUMERIC_LITERAL, API_FFT), + "CUFFT_C2R": ("HIPFFT_C2R", CONV_NUMERIC_LITERAL, API_FFT), + "CUFFT_C2C": ("HIPFFT_C2C", CONV_NUMERIC_LITERAL, API_FFT), + "CUFFT_D2Z": ("HIPFFT_D2Z", CONV_NUMERIC_LITERAL, API_FFT), + "CUFFT_Z2D": ("HIPFFT_Z2D", CONV_NUMERIC_LITERAL, API_FFT), + "CUFFT_Z2Z": ("HIPFFT_Z2Z", CONV_NUMERIC_LITERAL, API_FFT), + "cufftCompatibility_t": ("hipfftCompatibility_t", CONV_TYPE, API_FFT, HIP_UNSUPPORTED), + "cufftCompatibility": ("hipfftCompatibility", CONV_TYPE, API_FFT, HIP_UNSUPPORTED), + "CUFFT_COMPATIBILITY_FFTW_PADDING": ("HIPFFT_COMPATIBILITY_FFTW_PADDING", CONV_NUMERIC_LITERAL, API_FFT, HIP_UNSUPPORTED), + "cufftReal": ("hipfftReal", CONV_TYPE, API_FFT), + "cufftDoubleReal": ("hipfftDoubleReal", CONV_TYPE, API_FFT), + "cufftComplex": ("hipfftComplex", CONV_TYPE, API_FFT), + "cufftDoubleComplex": ("hipfftDoubleComplex", CONV_TYPE, API_FFT), + "cufftHandle": ("hipfftHandle", CONV_TYPE, API_FFT), + "cufftPlan1d": ("hipfftPlan1d", CONV_MATH_FUNC, API_FFT), + "cufftPlan2d": ("hipfftPlan2d", CONV_MATH_FUNC, API_FFT), + "cufftPlan3d": ("hipfftPlan3d", CONV_MATH_FUNC, API_FFT), + "cufftPlanMany": ("hipfftPlanMany", CONV_MATH_FUNC, API_FFT), + "cufftMakePlan1d": ("hipfftMakePlan1d", CONV_MATH_FUNC, API_FFT), + "cufftMakePlan2d": ("hipfftMakePlan2d", CONV_MATH_FUNC, API_FFT), + "cufftMakePlan3d": ("hipfftMakePlan3d", CONV_MATH_FUNC, API_FFT), + "cufftMakePlanMany": ("hipfftMakePlanMany", CONV_MATH_FUNC, API_FFT), + "cufftMakePlanMany64": ("hipfftMakePlanMany64", CONV_MATH_FUNC, API_FFT), + "cufftGetSizeMany64": ("hipfftGetSizeMany64", CONV_MATH_FUNC, API_FFT), + "cufftEstimate1d": ("hipfftEstimate1d", CONV_MATH_FUNC, API_FFT), + "cufftEstimate2d": ("hipfftEstimate2d", CONV_MATH_FUNC, API_FFT), + "cufftEstimate3d": ("hipfftEstimate3d", CONV_MATH_FUNC, API_FFT), + "cufftEstimateMany": ("hipfftEstimateMany", CONV_MATH_FUNC, API_FFT), + "cufftCreate": ("hipfftCreate", CONV_MATH_FUNC, API_FFT), + "cufftGetSize1d": ("hipfftGetSize1d", CONV_MATH_FUNC, API_FFT), + "cufftGetSize2d": ("hipfftGetSize2d", CONV_MATH_FUNC, API_FFT), + "cufftGetSize3d": ("hipfftGetSize3d", CONV_MATH_FUNC, API_FFT), + "cufftGetSizeMany": ("hipfftGetSizeMany", CONV_MATH_FUNC, API_FFT), + "cufftGetSize": ("hipfftGetSize", CONV_MATH_FUNC, API_FFT), + "cufftSetWorkArea": ("hipfftSetWorkArea", CONV_MATH_FUNC, API_FFT), + "cufftSetAutoAllocation": ("hipfftSetAutoAllocation", CONV_MATH_FUNC, API_FFT), + "cufftExecC2C": ("hipfftExecC2C", CONV_MATH_FUNC, API_FFT), + "cufftExecR2C": ("hipfftExecR2C", 
CONV_MATH_FUNC, API_FFT),
+    "cufftExecC2R": ("hipfftExecC2R", CONV_MATH_FUNC, API_FFT),
+    "cufftExecZ2Z": ("hipfftExecZ2Z", CONV_MATH_FUNC, API_FFT),
+    "cufftExecD2Z": ("hipfftExecD2Z", CONV_MATH_FUNC, API_FFT),
+    "cufftExecZ2D": ("hipfftExecZ2D", CONV_MATH_FUNC, API_FFT),
+    "cufftSetStream": ("hipfftSetStream", CONV_MATH_FUNC, API_FFT),
+    "cufftDestroy": ("hipfftDestroy", CONV_MATH_FUNC, API_FFT),
+    "cufftGetVersion": ("hipfftGetVersion", CONV_MATH_FUNC, API_FFT),
+    "cufftGetProperty": ("hipfftGetProperty", CONV_MATH_FUNC, API_FFT, HIP_UNSUPPORTED),
 }
 
 CUDA_SPARSE_MAP = {
diff --git a/tools/amd_build/pyHIPIFY/hipify-python.py b/tools/amd_build/pyHIPIFY/hipify-python.py
index 2e2c44d4c36..8fec20a1aae 100755
--- a/tools/amd_build/pyHIPIFY/hipify-python.py
+++ b/tools/amd_build/pyHIPIFY/hipify-python.py
@@ -760,8 +760,8 @@ def preprocessor(filepath, stats, hipify_caffe2):
     output_source = processKernelLaunches(output_source, stats)
 
     # Disable asserts
-    if not filepath.endswith("THCGeneral.h.in"):
-        output_source = disable_asserts(output_source)
+    # if not filepath.endswith("THCGeneral.h.in"):
+    #     output_source = disable_asserts(output_source)
 
     # Replace std:: with non-std:: versions
     output_source = replace_math_functions(output_source)
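
The new `API_FFT` entries added to `CUDA_IDENTIFIER_MAP` follow the pyHIPIFY convention of mapping a CUDA identifier to a `(hip_name, conversion_kind, api, [HIP_UNSUPPORTED])` tuple. As a rough illustration of how a table in that shape is consumed, here is a minimal substitution sketch; the real `hipify-python.py` preprocessor does token-aware replacement and statistics bookkeeping, and `CONV_MATH_FUNC` below is a placeholder value rather than the real constant:

```python
# Placeholder constants mirroring the shape of pyHIPIFY/constants.py.
# CONV_MATH_FUNC's real value is not shown in this patch; API_FFT and
# HIP_UNSUPPORTED are both 43 after the constants.py hunk above.
CONV_MATH_FUNC = 0
API_FFT = 43
HIP_UNSUPPORTED = 43

# A two-entry excerpt of the CUDA_IDENTIFIER_MAP additions, in the same
# (hip_name, conversion_kind, api, [HIP_UNSUPPORTED]) tuple format.
CUDA_IDENTIFIER_MAP = {
    "cufftExecC2C": ("hipfftExecC2C", CONV_MATH_FUNC, API_FFT),
    "cufftGetProperty": ("hipfftGetProperty", CONV_MATH_FUNC, API_FFT, HIP_UNSUPPORTED),
}


def hipify_fft_calls(source):
    """Naive pass: rename supported cuFFT identifiers to their hipFFT names."""
    for cuda_name, meta in CUDA_IDENTIFIER_MAP.items():
        hip_name = meta[0]
        if len(meta) > 3:
            # A fourth tuple element marks HIP_UNSUPPORTED entries; the CUDA
            # spelling is left alone so the gap surfaces at compile time.
            continue
        source = source.replace(cuda_name, hip_name)
    return source


print(hipify_fft_calls("cufftExecC2C(plan, idata, odata, CUFFT_FORWARD);"))
# -> "hipfftExecC2C(plan, idata, odata, CUFFT_FORWARD);"
```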
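
On the test side, the patch gates CUDA coverage behind `TEST_WITH_ROCM` and a `skipIfRocm` decorator imported from `common`. Those helpers are not part of this diff; the sketch below shows one plausible way such a decorator is wired up (the environment-variable name and skip message are assumptions, not taken from this patch):

```python
import os
import unittest
from functools import wraps

# Assumption: the ROCm CI job sets an environment variable; the real flag and
# decorator live in test/common.py and may differ in detail.
TEST_WITH_ROCM = os.environ.get('PYTORCH_TEST_WITH_ROCM', '0') == '1'


def skipIfRocm(fn):
    """Skip the wrapped test when the suite runs against the ROCm/HIP backend."""
    @wraps(fn)
    def wrapper(*args, **kwargs):
        if TEST_WITH_ROCM:
            raise unittest.SkipTest("test does not currently pass on ROCm")
        return fn(*args, **kwargs)
    return wrapper
```

Passing `test_cuda=(not TEST_WITH_ROCM)` into the module and criterion test dicts has the same effect one level earlier: the `_cuda` variant is never generated in the first place rather than being generated and then skipped.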
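
The `add_test` change in test_nn.py swaps the inline half-precision lambda for a named `test_half` function so that a decorator can be attached to it. A condensed, stand-alone sketch of that registration pattern, reusing the `skipIfRocm` helper sketched above (`GeneratedCudaTests` and `add_cuda_variants` are illustrative stand-ins, not the real `common_nn` machinery):

```python
import unittest

import torch


class GeneratedCudaTests(unittest.TestCase):
    pass


def add(name, fn):
    setattr(GeneratedCudaTests, name, fn)


def add_cuda_variants(test, cuda_test_name, **kwargs):
    # The float and double variants stay as plain lambdas ...
    add(cuda_test_name + '_float', lambda self, test=test, kwargs=kwargs:
        test.test_cuda(self, dtype=torch.float, **kwargs))
    add(cuda_test_name + '_double', lambda self, test=test, kwargs=kwargs:
        test.test_cuda(self, dtype=torch.double, **kwargs))

    # ... while the half variant becomes a named function so @skipIfRocm can wrap it.
    @skipIfRocm
    def test_half(self, test=test, kwargs=kwargs):
        test.test_cuda(self, dtype=torch.half, **kwargs)

    if getattr(test, 'check_half', True):
        add(cuda_test_name + '_half', test_half)
```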
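
Many of the CUDA tests touched here are driven by `@repeat_test_for_types(ALL_TENSORTYPES)`, so dropping `torch.half` from that list under ROCm removes every half-precision pass in one place instead of per test. A simplified stand-in for that decorator (the real helper lives in the shared test utilities and may differ, for example by reporting each dtype separately):

```python
from functools import wraps

import torch


def repeat_test_for_types(dtypes):
    """Run the decorated test once per dtype in the given list."""
    def decorator(fn):
        @wraps(fn)
        def wrapper(self, *args, **kwargs):
            for dtype in dtypes:
                fn(self, *args, dtype=dtype, **kwargs)
        return wrapper
    return decorator


# With torch.half excluded under ROCm, a decorated test such as
# test_embedding_bag_cuda only performs its float and double passes.
ALL_TENSORTYPES = [torch.float, torch.double]
```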