diff --git a/aten/src/ATen/native/cuda/CuFFTPlanCache.h b/aten/src/ATen/native/cuda/CuFFTPlanCache.h
index fdc95e54600..37c8f3a364f 100644
--- a/aten/src/ATen/native/cuda/CuFFTPlanCache.h
+++ b/aten/src/ATen/native/cuda/CuFFTPlanCache.h
@@ -90,8 +90,13 @@ public:
             IntList output_sizes) {

     // signal sizes
+#ifdef __HIP_PLATFORM_HCC__
+    std::vector<int> signal_sizes(checked_signal_sizes.begin(),
+                                  checked_signal_sizes.end());
+#else
     std::vector<long long int> signal_sizes(checked_signal_sizes.begin(),
                                             checked_signal_sizes.end());
+#endif

     // input batch size
     long long int batch = input.size(0);
@@ -149,7 +154,11 @@ public:
     // TODO: Figure out why windows fails to compile
     //   at::optional<std::vector<long long int>> inembed_opt = at::nullopt;
     // Then move the following to a helper function.
+#ifdef __HIP_PLATFORM_HCC__
+    std::vector<int> inembed(signal_ndim);
+#else
     std::vector<long long int> inembed(signal_ndim);
+#endif
     if (!clone_input) {
       auto istrides = input.strides();
       auto last_istride = istrides[signal_ndim];
@@ -192,6 +201,37 @@ public:
                 inembed.begin());             // begin of output
     }

+#ifdef __HIP_PLATFORM_HCC__
+
+    hipfftType exec_type;
+    if (input.type().scalarType() == ScalarType::Float) {
+      if (complex_input && complex_output) {
+        exec_type = HIPFFT_C2C;
+      } else if (complex_input && !complex_output) {
+        exec_type = HIPFFT_C2R;
+      } else if (!complex_input && complex_output) {
+        exec_type = HIPFFT_R2C;
+      } else {
+        throw std::runtime_error("hipFFT doesn't support r2r (float)");
+      }
+    } else if (input.type().scalarType() == ScalarType::Double) {
+      if (complex_input && complex_output) {
+        exec_type = HIPFFT_Z2Z;
+      } else if (complex_input && !complex_output) {
+        exec_type = HIPFFT_Z2D;
+      } else if (!complex_input && complex_output) {
+        exec_type = HIPFFT_D2Z;
+      } else {
+        throw std::runtime_error("hipFFT doesn't support r2r (double)");
+      }
+    } else {
+      std::ostringstream ss;
+      ss << "hipFFT doesn't support tensor of type: "
+         << at::toString(input.type().scalarType());
+      throw std::runtime_error(ss.str());
+    }
+
+#else
     cudaDataType itype, otype, exec_type;
     if (input.type().scalarType() == ScalarType::Float) {
       itype = complex_input ? CUDA_C_32F : CUDA_R_32F;
@@ -211,6 +251,7 @@ public:
          << at::toString(input.type().scalarType());
       throw std::runtime_error(ss.str());
     }
+#endif

     // create plan
     auto raw_plan_ptr = new cufftHandle();
@@ -229,10 +270,17 @@ public:
       //   by assuming base_istride = base_ostride = 1.
       //
       // See NOTE [ cuFFT Embedded Strides ] in native/cuda/SpectralOps.cu.
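+      //
+      // hipfftMakePlanMany takes int-typed sizes/embeddings and a single hipfftType,
+      // whereas cufftXtMakePlanMany takes long long int sizes plus separate
+      // input/output/execution cudaDataTypes; that is why the signal_sizes/inembed
+      // vectors above and the two plan calls below are #ifdef'd per platform.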
+#ifdef __HIP_PLATFORM_HCC__
+      CUFFT_CHECK(hipfftMakePlanMany(plan(), signal_ndim, signal_sizes.data(),
+        /* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1,
+        /* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1,
+        exec_type, batch, &ws_size_t));
+#else
       CUFFT_CHECK(cufftXtMakePlanMany(plan(), signal_ndim, signal_sizes.data(),
         /* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1, itype,
         /* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1, otype,
         batch, &ws_size_t, exec_type));
+#endif
     } else {
       // set idist (stride at batch dim)
       // set base_istride (stride at innermost dim of signal)
@@ -254,6 +302,18 @@ public:
       }

       // set odist, onembed, base_ostride
+#ifdef __HIP_PLATFORM_HCC__
+      int odist = at::prod_intlist(output_sizes.slice(1, signal_ndim));
+      std::vector<int> onembed(output_sizes.data() + 1, output_sizes.data() + signal_ndim + 1);
+      int base_ostride = 1;
+
+      int istride = base_istride;
+      int iidist = idist;
+      CUFFT_CHECK(hipfftMakePlanMany(plan(), signal_ndim, signal_sizes.data(),
+        inembed.data(), istride, iidist,
+        onembed.data(), base_ostride, odist,
+        exec_type, batch, &ws_size_t));
+#else
       long long int odist = at::prod_intlist(output_sizes.slice(1, signal_ndim));
       std::vector<long long int> onembed(output_sizes.data() + 1, output_sizes.data() + signal_ndim + 1);
       long long int base_ostride = 1;
@@ -262,11 +322,16 @@ public:
         inembed.data(), base_istride, idist, itype,
         onembed.data(), base_ostride, odist, otype,
         batch, &ws_size_t, exec_type));
-    }
+#endif
+    }
     ws_size = static_cast<int64_t>(ws_size_t);
   }

+#ifdef __HIP_PLATFORM_HCC__
+  cufftHandle &plan() const { return *plan_ptr.get(); }
+#else
   const cufftHandle &plan() const { return *plan_ptr.get(); }
+#endif

   bool should_clone_input() const { return clone_input; }

diff --git a/aten/src/ATen/native/cuda/CuFFTUtils.h b/aten/src/ATen/native/cuda/CuFFTUtils.h
index 5edfcbc1354..8046602eb59 100644
--- a/aten/src/ATen/native/cuda/CuFFTUtils.h
+++ b/aten/src/ATen/native/cuda/CuFFTUtils.h
@@ -49,8 +49,10 @@ static inline std::string _cudaGetErrorEnum(cufftResult error)
       return "CUFFT_NO_WORKSPACE";
     case CUFFT_NOT_IMPLEMENTED:
       return "CUFFT_NOT_IMPLEMENTED";
+#ifndef __HIP_PLATFORM_HCC__
     case CUFFT_LICENSE_ERROR:
       return "CUFFT_LICENSE_ERROR";
+#endif
     case CUFFT_NOT_SUPPORTED:
       return "CUFFT_NOT_SUPPORTED";
     default:
diff --git a/aten/src/ATen/native/cuda/SpectralOps.cu b/aten/src/ATen/native/cuda/SpectralOps.cu
index 11c648fd6ae..00057d403dc 100644
--- a/aten/src/ATen/native/cuda/SpectralOps.cu
+++ b/aten/src/ATen/native/cuda/SpectralOps.cu
@@ -189,8 +189,45 @@ static inline Tensor _run_cufft(
   CUFFT_CHECK(cufftSetWorkArea(plan, ws.data_ptr()));

   // run
+#ifdef __HIP_PLATFORM_HCC__
+  if (input.type().scalarType() == ScalarType::Float) {
+    if (complex_input && complex_output) {
+      CUFFT_CHECK(hipfftExecC2C(plan, static_cast<hipfftComplex*>(input.data_ptr()),
+        static_cast<hipfftComplex*>(output.data_ptr()),
+        inverse ? HIPFFT_BACKWARD : HIPFFT_FORWARD));
+    } else if (complex_input && !complex_output) {
+      CUFFT_CHECK(hipfftExecC2R(plan, static_cast<hipfftComplex*>(input.data_ptr()),
+        static_cast<hipfftReal*>(output.data_ptr())));
+    } else if (!complex_input && complex_output) {
+      CUFFT_CHECK(hipfftExecR2C(plan, static_cast<hipfftReal*>(input.data_ptr()),
+        static_cast<hipfftComplex*>(output.data_ptr())));
+    } else {
+      throw std::runtime_error("hipFFT doesn't support r2r (float)");
+    }
+  } else if (input.type().scalarType() == ScalarType::Double) {
+    if (complex_input && complex_output) {
+      CUFFT_CHECK(hipfftExecZ2Z(plan, static_cast<hipfftDoubleComplex*>(input.data_ptr()),
+        static_cast<hipfftDoubleComplex*>(output.data_ptr()),
+        inverse ? HIPFFT_BACKWARD : HIPFFT_FORWARD));
+    } else if (complex_input && !complex_output) {
+      CUFFT_CHECK(hipfftExecZ2D(plan, static_cast<hipfftDoubleComplex*>(input.data_ptr()),
+        static_cast<hipfftDoubleReal*>(output.data_ptr())));
+    } else if (!complex_input && complex_output) {
+      CUFFT_CHECK(hipfftExecD2Z(plan, static_cast<hipfftDoubleReal*>(input.data_ptr()),
+        static_cast<hipfftDoubleComplex*>(output.data_ptr())));
+    } else {
+      throw std::runtime_error("hipFFT doesn't support r2r (double)");
+    }
+  } else {
+    std::ostringstream ss;
+    ss << "hipFFT doesn't support tensor of type: "
+       << at::toString(input.type().scalarType());
+    throw std::runtime_error(ss.str());
+  }
+#else
   CUFFT_CHECK(cufftXtExec(plan, input.data_ptr(), output.data_ptr(),
     inverse ? CUFFT_INVERSE : CUFFT_FORWARD));
+#endif

   // rescale if needed by normalized flag or inverse transform
   auto size_last_signal_dim = checked_signal_sizes[signal_ndim - 1];
diff --git a/aten/src/THC/THCAtomics.cuh b/aten/src/THC/THCAtomics.cuh
index 485b5744f15..8fec96dde6f 100644
--- a/aten/src/THC/THCAtomics.cuh
+++ b/aten/src/THC/THCAtomics.cuh
@@ -138,8 +138,10 @@ static inline __device__ void atomicAdd(double *address, double val) {
   } while (assumed != old);
 }
 #elif !defined(__CUDA_ARCH__) && (CUDA_VERSION < 8000) || defined(__HIP_PLATFORM_HCC__)
+#if defined(__HIP_PLATFORM_HCC__) && __hcc_workweek__ < 18312
 // This needs to be defined for the host side pass
 static inline __device__ void atomicAdd(double *address, double val) { }
 #endif
+#endif

 #endif // THC_ATOMICS_INC
diff --git a/aten/src/THC/THCScanUtils.cuh b/aten/src/THC/THCScanUtils.cuh
index ef7c297f41c..d5542383560 100644
--- a/aten/src/THC/THCScanUtils.cuh
+++ b/aten/src/THC/THCScanUtils.cuh
@@ -4,6 +4,12 @@
 #include "THCAsmUtils.cuh"
 #include "THCDeviceUtils.cuh"

+#if defined(__HIP_PLATFORM_HCC__)
+#define SCAN_UTILS_WARP_SIZE 64
+#else
+#define SCAN_UTILS_WARP_SIZE 32
+#endif
+
 // Collection of in-kernel scan / prefix sum utilities

 // Inclusive Scan via an upsweep/downsweep mechanism. Assumes:
@@ -157,7 +163,7 @@ __device__ void inclusiveBinaryPrefixScan(T* smem, bool in, T* out, BinaryFuncti
   T index = __popc(getLaneMaskLe() & vote);
   T carry = __popc(vote);

-  int warp = threadIdx.x / 32;
+  int warp = threadIdx.x / SCAN_UTILS_WARP_SIZE;

   // Per each warp, write out a value
   if (getLaneId() == 0) {
@@ -170,7 +176,7 @@ __device__ void inclusiveBinaryPrefixScan(T* smem, bool in, T* out, BinaryFuncti
   // warp shuffle scan for CC 3.0+
   if (threadIdx.x == 0) {
     int current = 0;
-    for (int i = 0; i < blockDim.x / 32; ++i) {
+    for (int i = 0; i < blockDim.x / SCAN_UTILS_WARP_SIZE; ++i) {
       T v = smem[i];
       smem[i] = binop(smem[i], current);
       current = binop(current, v);
@@ -201,11 +207,13 @@ __device__ void exclusiveBinaryPrefixScan(T* smem, bool in, T* out, T* carry, Bi
   *out -= (T) in;

   // The outgoing carry for all threads is the last warp's sum
-  *carry = smem[(blockDim.x / 32) - 1];
+  *carry = smem[(blockDim.x / SCAN_UTILS_WARP_SIZE) - 1];

   if (KillWARDependency) {
     __syncthreads();
   }
 }

+#undef SCAN_UTILS_WARP_SIZE
+
 #endif // THC_SCAN_UTILS_INC
diff --git a/aten/src/THC/THCTensorTopK.cuh b/aten/src/THC/THCTensorTopK.cuh
index 2ddff119c64..4f7a6b8c697 100644
--- a/aten/src/THC/THCTensorTopK.cuh
+++ b/aten/src/THC/THCTensorTopK.cuh
@@ -213,7 +213,11 @@ __device__ DataType findPattern(DataType* smem,
                                 IndexType withinSliceStride,
                                 BitDataType desired,
                                 BitDataType desiredMask) {
+#ifdef __HIP_PLATFORM_HCC__
+  if (threadIdx.x < 64) {
+#else
   if (threadIdx.x < 32) {
+#endif
     smem[threadIdx.x] = ScalarConvert<int, DataType>::to(0);
   }
   __syncthreads();
@@ -366,7 +370,11 @@ __global__ void gatherTopK(TensorInfo<T, IndexType> input,
                            IndexType indicesWithinSliceStride) {
   // Indices are limited to integer fp precision, so counts can fit in
   // int32, regardless of IndexType
+#ifdef __HIP_PLATFORM_HCC__
+  __shared__ int smem[64];
+#else
   __shared__ int smem[32]; // one per each warp, up to warp limit
+#endif

   IndexType slice = getLinearBlockId<IndexType>();
   if (slice >= numInputSlices) {
diff --git a/aten/src/THC/generic/THCTensorTopK.cu b/aten/src/THC/generic/THCTensorTopK.cu
index 284b9331230..9fdb13f1469 100644
--- a/aten/src/THC/generic/THCTensorTopK.cu
+++ b/aten/src/THC/generic/THCTensorTopK.cu
@@ -29,21 +29,24 @@ THC_API void THCTensor_(topk)(THCState* state,
   THCTensor_(resize)(state, topK, topKSize, {});
   THCudaLongTensor_resize(state, indices, topKSize, {});

+  // static_cast is required to ensure that the correct type (INDEX_T)
+  // is provided to the kernel for the arguments.
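+  // (The host-side sizes and strides are 64-bit values, whereas the kernel takes
+  // its INDEX_T template parameter, so the casts make the conversion explicit.)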
+
 #define RUN_K(INDEX_T, DIM, DIR)                                        \
   gatherTopK<real, INDEX_T, DIM, DIR>                                   \
     <<<grid, block, 0, THCState_getCurrentStream(state)>>>(             \
       inputInfo,                                                        \
-      sliceSize,                                                        \
-      k,                                                                \
-      inputSlices,                                                      \
+      static_cast<INDEX_T>(sliceSize),                                  \
+      static_cast<INDEX_T>(k),                                          \
+      static_cast<INDEX_T>(inputSlices),                                \
       /* The actual dimension that the k-selection is running in */     \
       /* may have changed from collapseDims() */                        \
-      inputInfo.strides[collapseInputDim],                              \
+      static_cast<INDEX_T>(inputInfo.strides[collapseInputDim]),        \
       topKInfo,                                                         \
-      topKSlices,                                                       \
-      topKInfo.strides[collapseTopKDim],                                \
+      static_cast<INDEX_T>(topKSlices),                                 \
+      static_cast<INDEX_T>(topKInfo.strides[collapseTopKDim]),          \
       indicesInfo,                                                      \
-      indicesInfo.strides[collapseIndicesDim])
+      static_cast<INDEX_T>(indicesInfo.strides[collapseIndicesDim]))

 #define RUN_DIR(INDEX_T, DIM)                                           \
   if (dir) {                                                            \
@@ -63,6 +66,12 @@ THC_API void THCTensor_(topk)(THCState* state,
     RUN_DIR(INDEX_T, -1);                                               \
   }

+#ifdef __HIP_PLATFORM_HCC__
+#define TOPK_WARP_SIZE 64
+#else
+#define TOPK_WARP_SIZE 32
+#endif
+
 #define RUN_T(INDEX_T)                                                  \
   TensorInfo<real, INDEX_T> inputInfo =                                 \
     getTensorInfo<real, THCTensor, INDEX_T>(state, input);              \
@@ -96,7 +105,7 @@ THC_API void THCTensor_(topk)(THCState* state,
     THError("Slice to sort is too large");                              \
   }                                                                     \
                                                                         \
-  dim3 block(std::min(THCRoundUp(sliceSize, (int64_t) 32), (int64_t) 1024)); \
+  dim3 block(std::min(THCRoundUp(sliceSize, (int64_t) TOPK_WARP_SIZE), (int64_t) 1024)); \
                                                                         \
   /* This is used as a template parameter to calculate indices. */      \
   /* We only specialize it if all collapsed dim sizes are the */        \
@@ -124,6 +133,7 @@ THC_API void THCTensor_(topk)(THCState* state,
 #undef RUN_DIM
 #undef RUN_DIR
 #undef RUN_K
+#undef TOPK_WARP_SIZE

   // Sort the results if the user wants them sorted, since our
   // selection routine does not ensure sorting
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index 51858c802fe..7cf91fad7a9 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -558,6 +558,7 @@ endif()
 if(USE_ROCM)
   include_directories(SYSTEM ${HIP_PATH}/include)
   include_directories(SYSTEM ${ROCBLAS_PATH}/include)
+  include_directories(SYSTEM ${ROCFFT_PATH}/include)
   include_directories(SYSTEM ${HIPSPARSE_PATH}/include)
   include_directories(SYSTEM ${HIPRAND_PATH}/include)
   include_directories(SYSTEM ${ROCRAND_PATH}/include)
diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake
index 037c92d2fe3..72e6dd67a7b 100644
--- a/cmake/public/LoadHIP.cmake
+++ b/cmake/public/LoadHIP.cmake
@@ -38,6 +38,13 @@ ELSE()
   SET(ROCBLAS_PATH $ENV{ROCBLAS_PATH})
 ENDIF()

+# ROCFFT_PATH
+IF(NOT DEFINED ENV{ROCFFT_PATH})
+  SET(ROCFFT_PATH ${ROCM_PATH}/rocfft)
+ELSE()
+  SET(ROCFFT_PATH $ENV{ROCFFT_PATH})
+ENDIF()
+
 # HIPSPARSE_PATH
 IF(NOT DEFINED ENV{HIPSPARSE_PATH})
   SET(HIPSPARSE_PATH ${ROCM_PATH}/hcsparse)
@@ -106,11 +113,13 @@ IF(HIP_FOUND)
   set(rocblas_DIR ${ROCBLAS_PATH}/lib/cmake/rocblas)
   set(miopen_DIR ${MIOPEN_PATH}/lib/cmake/miopen)
   set(rocblas_DIR ${ROCBLAS_PATH}/lib/cmake/rocblas)
+  set(rocfft_DIR ${ROCFFT_PATH}/lib/cmake/rocfft)
   set(hipsparse_DIR ${HIPSPARSE_PATH}/lib/cmake/hipsparse)

   find_package(rocrand REQUIRED)
   find_package(hiprand REQUIRED)
   find_package(rocblas REQUIRED)
+  find_package(rocfft REQUIRED)
   find_package(miopen REQUIRED)
   #find_package(hipsparse REQUIRED)
diff --git a/docker/caffe2/jenkins/common/install_rocm.sh b/docker/caffe2/jenkins/common/install_rocm.sh
index e93a5bc9557..120db9d7452 100644
--- a/docker/caffe2/jenkins/common/install_rocm.sh
+++ b/docker/caffe2/jenkins/common/install_rocm.sh
@@ -5,6 +5,7 @@ set -ex
 install_ubuntu() {
   apt-get update
   apt-get install -y wget
+  apt-get install -y libopenblas-dev
   DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/debian
   # Add rocm repository
@@
-63,6 +64,15 @@ install_rocrand() { dpkg -i /opt/rocm/debians/rocrand.deb } +# Install rocSPARSE/hipSPARSE that will be released soon - can co-exist w/ hcSPARSE which will be removed soon +install_hipsparse() { + mkdir -p /opt/rocm/debians + curl https://s3.amazonaws.com/ossci-linux/rocsparse-0.1.1.0.deb -o /opt/rocm/debians/rocsparse.deb + curl https://s3.amazonaws.com/ossci-linux/hipsparse-0.1.1.0.deb -o /opt/rocm/debians/hipsparse.deb + dpkg -i /opt/rocm/debians/rocsparse.deb + dpkg -i /opt/rocm/debians/hipsparse.deb +} + # Install Python packages depending on the base OS if [ -f /etc/lsb-release ]; then install_ubuntu @@ -76,3 +86,4 @@ fi install_hip_thrust install_rocrand install_hcsparse +install_hipsparse diff --git a/setup.py b/setup.py index f560ee28b81..75f1c33208c 100644 --- a/setup.py +++ b/setup.py @@ -929,6 +929,7 @@ if USE_ROCM: rocm_include_path = '/opt/rocm/include' hcc_include_path = '/opt/rocm/hcc/include' rocblas_include_path = '/opt/rocm/rocblas/include' + rocfft_include_path = '/opt/rocm/rocfft/include' hipsparse_include_path = '/opt/rocm/hcsparse/include' hiprand_include_path = '/opt/rocm/hiprand/include' rocrand_include_path = '/opt/rocm/rocrand/include' @@ -937,6 +938,7 @@ if USE_ROCM: include_dirs.append(rocm_include_path) include_dirs.append(hcc_include_path) include_dirs.append(rocblas_include_path) + include_dirs.append(rocfft_include_path) include_dirs.append(hipsparse_include_path) include_dirs.append(hiprand_include_path) include_dirs.append(rocrand_include_path) diff --git a/test/common_cuda.py b/test/common_cuda.py index 14554962728..60c28b2818c 100644 --- a/test/common_cuda.py +++ b/test/common_cuda.py @@ -2,12 +2,14 @@ r"""This file is allowed to initialize CUDA context when imported.""" import torch import torch.cuda +from common import TEST_WITH_ROCM TEST_CUDA = torch.cuda.is_available() TEST_MULTIGPU = TEST_CUDA and torch.cuda.device_count() >= 2 CUDA_DEVICE = TEST_CUDA and torch.device("cuda:0") -TEST_CUDNN = TEST_CUDA and torch.backends.cudnn.is_acceptable(torch.tensor(1., device=CUDA_DEVICE)) +# note: if ROCm is targeted, TEST_CUDNN is code for TEST_MIOPEN +TEST_CUDNN = TEST_CUDA and (TEST_WITH_ROCM or torch.backends.cudnn.is_acceptable(torch.tensor(1., device=CUDA_DEVICE))) TEST_CUDNN_VERSION = TEST_CUDNN and torch.backends.cudnn.version() diff --git a/test/common_nn.py b/test/common_nn.py index 0444ba4eb6a..f159fe65967 100644 --- a/test/common_nn.py +++ b/test/common_nn.py @@ -7,7 +7,7 @@ from itertools import product import torch import torch.cuda from torch.nn.functional import _Reduction -from common import TestCase, to_gpu, freeze_rng_state, is_iterable +from common import TestCase, to_gpu, freeze_rng_state, is_iterable, TEST_WITH_ROCM from common_cuda import TEST_CUDA from torch.autograd.gradcheck import get_numerical_jacobian, iter_tensors import torch.backends.cudnn @@ -40,7 +40,8 @@ module_tests = [ module_name='Linear', constructor_args=(10, 8), input_size=(4, 10), - reference_fn=lambda i, p: torch.mm(i, p[0].t()) + p[1].view(1, -1).expand(4, 8) + reference_fn=lambda i, p: torch.mm(i, p[0].t()) + p[1].view(1, -1).expand(4, 8), + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='Linear', @@ -102,17 +103,20 @@ module_tests = [ constructor_args=(1,), input_size=(10, 20), reference_fn=lambda i, _: torch.exp(i).div(torch.exp(i).sum(1, True).expand(10, 20)), + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='Softmax2d', input_size=(1, 3, 10, 20), reference_fn=lambda i, _: torch.exp(i).div(torch.exp(i).sum(1, False)), + test_cuda=(not 
TEST_WITH_ROCM) ), dict( module_name='LogSoftmax', constructor_args=(1,), input_size=(10, 20), reference_fn=lambda i, _: torch.exp(i).div_(torch.exp(i).sum(1, True).expand(10, 20)).log_(), + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='LogSoftmax', @@ -120,12 +124,14 @@ module_tests = [ input_size=(1, 3, 10, 20), reference_fn=lambda i, _: torch.exp(i).div_(torch.exp(i).sum(1, False)).log_(), desc='multiparam', + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='ELU', constructor_args=(2.,), input_size=(3, 2, 5), - reference_fn=lambda x, _: torch.where(x >= 0, x, 2 * (x.exp() - 1)) + reference_fn=lambda x, _: torch.where(x >= 0, x, 2 * (x.exp() - 1)), + test_cuda=(not TEST_WITH_ROCM), ), # TODO: reference function dict( @@ -198,6 +204,7 @@ module_tests = [ input_size=(2, 3, 4), desc='1d_multiparam', reference_fn=lambda i, p: torch.clamp(i, min=0) + torch.clamp(i, max=0) * p[0][0], + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='PReLU', @@ -211,6 +218,7 @@ module_tests = [ input_size=(2, 3, 4, 5), desc='2d_multiparam', reference_fn=lambda i, p: torch.clamp(i, min=0) + torch.clamp(i, max=0) * p[0][0], + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='PReLU', @@ -224,26 +232,31 @@ module_tests = [ input_size=(2, 3, 4, 5, 6), desc='3d_multiparam', reference_fn=lambda i, p: torch.clamp(i, min=0) + torch.clamp(i, max=0) * p[0][0], + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='Softsign', input_size=(3, 2, 5), reference_fn=lambda i, _: i.div(1 + torch.abs(i)), + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='Softmin', constructor_args=(1,), input_size=(10, 20), + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='Softmin', constructor_args=(1,), input_size=(2, 3, 5, 10), desc='multidim', + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='Tanhshrink', - input_size=(2, 3, 4, 5) + input_size=(2, 3, 4, 5), + test_cuda=(not TEST_WITH_ROCM) ), ] @@ -560,6 +573,7 @@ criterion_tests = [ reference_fn=lambda i, t, m: kldivloss_reference(i, t, get_reduction(m)), check_sum_reduction=True, + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='MSELoss', @@ -576,6 +590,7 @@ criterion_tests = [ reference_fn=lambda i, t, m: -(t * i.log() + (1 - t) * (1 - i).log()).sum() / (i.numel() if get_reduction(m) else 1), check_gradgrad=False, + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='BCELoss', @@ -586,6 +601,7 @@ criterion_tests = [ (i.numel() if get_reduction(m) else 1), desc='weights', check_gradgrad=False, + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='CrossEntropyLoss', @@ -606,6 +622,7 @@ criterion_tests = [ reference_fn=lambda i, t, m: hingeembeddingloss_reference(i, t, reduction=get_reduction(m)), check_sum_reduction=True, + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='HingeEmbeddingLoss', @@ -616,6 +633,7 @@ criterion_tests = [ hingeembeddingloss_reference(i, t, margin=0.5, reduction=get_reduction(m)), desc='margin', check_sum_reduction=True, + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='MultiLabelMarginLoss', @@ -642,6 +660,7 @@ criterion_tests = [ target_fn=lambda: torch.rand(5, 10).mul(2).floor(), reference_fn=lambda i, t, m: -(t * i.sigmoid().log() + (1 - t) * (-i).sigmoid().log()).sum() / i.numel(), check_gradgrad=False, + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='MultiMarginLoss', @@ -720,6 +739,7 @@ criterion_tests = [ reference_fn=lambda i, t, m: cosineembeddingloss_reference(i[0], i[1], t, reduction=get_reduction(m)), check_sum_reduction=True, + test_cuda=(not TEST_WITH_ROCM) ), dict( 
module_name='CosineEmbeddingLoss', @@ -730,6 +750,7 @@ criterion_tests = [ cosineembeddingloss_reference(i[0], i[1], t, margin=0.7, reduction=get_reduction(m)), desc='margin', check_sum_reduction=True, + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='MarginRankingLoss', @@ -738,6 +759,7 @@ criterion_tests = [ reference_fn=lambda i, t, m: marginrankingloss_reference(i[0], i[1], t, reduction=get_reduction(m)), check_sum_reduction=True, + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='MarginRankingLoss', @@ -748,6 +770,7 @@ criterion_tests = [ marginrankingloss_reference(i[0], i[1], t, margin=0.5, reduction=get_reduction(m)), desc='margin', check_sum_reduction=True, + test_cuda=(not TEST_WITH_ROCM) ), ] diff --git a/test/run_test.py b/test/run_test.py index 8fd32b7e75c..71b96e78bc9 100644 --- a/test/run_test.py +++ b/test/run_test.py @@ -45,14 +45,10 @@ WINDOWS_BLACKLIST = [ ROCM_BLACKLIST = [ 'c10d', 'cpp_extensions', - 'cuda', 'distributed', 'distributions', - 'jit', - 'legacy_nn', 'multiprocessing', 'nccl', - 'nn', 'thd_distributed', 'utils', ] diff --git a/test/test_cuda.py b/test/test_cuda.py index badcc687945..27e21b8345e 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -16,7 +16,7 @@ from torch._six import inf, nan from test_torch import TestTorch from common import TestCase, get_gpu_type, to_gpu, freeze_rng_state, run_tests, \ - PY3, IS_WINDOWS, NO_MULTIPROCESSING_SPAWN + PY3, IS_WINDOWS, NO_MULTIPROCESSING_SPAWN, skipIfRocm, TEST_WITH_ROCM # We cannot import TEST_CUDA and TEST_MULTIGPU from common_cuda here, # because if we do that, the TEST_CUDNN line from common_cuda will be executed @@ -248,45 +248,62 @@ def new_t(*sizes): # - disable inplace test, if set to True, no inplace test will be done (default=False) # - decorator, e.g., unittest.skipIf (default is no decorator) tests = [ - ('add', small_3d, lambda t: [number(3.14, 3, t)]), + ('add', small_3d, lambda t: [number(3.14, 3, t)], '', types, False, + "skipIfRocm:ByteTensor,CharTensor,HalfTensor,ShortTensor"), ('add', small_3d, lambda t: [small_3d_positive(t)], 'tensor'), ('add', small_3d, lambda t: [number(0.2, 2, t), small_3d_positive(t)], 'scalar_tensor'), - ('sub', small_3d, lambda t: [number(3.14, 3, t)],), + ('sub', small_3d, lambda t: [number(3.14, 3, t)], '', types, False, + "skipIfRocm:ByteTensor,CharTensor,HalfTensor,ShortTensor"), ('sub', small_3d, lambda t: [small_3d_positive(t)], 'tensor'), - ('mul', small_3d, lambda t: [number(3.14, 3, t)],), + ('mul', small_3d, lambda t: [number(3.14, 3, t)], '', types, False, + "skipIfRocm:ByteTensor,CharTensor,HalfTensor,ShortTensor"), ('mul', small_3d, lambda t: [small_3d_positive(t)], 'tensor'), - ('div', small_3d, lambda t: [number(3.14, 3, t)],), + ('div', small_3d, lambda t: [number(3.14, 3, t)], '', types, False, + "skipIfRocm:ByteTensor,CharTensor,FloatTensor,HalfTensor,ShortTensor"), ('div', small_3d, lambda t: [small_3d_positive(t)], 'tensor'), - ('pow', small_3d, lambda t: [number(3.14, 3, t)], None, float_types), - ('pow', small_3d, lambda t: [number(1., 1, t)], 'pow1', types), - ('pow', small_3d, lambda t: [number(2., 2, t)], 'pow2', types), - ('pow', small_3d, lambda t: [number(3., 3, t)], 'pow3', types), - ('pow', small_3d, lambda t: [number(-1., -1, t)], 'pow-1', float_types), + ('pow', small_3d, lambda t: [number(3.14, 3, t)], None, float_types, False, "skipIfRocm:HalfTensor"), + ('pow', small_3d, lambda t: [number(1., 1, t)], 'pow1', types, False, "skipIfRocm:HalfTensor"), + ('pow', small_3d, lambda t: [number(2., 2, t)], 'pow2', types, 
False, "skipIfRocm:HalfTensor"), + ('pow', small_3d, lambda t: [number(3., 3, t)], 'pow3', types, False, "skipIfRocm:HalfTensor"), + ('pow', small_3d, lambda t: [number(-1., -1, t)], 'pow-1', float_types, False, "skipIfRocm:HalfTensor"), # HalfTensor gives bad result at pow-2 with data sampled from torch.randn - ('pow', small_3d, lambda t: [number(-2., -2, t)], 'pow-2', float_types_no_half), - ('pow', small_3d, lambda t: [tensor_abs_(small_3d(t))], 'tensor', float_types), - ('addbmm', small_2d, lambda t: [small_3d(t), small_3d(t)], None, float_types), - ('addbmm', small_2d, lambda t: [number(0.4, 2, t), small_3d(t), small_3d(t)], 'scalar'), - ('addbmm', small_2d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), small_3d(t), small_3d(t)], 'two_scalars'), - ('baddbmm', small_3d, lambda t: [small_3d(t), small_3d(t)],), - ('baddbmm', small_3d, lambda t: [number(0.4, 2, t), small_3d(t), small_3d(t)], 'scalar'), - ('baddbmm', small_3d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), small_3d(t), small_3d(t)], 'two_scalars'), - ('addcdiv', small_2d_lapack, lambda t: [tensor_mul(small_2d_lapack(t), 2), small_2d_lapack(t)],), - ('addcdiv', small_2d_lapack, lambda t: [number(2.8, 1, t), - tensor_mul(small_2d_lapack(t), 2), small_2d_lapack(t)], 'scalar'), - ('addcmul', small_3d, lambda t: [small_3d(t), small_3d(t)],), - ('addcmul', small_3d, lambda t: [number(0.4, 2, t), small_3d(t), small_3d(t)], 'scalar'), - ('addmm', medium_2d, lambda t: [medium_2d(t), medium_2d(t)],), - ('addmm', medium_2d, lambda t: [number(0.4, 2, t), medium_2d(t), medium_2d(t)], 'scalar'), - ('addmm', medium_2d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), medium_2d(t), medium_2d(t)], 'two_scalars'), - ('addmv', medium_1d, lambda t: [medium_2d(t), medium_1d(t)],), - ('addmv', medium_1d, lambda t: [number(0.4, 2, t), medium_2d(t), medium_1d(t)], 'scalar'), - ('addmv', medium_1d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), medium_2d(t), medium_1d(t)], 'two_scalars'), - ('addr', medium_2d, lambda t: [medium_1d(t), medium_1d(t)],), - ('addr', medium_2d, lambda t: [number(0.4, 2, t), medium_1d(t), medium_1d(t)], 'scalar'), - ('addr', medium_2d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), medium_1d(t), medium_1d(t)], 'two_scalars'), + ('pow', small_3d, lambda t: [number(-2., -2, t)], 'pow-2', float_types_no_half, + False, "skipIfRocm:HalfTensor,FloatTensor"), + ('pow', small_3d, lambda t: [tensor_abs_(small_3d(t))], 'tensor', float_types, False, "skipIfRocm:HalfTensor"), + ('addbmm', small_2d, lambda t: [small_3d(t), small_3d(t)], None, float_types, False, "skipIfRocm:HalfTensor"), + ('addbmm', small_2d, lambda t: [number(0.4, 2, t), small_3d(t), small_3d(t)], 'scalar', + types, False, "skipIfRocm:HalfTensor"), + ('addbmm', small_2d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), small_3d(t), small_3d(t)], 'two_scalars', + types, False, "skipIfRocm:HalfTensor"), + ('baddbmm', small_3d, lambda t: [small_3d(t), small_3d(t)], '', types, False, "skipIfRocm:HalfTensor"), + ('baddbmm', small_3d, lambda t: [number(0.4, 2, t), small_3d(t), small_3d(t)], 'scalar', + types, False, "skipIfRocm:HalfTensor"), + ('baddbmm', small_3d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), small_3d(t), small_3d(t)], 'two_scalars', + types, False, "skipIfRocm:HalfTensor"), + ('addcdiv', small_2d_lapack, lambda t: [tensor_mul(small_2d_lapack(t), 2), small_2d_lapack(t)], '', + types, False, "skipIfRocm:HalfTensor"), + ('addcdiv', small_2d_lapack, lambda t: [number(2.8, 1, t), tensor_mul(small_2d_lapack(t), 2), small_2d_lapack(t)], + 
'scalar', types, False, "skipIfRocm:HalfTensor"), + ('addcmul', small_3d, lambda t: [small_3d(t), small_3d(t)], '', types, False, "skipIfRocm:HalfTensor"), + ('addcmul', small_3d, lambda t: [number(0.4, 2, t), small_3d(t), small_3d(t)], 'scalar', + types, False, "skipIfRocm:HalfTensor"), + ('addmm', medium_2d, lambda t: [medium_2d(t), medium_2d(t)], '', types, False, "skipIfRocm:HalfTensor"), + ('addmm', medium_2d, lambda t: [number(0.4, 2, t), medium_2d(t), medium_2d(t)], 'scalar', + types, False, "skipIfRocm:HalfTensor"), + ('addmm', medium_2d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), medium_2d(t), medium_2d(t)], 'two_scalars', + types, False, "skipIfRocm:HalfTensor"), + ('addmv', medium_1d, lambda t: [medium_2d(t), medium_1d(t)], '', types, False, "skipIfRocm:HalfTensor"), + ('addmv', medium_1d, lambda t: [number(0.4, 2, t), medium_2d(t), medium_1d(t)], 'scalar', + types, False, "skipIfRocm:HalfTensor"), + ('addmv', medium_1d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), medium_2d(t), medium_1d(t)], 'two_scalars', + types, False, "skipIfRocm:HalfTensor"), + ('addr', medium_2d, lambda t: [medium_1d(t), medium_1d(t)], '', types, False, "skipIfRocm:HalfTensor"), + ('addr', medium_2d, lambda t: [number(0.4, 2, t), medium_1d(t), medium_1d(t)], 'scalar', + types, False, "skipIfRocm:HalfTensor"), + ('addr', medium_2d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), medium_1d(t), medium_1d(t)], 'two_scalars', + types, False, "skipIfRocm:HalfTensor"), ('atan2', medium_2d, lambda t: [medium_2d(t)], None, float_types + [torch.HalfTensor]), - ('fmod', small_3d, lambda t: [3], 'value'), + ('fmod', small_3d, lambda t: [3], 'value', types, False, "skipIfRocm:HalfTensor"), ('fmod', small_3d, lambda t: [small_3d_positive(t)], 'tensor'), ('chunk', medium_2d, lambda t: [4],), ('chunk', medium_2d, lambda t: [4, 1], 'dim'), @@ -296,15 +313,15 @@ tests = [ ('clone', medium_2d, lambda t: [],), ('contiguous', medium_2d, lambda t: [],), ('cross', new_t(M, 3, M), lambda t: [new_t(M, 3, M)(t)],), - ('cumprod', small_3d, lambda t: [1],), - ('cumprod', small_3d, lambda t: [-1], 'neg_dim'), - ('cumsum', small_3d, lambda t: [1],), - ('cumsum', small_3d, lambda t: [-1], 'neg_dim'), + ('cumprod', small_3d, lambda t: [1], '', types, False, "skipIfRocm:HalfTensor"), + ('cumprod', small_3d, lambda t: [-1], 'neg_dim', types, False, "skipIfRocm:HalfTensor"), + ('cumsum', small_3d, lambda t: [1], '', types, False, "skipIfRocm:HalfTensor"), + ('cumsum', small_3d, lambda t: [-1], 'neg_dim', types, False, "skipIfRocm:HalfTensor"), ('dim', small_3d, lambda t: [],), - ('dist', small_2d, lambda t: [small_2d(t)],), - ('dist', small_2d, lambda t: [small_2d(t), 3], '3_norm'), - ('dist', small_2d, lambda t: [small_2d(t), 2.5], '2_5_norm'), - ('dot', medium_1d, lambda t: [medium_1d(t)],), + ('dist', small_2d, lambda t: [small_2d(t)], '', types, False, "skipIfRocm:HalfTensor"), + ('dist', small_2d, lambda t: [small_2d(t), 3], '3_norm', types, False, "skipIfRocm:HalfTensor"), + ('dist', small_2d, lambda t: [small_2d(t), 2.5], '2_5_norm', types, False, "skipIfRocm:HalfTensor"), + ('dot', medium_1d, lambda t: [medium_1d(t)], '', types, False, "skipIfRocm:HalfTensor"), ('element_size', medium_1d, lambda t: [],), ('eq', small_3d_ones, lambda t: [small_3d(t)],), ('eq', small_3d_ones, lambda t: [small_3d_ones(t)], 'equal'), @@ -314,7 +331,7 @@ tests = [ ('equal', small_3d_ones, lambda t: [small_3d(t)],), ('expand', new_t(M, 1, M), lambda t: [M, 4, M],), ('expand_as', new_t(M, 1, M), lambda t: [new_t(M, 4, M)(t)],), - ('fill', 
medium_2d, lambda t: [number(3.14, 3, t)],), + ('fill', medium_2d, lambda t: [number(3.14, 3, t)], '', types, False, "skipIfRocm:HalfTensor"), ('ge', medium_2d, lambda t: [medium_2d(t)],), ('le', medium_2d, lambda t: [medium_2d(t)],), ('gt', medium_2d, lambda t: [medium_2d(t)],), @@ -328,31 +345,33 @@ tests = [ ('kthvalue', small_3d_unique, lambda t: [3],), ('kthvalue', small_3d_unique, lambda t: [3, 1], 'dim'), ('kthvalue', small_3d_unique, lambda t: [3, -1], 'neg_dim'), - ('lerp', small_3d, lambda t: [small_3d(t), 0.3],), - ('max', small_3d_unique, lambda t: [],), - ('max', small_3d_unique, lambda t: [1], 'dim'), - ('max', small_3d_unique, lambda t: [-1], 'neg_dim'), + ('lerp', small_3d, lambda t: [small_3d(t), 0.3], '', types, False, "skipIfRocm:HalfTensor"), + ('max', small_3d_unique, lambda t: [], '', types, False, "skipIfRocm:HalfTensor"), + ('max', small_3d_unique, lambda t: [1], 'dim', types, False, skipIfRocm), + ('max', small_3d_unique, lambda t: [-1], 'neg_dim', types, False, skipIfRocm), ('max', medium_2d, lambda t: [medium_2d(t)], 'elementwise'), - ('min', small_3d_unique, lambda t: [],), - ('min', small_3d_unique, lambda t: [1], 'dim'), - ('min', small_3d_unique, lambda t: [-1], 'neg_dim'), + ('min', small_3d_unique, lambda t: [], '', types, False, "skipIfRocm:HalfTensor"), + ('min', small_3d_unique, lambda t: [1], 'dim', types, False, skipIfRocm), + ('min', small_3d_unique, lambda t: [-1], 'neg_dim', types, False, skipIfRocm), ('min', medium_2d, lambda t: [medium_2d(t)], 'elementwise'), - ('mean', small_3d, lambda t: [],), - ('mean', small_3d, lambda t: [-1], 'neg_dim'), - ('mean', small_3d, lambda t: [1], 'dim'), - ('mode', small_3d, lambda t: [],), - ('mode', small_3d, lambda t: [1], 'dim'), - ('mode', small_3d, lambda t: [-1], 'neg_dim'), - ('mvlgamma', lambda t: tensor_clamp(small_2d(t), 0.1, 10), lambda t: [1], '2d_p=1', float_types_no_half), - ('mvlgamma', lambda t: tensor_clamp(small_2d(t), 0.6, 10), lambda t: [2], '2d_p=2', float_types_no_half), - ('remainder', small_3d, lambda t: [3], 'value'), + ('mean', small_3d, lambda t: [], '', types, False, "skipIfRocm:HalfTensor"), + ('mean', small_3d, lambda t: [-1], 'neg_dim', types, False, "skipIfRocm:DoubleTensor,FloatTensor,HalfTensor"), + ('mean', small_3d, lambda t: [1], 'dim', types, False, "skipIfRocm:DoubleTensor,FloatTensor,HalfTensor"), + ('mode', small_3d, lambda t: [], '', types, False, skipIfRocm), + ('mode', small_3d, lambda t: [1], 'dim', types, False, skipIfRocm), + ('mode', small_3d, lambda t: [-1], 'neg_dim', types, False, skipIfRocm), + ('mvlgamma', lambda t: tensor_clamp(small_2d(t), 0.1, 10), lambda t: [1], '2d_p=1', float_types_no_half, + False, "skipIfRocm:DoubleTensor,FloatTensor"), + ('mvlgamma', lambda t: tensor_clamp(small_2d(t), 0.6, 10), lambda t: [2], '2d_p=2', float_types_no_half, + False, "skipIfRocm:DoubleTensor,FloatTensor"), + ('remainder', small_3d, lambda t: [3], 'value', types, False, "skipIfRocm:HalfTensor"), ('remainder', small_3d, lambda t: [-3], 'negative_value', signed_types), ('remainder', small_3d, lambda t: [small_3d_positive(t)], 'tensor'), ('remainder', small_3d, lambda t: [constant_tensor_sub(0, small_3d_positive(t))], 'negative_tensor', signed_types), - ('std', small_3d, lambda t: [],), + ('std', small_3d, lambda t: [], '', types, False, "skipIfRocm:HalfTensor"), ('std', small_3d, lambda t: [1], 'dim'), ('std', small_3d, lambda t: [-1], 'neg_dim'), - ('var', small_3d, lambda t: [],), + ('var', small_3d, lambda t: [], '', types, False, "skipIfRocm:HalfTensor"), ('var', 
small_3d, lambda t: [1], 'dim'), ('var', small_3d, lambda t: [-1], 'neg_dim'), ('ndimension', small_3d, lambda t: [],), @@ -360,34 +379,37 @@ tests = [ ('numel', small_3d, lambda t: [],), ('narrow', small_3d, lambda t: [1, 3, 2],), ('narrow', small_3d, lambda t: [-1, 3, 2], 'neg_dim'), - ('nonzero', small_3d, lambda t: [],), - ('norm', small_3d, lambda t: [],), - ('norm', small_3d, lambda t: [3], '3_norm'), - ('norm', small_3d, lambda t: [3, 0], '3_norm_dim'), - ('norm', small_3d, lambda t: [3, -2], '3_norm_neg_dim'), + ('nonzero', small_3d, lambda t: [], '', types, False, skipIfRocm), + ('norm', small_3d, lambda t: [], '', types, False, "skipIfRocm:HalfTensor"), + ('norm', small_3d, lambda t: [3], '3_norm', types, False, "skipIfRocm:HalfTensor"), + ('norm', small_3d, lambda t: [3, 0], '3_norm_dim', types, False, "skipIfRocm:HalfTensor,DoubleTensor,FloatTensor"), + ('norm', small_3d, lambda t: [3, -2], '3_norm_neg_dim', types, + False, "skipIfRocm:HalfTensor,DoubleTensor,FloatTensor"), ('ones', small_3d, lambda t: [1, 2, 3, 4, 5],), ('permute', new_t(1, 2, 3, 4), lambda t: [2, 1, 3, 0],), - ('put_', new_t(2, 5, 3), lambda t: [long_type(t)([[0], [-2]]), t([[3], [4]])],), + ('put_', new_t(2, 5, 3), lambda t: [long_type(t)([[0], [-2]]), t([[3], [4]])], '', types, False, skipIfRocm), ('put_', new_t(2, 3), lambda t: [long_type(t)([]), t([])], 'empty'), ('put_', new_t(2, 2), lambda t: [long_type(t)([[1], [-3]]), t([[1], [2]]), True], 'accumulate'), - ('prod', small_2d_oneish, lambda t: [],), - ('prod', small_3d, lambda t: [1], 'dim'), - ('prod', small_3d, lambda t: [-1], 'neg_dim'), - ('sum', small_2d, lambda t: [],), - ('sum', small_3d, lambda t: [1], 'dim'), - ('sum', small_3d, lambda t: [-1], 'neg_dim'), - ('renorm', small_3d, lambda t: [2, 1, 1], '2_norm'), - ('renorm', small_3d, lambda t: [2, -1, 1], '2_norm_neg_dim'), - ('renorm', small_3d, lambda t: [1.5, 1, 1], '1_5_norm'), + ('prod', small_2d_oneish, lambda t: [], '', types, False, "skipIfRocm:HalfTensor"), + ('prod', small_3d, lambda t: [1], 'dim', types, False, skipIfRocm), + ('prod', small_3d, lambda t: [-1], 'neg_dim', types, False, skipIfRocm), + ('sum', small_2d, lambda t: [], '', types, False, "skipIfRocm:HalfTensor"), + ('sum', small_3d, lambda t: [1], 'dim', types, False, skipIfRocm), + ('sum', small_3d, lambda t: [-1], 'neg_dim', types, False, skipIfRocm), + ('renorm', small_3d, lambda t: [2, 1, 1], '2_norm', types, False, "skipIfRocm:HalfTensor,DoubleTensor,FloatTensor"), + ('renorm', small_3d, lambda t: [2, -1, 1], '2_norm_neg_dim', types, + False, "skipIfRocm:HalfTensor,DoubleTensor,FloatTensor"), + ('renorm', small_3d, lambda t: [1.5, 1, 1], '1_5_norm', types, + False, "skipIfRocm:HalfTensor,DoubleTensor,FloatTensor"), ('repeat', small_2d, lambda t: [2, 2, 2],), ('size', new_t(1, 2, 3, 4), lambda t: [],), ('size', new_t(1, 2, 3, 4), lambda t: [1], 'dim'), ('size', new_t(1, 2, 3, 4), lambda t: [-2], 'neg_dim'), - ('sort', small_3d_unique, lambda t: [],), - ('sort', small_3d_unique, lambda t: [1], 'dim'), - ('sort', small_3d_unique, lambda t: [-1], 'neg_dim'), - ('sort', small_3d_unique, lambda t: [1, True], 'dim_descending'), - ('sort', small_3d_unique, lambda t: [-1, True], 'neg_dim_descending'), + ('sort', small_3d_unique, lambda t: [], '', types, False, skipIfRocm), + ('sort', small_3d_unique, lambda t: [1], 'dim', types, False, skipIfRocm), + ('sort', small_3d_unique, lambda t: [-1], 'neg_dim', types, False, skipIfRocm), + ('sort', small_3d_unique, lambda t: [1, True], 'dim_descending', types, False, skipIfRocm), + 
('sort', small_3d_unique, lambda t: [-1, True], 'neg_dim_descending', types, False, skipIfRocm), ('split', small_3d, lambda t: [2],), ('split', small_3d, lambda t: [2, 1], 'dim'), ('split', small_3d, lambda t: [2, -3], 'neg_dim'), @@ -395,14 +417,14 @@ tests = [ ('squeeze', new_t(1, 2, 1, 4), lambda t: [2], 'dim'), ('squeeze', new_t(1, 2, 1, 4), lambda t: [-2], 'neg_dim'), ('t', new_t(1, 2), lambda t: [],), - ('take', new_t(3, 4), lambda t: [long_type(t)([[0], [-2]])],), + ('take', new_t(3, 4), lambda t: [long_type(t)([[0], [-2]])], '', types, False, skipIfRocm), ('transpose', new_t(1, 2, 3, 4), lambda t: [1, 2],), ('transpose', new_t(1, 2, 3, 4), lambda t: [-1, -2], 'neg_dim'), ('to_list', small_3d, lambda t: [],), - ('topk', small_3d_unique, lambda t: [2, 1, False, True], 'dim_sort'), - ('topk', small_3d_unique, lambda t: [2, -1, False, True], 'neg_dim_sort'), - ('topk', small_3d_unique, lambda t: [2, 1, True, True], 'dim_desc_sort'), - ('trace', medium_2d, lambda t: [],), + ('topk', small_3d_unique, lambda t: [2, 1, False, True], 'dim_sort', types, False, skipIfRocm), + ('topk', small_3d_unique, lambda t: [2, -1, False, True], 'neg_dim_sort', types, False, skipIfRocm), + ('topk', small_3d_unique, lambda t: [2, 1, True, True], 'dim_desc_sort', types, False, skipIfRocm), + ('trace', medium_2d, lambda t: [], '', types, False, "skipIfRocm:HalfTensor"), ('tril', medium_2d, lambda t: [],), ('tril', medium_2d_expanded, lambda t: [], 'zero_stride', types, True), ('tril', medium_2d, lambda t: [2], 'positive'), @@ -443,7 +465,7 @@ tests = [ unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")), ('qr', large_2d_lapack, lambda t: [], 'big', float_types, False, unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")), - ('inverse', new_t(20, 20), lambda t: [], None, float_types, False), + ('inverse', new_t(20, 20), lambda t: [], None, float_types, False, "skipIfRocm:DoubleTensor,FloatTensor"), ('geqrf', new_t(20, 20), lambda t: [], None, float_types, False, unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")), ('svd', new_t(10, 10), lambda t: [], 'square', float_types_no_half, False, @@ -760,6 +782,7 @@ class TestCuda(TestCase): pass @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") + @skipIfRocm def test_memory_stats_multigpu(self): # advance a generator with a end flag def advance(gen, end): @@ -797,6 +820,7 @@ class TestCuda(TestCase): t += 1 @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") + @skipIfRocm def test_autogpu(self): x = torch.randn(5, 5).cuda() y = torch.randn(5, 5).cuda() @@ -814,6 +838,7 @@ class TestCuda(TestCase): self.assertEqual(z.get_device(), 0) @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") + @skipIfRocm def test_new(self): x = torch.randn(3, 3).cuda() self.assertEqual(x.new([0, 1, 2]).get_device(), 0) @@ -824,6 +849,7 @@ class TestCuda(TestCase): self.assertEqual(x.new([0, 1, 2], device=1).get_device(), 1) @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") + @skipIfRocm def test_copy_device(self): x = torch.randn(5, 5).cuda() with torch.cuda.device(1): @@ -877,6 +903,7 @@ class TestCuda(TestCase): self.assertIsInstance(y.cuda().float().cpu().int(), torch.IntStorage) @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") + @skipIfRocm def test_type_conversions_same_gpu(self): x = torch.randn(5, 5).cuda(1) self.assertEqual(x.int().get_device(), 1) @@ -896,12 +923,15 @@ class TestCuda(TestCase): if input.is_cuda and input.get_device() == i: self.assertEqual(t.data_ptr(), input.data_ptr()) + @skipIfRocm 
def test_broadcast_cpu(self): self._test_broadcast(torch.randn(5, 5)) + @skipIfRocm def test_broadcast_gpu(self): self._test_broadcast(torch.randn(5, 5).cuda()) + @skipIfRocm def test_min_max_nan(self): tests = [(lambda x: x.min(), 'min'), (lambda x: x.max(), 'max'), @@ -932,6 +962,7 @@ class TestCuda(TestCase): self.assertIsInstance(bct, type(bt)) @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") + @skipIfRocm def test_broadcast_coalesced(self): numel = 5 num_bytes = numel * 8 @@ -952,6 +983,7 @@ class TestCuda(TestCase): self._test_broadcast_coalesced(self, tensors, num_bytes * 5 // 2) @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") + @skipIfRocm def test_broadcast_coalesced_dense_only(self): numel = 5 num_bytes = numel * 8 @@ -966,6 +998,7 @@ class TestCuda(TestCase): self._test_broadcast_coalesced(self, tensors, num_bytes * 5 // 2) @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") + @skipIfRocm def test_reduce_add(self): x = torch.randn(5, 5) y = torch.randn(5, 5) @@ -992,6 +1025,7 @@ class TestCuda(TestCase): self.assertEqual(rc.type(), r.type()) @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") + @skipIfRocm def test_reduce_add_coalesced(self): numel = 5 num_bytes = numel * 8 @@ -1012,6 +1046,7 @@ class TestCuda(TestCase): self._test_reduce_add_coalesced(self, tensors, num_bytes * 5 // 2) @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") + @skipIfRocm def test_reduce_add_coalesced_dense_only(self): numel = 5 num_bytes = numel * 8 @@ -1083,12 +1118,15 @@ class TestCuda(TestCase): index[dim] = slice(x.size(dim), x.size(dim) + y.size(dim)) self.assertEqual(result[tuple(index)], y) + @skipIfRocm def test_gather(self): self._test_gather(0) + @skipIfRocm def test_gather_dim(self): self._test_gather(1) + @skipIfRocm def test_from_sequence(self): seq = [list(range(i * 4, i * 4 + 4)) for i in range(5)] reference = torch.arange(0, 20).resize_(5, 4) @@ -1119,6 +1157,7 @@ class TestCuda(TestCase): self.assertEqual(torch.cuda.initial_seed(), 2) @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") + @skipIfRocm def test_cat_autogpu(self): x = torch.randn(4, 4).cuda(1) y = torch.randn(4, 4).cuda(1) @@ -1146,9 +1185,11 @@ class TestCuda(TestCase): z = torch.cat([x, y]) self.assertEqual(z.size(), (21, SIZE, SIZE)) + @skipIfRocm def test_cat_empty_legacy(self): TestTorch._test_cat_empty_legacy(self, use_cuda=True) + @skipIfRocm def test_cat_empty(self): TestTorch._test_cat_empty(self, use_cuda=True) @@ -1203,6 +1244,7 @@ class TestCuda(TestCase): self.assertEqual(copy.get_device(), original.get_device()) @unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU") + @skipIfRocm def test_multigpu_serialization(self): x = [torch.randn(4, 4).cuda(0), torch.randn(4, 4).cuda(1)] with tempfile.NamedTemporaryFile() as f: @@ -1215,6 +1257,7 @@ class TestCuda(TestCase): self.assertEqual(copy.get_device(), original.get_device()) @unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU") + @skipIfRocm def test_multigpu_serialization_remap(self): x = [torch.randn(4, 4).cuda(0), torch.randn(4, 4).cuda(1)] @@ -1233,6 +1276,7 @@ class TestCuda(TestCase): self.assertEqual(copy.get_device(), 0) @unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU") + @skipIfRocm def test_multigpu_serialization_remap_dict(self): x = [torch.randn(4, 4).cuda(0), torch.randn(4, 4).cuda(1)] with tempfile.NamedTemporaryFile() as f: @@ -1245,6 +1289,7 @@ class TestCuda(TestCase): self.assertEqual(copy.get_device(), 0) @unittest.skipIf(not TEST_MULTIGPU, "detected only 
one GPU") + @skipIfRocm def test_cuda_set_device(self): x = torch.randn(5, 5) with torch.cuda.device(1): @@ -1266,6 +1311,7 @@ class TestCuda(TestCase): def test_cuda_synchronize(self): torch.cuda.synchronize() + @skipIfRocm def test_streams(self): default_stream = torch.cuda.current_stream() user_stream = torch.cuda.Stream() @@ -1284,6 +1330,7 @@ class TestCuda(TestCase): self.assertTrue(default_stream.query()) @unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU") + @skipIfRocm def test_streams_multi_gpu(self): default_stream = torch.cuda.current_stream() self.assertEqual(default_stream.device, 0) @@ -1294,6 +1341,7 @@ class TestCuda(TestCase): self.assertNotEqual(torch.cuda.current_stream(), default_stream) @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") + @skipIfRocm def test_tensor_device(self): self.assertEqual(torch.cuda.FloatTensor(1).get_device(), 0) self.assertEqual(torch.cuda.FloatTensor(1, device=1).get_device(), 1) @@ -1302,6 +1350,7 @@ class TestCuda(TestCase): self.assertEqual(torch.cuda.FloatTensor(1, device=0).get_device(), 0) self.assertEqual(torch.cuda.FloatTensor(1, device=None).get_device(), 1) + @skipIfRocm def test_events(self): stream = torch.cuda.current_stream() event = torch.cuda.Event(enable_timing=True) @@ -1315,6 +1364,7 @@ class TestCuda(TestCase): self.assertTrue(event.query()) self.assertGreater(start_event.elapsed_time(event), 0) + @skipIfRocm def test_record_stream(self): cycles_per_ms = get_cycles_per_ms() @@ -1352,6 +1402,7 @@ class TestCuda(TestCase): x = torch.arange(0, 10).view((2, 5)) self.assertEqual(x.t(), x.t().pin_memory()) + @skipIfRocm def test_caching_pinned_memory(self): cycles_per_ms = get_cycles_per_ms() @@ -1372,6 +1423,7 @@ class TestCuda(TestCase): self.assertEqual(list(gpu_tensor), [1]) @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") + @skipIfRocm def test_caching_pinned_memory_multi_gpu(self): # checks that the events preventing pinned memory from being re-used # too early are recorded on the correct GPU @@ -1432,6 +1484,7 @@ class TestCuda(TestCase): def test_signal_window_functions(self): TestTorch._test_signal_window_functions(self, device=torch.device('cuda')) + @skipIfRocm def test_fft_ifft_rfft_irfft(self): TestTorch._test_fft_ifft_rfft_irfft(self, device=torch.device('cuda')) @@ -1463,6 +1516,7 @@ class TestCuda(TestCase): def test_stft(self): TestTorch._test_stft(self, device=torch.device('cuda')) + @skipIfRocm def test_multinomial(self): TestTorch._test_multinomial(self, torch.cuda.FloatTensor) @@ -1528,6 +1582,7 @@ class TestCuda(TestCase): self._spawn_method(test_method, torch.Tensor([0, -inf])) self._spawn_method(test_method, torch.Tensor([0, nan])) + @skipIfRocm def test_broadcast(self): TestTorch._test_broadcast(self, lambda t: t.cuda()) @@ -1540,12 +1595,15 @@ class TestCuda(TestCase): def test_broadcast_batched_matmul(self): TestTorch._test_broadcast_batched_matmul(self, lambda t: t.cuda()) + @skipIfRocm def test_index(self): TestTorch._test_index(self, lambda t: t.cuda()) + @skipIfRocm def test_advancedindex(self): TestTorch._test_advancedindex(self, lambda t: t.cuda()) + @skipIfRocm def test_advancedindex_mixed_cpu_cuda(self): def test(x, ia, ib): # test getitem @@ -1594,30 +1652,37 @@ class TestCuda(TestCase): ib = ib.to(other_device) test(x, ia, ib) + @skipIfRocm def test_advancedindex_big(self): TestTorch._test_advancedindex_big(self, lambda t: t.cuda()) + @skipIfRocm def test_btrifact(self): TestTorch._test_btrifact(self, lambda t: t.cuda()) + @skipIfRocm def 
test_btrisolve(self): TestTorch._test_btrisolve(self, lambda t: t.cuda()) + @skipIfRocm def test_dim_reduction(self): TestTorch._test_dim_reduction(self, lambda t: t.cuda()) + @skipIfRocm def test_tensor_gather(self): TestTorch._test_gather(self, lambda t: t.cuda(), False) def test_tensor_scatter(self): TestTorch._test_scatter_base(self, lambda t: t.cuda(), 'scatter_', test_bounds=False) + @skipIfRocm def test_tensor_scatterAdd(self): TestTorch._test_scatter_base(self, lambda t: t.cuda(), 'scatter_add_', test_bounds=False) def test_tensor_scatterFill(self): TestTorch._test_scatter_base(self, lambda t: t.cuda(), 'scatter_', True, test_bounds=False) + @skipIfRocm def test_min_max_inits(self): # Testing if THC_reduceAll received the correct index initialization. # This affects the result of THC_reduceAll operations at extreme values @@ -1692,6 +1757,7 @@ class TestCuda(TestCase): tensor = tensor.unsqueeze(1) self.assertEqual(tensor.var(0), 0.03125) + @skipIfRocm def test_digamma(self): def test(use_double=False): cpu_tensor = torch.randn(10, 10, 10) @@ -1720,6 +1786,7 @@ class TestCuda(TestCase): norm_errors = (gpu_out - cpu_out.cuda()) / gpu_out self.assertEqual(norm_errors, expected_errors) + @skipIfRocm def test_polygamma(self): def test(use_double=False): cpu_tensor = torch.randn(10, 10, 10) @@ -1771,6 +1838,7 @@ class TestCuda(TestCase): TestTorch._test_trtrs(self, lambda t: t.cuda()) @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") + @skipIfRocm def test_get_set_rng_state_all(self): states = torch.cuda.get_rng_state_all() before0 = torch.cuda.FloatTensor(100, device=0).normal_() @@ -1781,12 +1849,14 @@ class TestCuda(TestCase): self.assertEqual(before0, after0, 0) self.assertEqual(before1, after1, 0) + @skipIfRocm def test_nvtx(self): # Just making sure we can see the symbols torch.cuda.nvtx.range_push("foo") torch.cuda.nvtx.mark("bar") torch.cuda.nvtx.range_pop() + @skipIfRocm def test_randperm_cuda(self): cuda = torch.device('cuda:0') @@ -1825,6 +1895,7 @@ class TestCuda(TestCase): def test_random_neg_values(self): TestTorch._test_random_neg_values(self, use_cuda=True) + @skipIfRocm def test_bincount_cuda(self): TestTorch._test_bincount(self, device='cuda') # ensure CUDA code coverage @@ -1846,6 +1917,7 @@ class TestCuda(TestCase): self.assertEqual(t.cpu().bincount(), t.bincount()) self.assertEqual(t.cpu().bincount(w_cpu), t.bincount(w)) + @skipIfRocm def test_tiny_half_norm_(self): a = torch.arange(25).cuda().float() a /= 100000000 @@ -1909,6 +1981,25 @@ def generate_tests(): if t not in type_subset: continue + if TEST_WITH_ROCM and decorator is not None: + if (isinstance(decorator, str)): + tensor_type_name = str(t.__name__) + decorator_list = decorator.split(":") + skip_type_list = decorator_list[1].split(",") + if (("ByteTensor" in skip_type_list) and tensor_type_name == "ByteTensor") \ + or (("CharTensor" in skip_type_list) and tensor_type_name == "CharTensor") \ + or (("DoubleTensor" in skip_type_list) and tensor_type_name == "DoubleTensor") \ + or (("FloatTensor" in skip_type_list) and tensor_type_name == "FloatTensor") \ + or (("HalfTensor" in skip_type_list) and tensor_type_name == "HalfTensor") \ + or (("IntTensor" in skip_type_list) and tensor_type_name == "IntTensor") \ + or (("LongTensor" in skip_type_list) and tensor_type_name == "LongTensor") \ + or (("ShortTensor" in skip_type_list) and tensor_type_name == "ShortTensor"): + decorator = skipIfRocm + else: + decorator = None + elif ((not TEST_WITH_ROCM) and (decorator is not None)): + if 
(isinstance(decorator, str)): + decorator = None precision = custom_precision.get(name, TestCuda.precision) if is_half(t): diff --git a/test/test_jit.py b/test/test_jit.py index 4bda08d030c..81e8d1affa1 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -608,6 +608,7 @@ class TestJit(JitTestCase): @unittest.skipIf(IS_WINDOWS, "NYI: fuser support for Windows") @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") + @skipIfRocm def test_fusion_rand(self): class M(torch.jit.ScriptModule): __constants__ = ['d'] @@ -631,6 +632,7 @@ class TestJit(JitTestCase): @unittest.skipIf(IS_WINDOWS, "NYI: fuser support for Windows") @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") + @skipIfRocm def test_fusion_arg_configurations(self): # A smoke test to make sure we won't use the same kernel for contiguous # and non-contiguous arguments. @@ -846,6 +848,7 @@ class TestJit(JitTestCase): @unittest.skipIf(IS_WINDOWS, "NYI: fuser support for Windows") @unittest.skipIf(not RUN_CUDA_MULTI_GPU, "needs non-zero device") + @skipIfRocm def test_fuse_last_device(self): device = 'cuda:' + str(1) x = torch.tensor([0.4], dtype=torch.float, device=device) @@ -2521,6 +2524,7 @@ a") @unittest.skipIf(IS_WINDOWS, "NYI: fuser support for Windows") @unittest.skipIf(not RUN_CUDA, "No CUDA") + @skipIfRocm def test_chunk_fusion_cuda(self): def fn(x): a, b, c = x.chunk(3, 1) @@ -2536,6 +2540,7 @@ a") @unittest.skipIf(IS_WINDOWS, "NYI: fuser support for Windows") @unittest.skipIf(not RUN_CUDA, "No CUDA") + @skipIfRocm def test_chunk_multiple_fusion_cuda(self): # The arguments are intentionally used out of order as a test to see # if the fusion compiler adds extra args in the correct order @@ -2589,11 +2594,13 @@ a") self.checkScript(fn, [tensor]) @unittest.skipIf(IS_WINDOWS, "NYI: fuser support for Windows") + @skipIfRocm def test_chunk_fusion_correctness(self): return self._test_chunk_fusion_correctness(self, 'cpu') @unittest.skipIf(IS_WINDOWS, "NYI: fuser support for Windows") @unittest.skipIf(not RUN_CUDA, "No CUDA") + @skipIfRocm def test_chunk_fusion_correctness_cuda(self): return self._test_chunk_fusion_correctness(self, 'cuda') @@ -6457,6 +6464,7 @@ class TestEndToEndHybridFrontendModels(JitTestCase): self.checkTrace(Policy(), (torch.rand(1, 4),)) + @skipIfRocm def test_snli(self): # TODO: # 1) nn.LSTM is called as a Python function https://github.com/pytorch/pytorch/issues/8449 @@ -6549,6 +6557,7 @@ class TestEndToEndHybridFrontendModels(JitTestCase): self.checkTrace(SNLIClassifier(Config()), (premise, hypothesis), inputs_require_grads=False) + @skipIfRocm def test_super_resolution(self): import torch.nn.init as init @@ -6704,6 +6713,7 @@ class TestPytorchExportModes(JitTestCase): export_type=torch.onnx.ExportTypes.DIRECTORY) shutil.rmtree(d) + @skipIfRocm def test_aten_fallback(self): class ModelWithAtenNotONNXOp(nn.Module): def forward(self, x, y): diff --git a/test/test_legacy_nn.py b/test/test_legacy_nn.py index de65e6fc8ce..b446920c4fe 100644 --- a/test/test_legacy_nn.py +++ b/test/test_legacy_nn.py @@ -6,7 +6,7 @@ from copy import deepcopy import torch import torch.legacy.nn as nn -from common import to_gpu, freeze_rng_state, run_tests +from common import to_gpu, freeze_rng_state, run_tests, skipIfRocm, TEST_WITH_ROCM from common_nn import NNTestCase, ModuleTest, CriterionTest, iter_tensors, \ module_tests, criterion_tests, PRECISION from torch.autograd.gradcheck import get_numerical_jacobian @@ -66,33 +66,40 @@ tests = [ constructor_args=(3.5,), input_size=(3, 5, 4), reference_fn=lambda i, _: i + 3.5, - 
check_inplace=True), + check_inplace=True, + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.BatchNormalization, constructor_args=(10,), input_size=(4, 10), - desc='affine'), + desc='affine', + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.BatchNormalization, constructor_args=(10, 1e-3, 0.3, False), input_size=(4, 10), - desc='not_affine'), + desc='not_affine', + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.SpatialBatchNormalization, constructor_args=(3,), - input_size=(2, 3, 6, 6)), + input_size=(2, 3, 6, 6), + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.SpatialBatchNormalization, constructor_args=(3, 1e-3, 0.8), input_size=(2, 3, 6, 6), - desc='momentum'), + desc='momentum', + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.SpatialBatchNormalization, constructor_args=(3, 1e-3, 0.8, False), input_size=(2, 3, 6, 6), desc='no_affine'), OldModuleTest(nn.VolumetricBatchNormalization, constructor_args=(3,), - input_size=(2, 3, 4, 4, 4)), + input_size=(2, 3, 4, 4, 4), + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.VolumetricBatchNormalization, constructor_args=(3, 1e-3, 0.7), input_size=(2, 3, 4, 4, 4), - desc='momentum'), + desc='momentum', + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.VolumetricBatchNormalization, constructor_args=(3, 1e-3, 0.7, False), input_size=(2, 3, 4, 4, 4), @@ -100,52 +107,67 @@ tests = [ OldModuleTest(nn.CMul, constructor_args=(5, 6), input_size=(10, 5, 6), - desc='3D'), + desc='3D', + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.CMul, constructor_args=(50, 4), input_size=(1, 50, 4), - desc='3D_single_example'), + desc='3D_single_example', + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.CMul, constructor_args=(1, 5), input_fn=lambda: torch.randn(10, 3, 5)[:, 1], - desc='3D_noncontiguous'), + desc='3D_noncontiguous', + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.Exp, input_size=(2, 3, 4), - reference_fn=lambda i, _: i.exp()), + reference_fn=lambda i, _: i.exp(), + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.Log, input_fn=lambda: torch.rand(2, 3, 2) + 0.1, - reference_fn=lambda i, _: i.log()), + reference_fn=lambda i, _: i.log(), + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.Clamp, constructor_args=(-2., 5.), input_fn=lambda: torch.randn(3, 2, 50) * 6, reference_fn=lambda i, _: i.clamp(-2, 5)), OldModuleTest(nn.Abs, input_size=(3, 20, 5), - reference_fn=lambda i, _: i.abs()), + reference_fn=lambda i, _: i.abs(), + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.Bilinear, constructor_args=(2, 3, 10), - input_size=[(4, 2), (4, 3)]), + input_size=[(4, 2), (4, 3)], + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.Bilinear, constructor_args=(5, 4, 2), input_size=[(2, 5), (2, 4)], - desc='small_output'), + desc='small_output', + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.Euclidean, constructor_args=(5, 7), - input_size=(10, 5)), + input_size=(10, 5), + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.WeightedEuclidean, constructor_args=(5, 7), - input_size=(10, 5)), + input_size=(10, 5), + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.Cosine, constructor_args=(5, 7), - input_size=(10, 5)), + input_size=(10, 5), + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.CAddTable, - input_size=[(5, 7), (5, 7)]), + input_size=[(5, 7), (5, 7)], + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.CSubTable, - input_size=[(5, 7), (5, 7)]), + input_size=[(5, 7), (5, 7)], + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.CDivTable, - input_fn=lambda: [torch.randn(1, 7), torch.rand(1, 7) + 0.1]), + input_fn=lambda: 
[torch.randn(1, 7), torch.rand(1, 7) + 0.1], + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.CMulTable, - input_size=[(5, 7), (5, 7)]), + input_size=[(5, 7), (5, 7)], + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.Square, input_size=(10, 2, 4), reference_fn=lambda i, _: i.mul(i)), @@ -192,31 +214,37 @@ tests = [ OldModuleTest(nn.Sum, constructor_args=(1,), input_size=(2, 4, 5), - reference_fn=lambda i, _: i.sum(1, keepdim=False)), + reference_fn=lambda i, _: i.sum(1, keepdim=False), + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.Sum, constructor_args=(1, True), input_size=(2, 4, 5), reference_fn=lambda i, _: i.sum(1, keepdim=False).div(i.size(1)), - desc='sizeAverage'), + desc='sizeAverage', + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.Mean, constructor_args=(1,), input_size=(2, 4, 5), - reference_fn=lambda i, _: torch.mean(i, 1, keepdim=False)), + reference_fn=lambda i, _: torch.mean(i, 1, keepdim=False), + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(lambda: nn.Sequential().add(nn.GradientReversal()).add(nn.GradientReversal()), input_size=(4, 3, 2, 2), - fullname='GradientReversal'), + fullname='GradientReversal', + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.Identity, input_size=(4, 3, 2, 4), reference_fn=lambda i, _: i), OldModuleTest(nn.DotProduct, input_size=[(10, 4), (10, 4)], reference_fn=lambda i, _: torch.Tensor(list( - a.dot(b) for a, b in zip(i[0], i[1]))) + a.dot(b) for a, b in zip(i[0], i[1]))), + test_cuda=(not TEST_WITH_ROCM) ), OldModuleTest(nn.CosineDistance, input_size=[(10, 4), (10, 4)], reference_fn=lambda i, _: torch.Tensor(list( - a.dot(b) / (a.norm(2) * b.norm(2)) for a, b in zip(i[0], i[1]))) + a.dot(b) / (a.norm(2) * b.norm(2)) for a, b in zip(i[0], i[1]))), + test_cuda=(not TEST_WITH_ROCM) ), OldModuleTest(nn.JoinTable, constructor_args=(0,), @@ -256,19 +284,23 @@ tests = [ reference_fn=lambda i, _: torch.min(i, 1, False)[0], desc='with_dimension'), OldModuleTest(nn.MixtureTable, - input_size=[(5, 3), (5, 3, 6)]), + input_size=[(5, 3), (5, 3, 6)], + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.LookupTable, constructor_args=(4, 3), input_fn=lambda: torch.randperm(2).repeat(1, 2), - jacobian_input=False), + jacobian_input=False, + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.Mul, input_size=(2, 3, 4, 2), - reference_fn=lambda i, p: i * p[0][0]), + reference_fn=lambda i, p: i * p[0][0], + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.MulConstant, constructor_args=(4,), input_size=(2, 3, 4, 2), reference_fn=lambda i, _: i * 4, - check_inplace=True), + check_inplace=True, + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.Narrow, constructor_args=(0, 0), input_size=(2, 3, 4, 2), @@ -291,7 +323,8 @@ tests = [ OldModuleTest(nn.Replicate, constructor_args=(2, 1), input_size=(10, 3, 4, 5), - reference_fn=lambda i, _: i.view(10, 1, 3, 4, 5).expand(10, 2, 3, 4, 5)), + reference_fn=lambda i, _: i.view(10, 1, 3, 4, 5).expand(10, 2, 3, 4, 5), + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.Padding, constructor_args=(0, 2, -10), input_size=(2, 3, 4, 5)), @@ -305,17 +338,21 @@ tests = [ desc='negative_pad'), OldModuleTest(nn.PartialLinear, constructor_args=(5, 6), - input_size=(4, 5)), + input_size=(4, 5), + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(lambda: nn.PartialLinear(5, 6).setPartition(torch.Tensor((2, 4))), input_size=(4, 5), - fullname='PartialLinear_setPartition'), + fullname='PartialLinear_setPartition', + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.Power, constructor_args=(2,), - input_size=(2, 3, 4, 5)), + 
input_size=(2, 3, 4, 5), + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.Power, constructor_args=(1.5,), input_fn=lambda: torch.rand(3, 4, 5), - desc='fractional'), + desc='fractional', + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.Reshape, constructor_args=(4, 5), input_size=(3, 4 * 5), @@ -375,10 +412,12 @@ tests = [ desc='stride_pad'), OldModuleTest(nn.SpatialDivisiveNormalization, constructor_args=(3,), - input_size=(2, 3, 8, 8)), + input_size=(2, 3, 8, 8), + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.SpatialContrastiveNormalization, constructor_args=(3,), - input_size=(2, 3, 8, 8)), + input_size=(2, 3, 8, 8), + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.SpatialDilatedConvolution, constructor_args=(3, 2, 3, 3, 2, 2, 1, 1, 2, 2), input_size=(2, 3, 8, 8)), @@ -436,13 +475,15 @@ tests = [ input_size=(1, 3, 7, 7)), OldModuleTest(nn.SpatialLPPooling, constructor_args=(3, 2, 2, 2, 2, 2), - input_size=(1, 3, 7, 7)), + input_size=(1, 3, 7, 7), + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.SpatialSubSampling, constructor_args=(3, 3, 3, 2, 2), input_size=(1, 3, 7, 7)), OldModuleTest(nn.SpatialSubtractiveNormalization, constructor_args=(3,), - input_size=(1, 3, 7, 7)), + input_size=(1, 3, 7, 7), + test_cuda=(not TEST_WITH_ROCM)), OldModuleTest(nn.SpatialSubtractiveNormalization, constructor_args=(3, torch.rand(3)), input_size=(1, 3, 7, 7), @@ -521,7 +562,8 @@ tests = [ CriterionTest(nn.WeightedMSECriterion, constructor_args_fn=lambda: (torch.rand(3, 4, 5),), input_size=(2, 3, 4, 5), - target_size=(2, 3, 4, 5)), + target_size=(2, 3, 4, 5), + test_cuda=(not TEST_WITH_ROCM)), CriterionTest(nn.MarginCriterion, input_size=(5, 10), target_fn=lambda: torch.randn(5, 10).sign()), @@ -544,14 +586,16 @@ for p in (1, 2, 1.5): input_size=(4, 5), # Eh, we need to use p as a default, so it's passed by value reference_fn=lambda i, _, p=p: i.div(i.norm(p, 1, True).expand_as(i)), - desc=str(p)), + desc=str(p), + test_cuda=(not TEST_WITH_ROCM)), ) for p in range(1, 4 + 1): tests.append( OldModuleTest(nn.PairwiseDistance, constructor_args=(p,), input_size=[(4, 10), (4, 10)], - desc=str(p)) + desc=str(p), + test_cuda=(not TEST_WITH_ROCM)) ) @@ -613,6 +657,10 @@ def prepare_tests(): 'KLDivLoss': 'DistKLDivCriterion', } for test in tests: + name = test.get_name() + if ((name == "test_Max" or name == "test_Min" or name == "test_Max_with_dimension" or + name == "test_Min_with_dimension") and TEST_WITH_ROCM): + continue add_test(test) for test_params in module_tests: test_params = deepcopy(test_params) diff --git a/test/test_nn.py b/test/test_nn.py index 994f0af89f9..209b2cd0efa 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -29,7 +29,7 @@ from torch.autograd import Variable, gradcheck from torch.autograd.gradcheck import gradgradcheck from torch.nn import Parameter from torch.nn.parallel._functions import Broadcast -from common import freeze_rng_state, run_tests, TestCase, skipIfNoLapack, \ +from common import freeze_rng_state, run_tests, TestCase, skipIfNoLapack, skipIfRocm, TEST_WITH_ROCM, \ TEST_NUMPY, TEST_SCIPY, IS_WINDOWS, download_file, PY3, PY34, to_gpu, \ get_function_arglist, skipCUDAMemoryLeakCheckIf from common_cuda import TEST_CUDA, TEST_MULTIGPU, TEST_CUDNN, \ @@ -47,8 +47,10 @@ if TEST_NUMPY: import numpy as np ALL_TENSORTYPES = [torch.float, - torch.double, - torch.half] + torch.double] + +if not TEST_WITH_ROCM: + ALL_TENSORTYPES.append(torch.half) NO_HALF_TENSORTYPES = [torch.float, torch.double] @@ -237,6 +239,7 @@ class NewModuleTest(InputVariableMixin, ModuleTest): 
self.cudnn = kwargs.get('cudnn', False) self.check_inplace = kwargs.get('check_inplace', False) self.check_gradgrad = kwargs.get('check_gradgrad', True) + self.skip_double = kwargs.get('skip_double', False) def _do_test(self, test_case, module, input): test_case.check_jacobian(module, input, self.jacobian_input) @@ -357,21 +360,23 @@ class NewModuleTest(InputVariableMixin, ModuleTest): test_case.assertIsInstance(p, torch.cuda.FloatTensor) test_case.assertEqual(p.get_device(), 1) - # test double() - input = input.double().cuda() - module.double().cuda() - module(input) - for p in module.parameters(): - test_case.assertIsInstance(p, torch.cuda.DoubleTensor) - test_case.assertEqual(p.get_device(), 0) + if not self.skip_double: + # test double() + input = input.double().cuda() + module.double().cuda() + module(input) + for p in module.parameters(): + test_case.assertIsInstance(p, torch.cuda.DoubleTensor) + test_case.assertEqual(p.get_device(), 0) - # test half() - input = input.half().cuda() - module.half().cuda() - module(input) - for p in module.parameters(): - test_case.assertIsInstance(p, torch.cuda.HalfTensor) - test_case.assertEqual(p.get_device(), 0) + if not TEST_WITH_ROCM: + # test half() + input = input.half().cuda() + module.half().cuda() + module(input) + for p in module.parameters(): + test_case.assertIsInstance(p, torch.cuda.HalfTensor) + test_case.assertEqual(p.get_device(), 0) def _get_target(self): return self._get_arg('target', False) @@ -1845,6 +1850,7 @@ class TestNN(NNTestCase): @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") @repeat_test_for_types(ALL_TENSORTYPES) + @skipIfRocm def test_embedding_max_norm_cuda(self, dtype=torch.float): embedding = nn.Embedding(22, 5, max_norm=1.0).to("cuda", dtype=dtype) # nn.Embedding only takes LongTensor as input @@ -1932,6 +1938,7 @@ class TestNN(NNTestCase): @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") @repeat_test_for_types(ALL_TENSORTYPES) + @skipIfRocm def test_gumbel_softmax_st_cuda(self, dtype=torch.float): self._test_gumbel_softmax_st(True, dtype=dtype) @@ -2099,6 +2106,7 @@ class TestNN(NNTestCase): y.backward(grad) @unittest.skipIf(not TEST_CUDNN, "needs cudnn") + @skipIfRocm def test_contig_wrong_stride_cudnn(self): # x has to have batch_size 1 to test contiguous checks x = torch.randn(1, 16, 5, 5, device="cuda") @@ -2120,6 +2128,7 @@ class TestNN(NNTestCase): @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") @repeat_test_for_types(ALL_TENSORTYPES) + @skipIfRocm def test_embedding_bag_cuda(self, dtype=torch.float): self._test_EmbeddingBag(True, 'sum', False, dtype) self._test_EmbeddingBag(True, 'mean', False, dtype) @@ -2169,11 +2178,13 @@ class TestNN(NNTestCase): self._test_dropout(nn.Dropout3d, False, input) @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") + @skipIfRocm def test_Dropout_cuda(self): input = torch.Tensor(1000) self._test_dropout(nn.Dropout, True, input) @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") + @skipIfRocm def test_Dropout2d_cuda(self): b = random.randint(1, 5) w = random.randint(1, 5) @@ -2183,6 +2194,7 @@ class TestNN(NNTestCase): self._test_dropout(nn.Dropout2d, True, input) @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") + @skipIfRocm def test_Dropout3d_cuda(self): b = random.randint(1, 5) w = random.randint(1, 5) @@ -2288,6 +2300,7 @@ class TestNN(NNTestCase): self._test_InstanceNorm_general(nn.InstanceNorm1d, input, dtype=torch.float) @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") + @skipIfRocm def test_InstanceNorm1d_general_cuda(self): b = random.randint(3, 5) c 
= random.randint(3, 5) @@ -2307,6 +2320,7 @@ class TestNN(NNTestCase): self._test_InstanceNorm_general(nn.InstanceNorm2d, input, dtype=torch.float) @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") + @skipIfRocm def test_InstanceNorm2d_general_cuda(self): b = random.randint(3, 5) c = random.randint(3, 5) @@ -2328,6 +2342,7 @@ class TestNN(NNTestCase): self._test_InstanceNorm_general(nn.InstanceNorm3d, input, dtype=torch.float) @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") + @skipIfRocm def test_InstanceNorm3d_general_cuda(self): b = random.randint(3, 5) c = random.randint(2, 5) @@ -2392,6 +2407,7 @@ class TestNN(NNTestCase): self._test_LayerNorm_general() @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") + @skipIfRocm def test_LayerNorm_general_cuda(self): self._test_LayerNorm_general("cuda") self._test_LayerNorm_cuda_half() @@ -2456,6 +2472,7 @@ class TestNN(NNTestCase): self._test_GroupNorm_general(dtype=torch.float) @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") + @skipIfRocm def test_GroupNorm_general_cuda(self): self._test_GroupNorm_general("cuda", torch.float) self._test_GroupNorm_cuda_half() @@ -2571,6 +2588,7 @@ class TestNN(NNTestCase): @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") @repeat_test_for_types(ALL_TENSORTYPES) + @skipIfRocm def test_Conv2d_naive_groups_cuda(self, dtype=torch.float): self._test_Conv2d_naive_groups("cuda", dtype) @@ -2578,6 +2596,7 @@ class TestNN(NNTestCase): self._test_batchnorm_eval() @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") + @skipIfRocm def test_batchnorm_eval_cuda(self, dtype=torch.float): self._test_batchnorm_eval("cuda", dtype) @@ -2585,6 +2604,7 @@ class TestNN(NNTestCase): self._test_batchnorm_simple_average() @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") + @skipIfRocm def test_batchnorm_simple_average_cuda(self): self._test_batchnorm_simple_average(torch.cuda.FloatTensor) @@ -2609,6 +2629,7 @@ class TestNN(NNTestCase): @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") @repeat_test_for_types(ALL_TENSORTYPES) + @skipIfRocm def test_MaxPool3d_indices_cuda(self, dtype=torch.float): self._test_maxpool_indices(3, device="cuda", dtype=dtype) @@ -2617,6 +2638,7 @@ class TestNN(NNTestCase): @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") @repeat_test_for_types(ALL_TENSORTYPES) + @skipIfRocm def test_AdaptiveMaxPool1d_indices_cuda(self, dtype=torch.float): self._test_maxpool_indices(1, adaptive=True, device="cuda", dtype=dtype) @@ -2625,6 +2647,7 @@ class TestNN(NNTestCase): @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") @repeat_test_for_types(ALL_TENSORTYPES) + @skipIfRocm def test_AdaptiveMaxPool2d_indices_cuda(self, dtype=torch.float): self._test_maxpool_indices(2, adaptive=True, device="cuda", dtype=dtype) @@ -2632,6 +2655,7 @@ class TestNN(NNTestCase): self._test_maxpool_indices(3, adaptive=True) @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") + @skipIfRocm @repeat_test_for_types(ALL_TENSORTYPES) def test_AdaptiveMaxPool3d_indices_cuda(self, dtype=torch.float): self._test_maxpool_indices(3, adaptive=True, device="cuda", dtype=dtype) @@ -2669,10 +2693,12 @@ class TestNN(NNTestCase): _assertGradAndGradgradChecks(self, lambda y: dp.scatter(y, (0, 1)), (x,)) @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") + @skipIfRocm def test_scatter_cpu(self): self._test_scatter(torch.randn(4, 4)) @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") + @skipIfRocm def test_scatter_gpu(self): self._test_scatter(torch.randn(4, 4).cuda()) @@ -2719,14 +2745,17 @@ class TestNN(NNTestCase): 
_assertGradAndGradgradChecks(self, lambda x, y: dp.gather((x, y), output_device), inputs) @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") + @skipIfRocm def test_gather_cpu(self): self._test_gather(-1) @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") + @skipIfRocm def test_gather_gpu(self): self._test_gather(0) @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") + @skipIfRocm def test_gather_different_len_dicts(self): inputs = ( {'a': Variable(torch.randn(1, 2).cuda(0), requires_grad=True)}, @@ -2743,12 +2772,14 @@ class TestNN(NNTestCase): _assertGradAndGradgradChecks(self, lambda *i: Broadcast.apply((0, 1), *i), variables) @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") + @skipIfRocm def test_broadcast_double_backwards_gpu(self): self._test_broadcast_double_backwards(torch.randn(4, 4).cuda(), torch.randn(4, 4).cuda(), torch.randn(4, 4).cuda()) @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") + @skipIfRocm def test_broadcast_not_requiring_grad(self): variables = [ Variable(torch.randn(1, 2).cuda(), requires_grad=True), @@ -2763,6 +2794,7 @@ class TestNN(NNTestCase): self.assertEqual(input_var.requires_grad, broadcasted_var.requires_grad) @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") + @skipIfRocm def test_broadcast_no_grad(self): x = torch.randn(1, 2, dtype=torch.float32, requires_grad=True, device='cuda') with torch.no_grad(): @@ -2772,6 +2804,7 @@ class TestNN(NNTestCase): self.assertFalse(output.requires_grad) @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") + @skipIfRocm def test_replicate(self): module = nn.Linear(10, 5).float().cuda() input = Variable(torch.randn(2, 10).float().cuda()) @@ -2784,6 +2817,7 @@ class TestNN(NNTestCase): self.assertEqual(replica(replica_input).data, expected_output) @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") + @skipIfRocm def test_replicate_buffers(self): net = nn.Module() net.bn = nn.BatchNorm2d(10) @@ -2795,6 +2829,7 @@ class TestNN(NNTestCase): self.assertEqual(replica.bn.num_batches_tracked.get_device(), i, 'buffer on wrong device') @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") + @skipIfRocm def test_parallel_apply(self): l1 = nn.Linear(10, 5).to("cuda:0", torch.float) l2 = nn.Linear(10, 5).to("cuda:1", torch.float) @@ -2813,6 +2848,7 @@ class TestNN(NNTestCase): self.assertEqual(out.data, expected) @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") + @skipIfRocm def test_data_parallel_multiple_input(self): class TestModule(nn.Module): @@ -2879,6 +2915,7 @@ class TestNN(NNTestCase): local_test(out) @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") + @skipIfRocm def test_data_parallel_small_back(self): l = nn.Linear(10, 5).float().cuda() i = Variable(torch.randn(20, 10).float().cuda()) @@ -2886,6 +2923,7 @@ class TestNN(NNTestCase): self.assertEqual(out, l(i)) @unittest.skipIf(not TEST_MULTIGPU or not PY3, "multi-GPU not supported") + @skipIfRocm def test_data_parallel_model_no_refcycles(self): # Python 2.7 will create reference cycles with the following # Module on multiple GPUs, but Python 3 shouldn't unless @@ -2909,6 +2947,7 @@ class TestNN(NNTestCase): self.assertEqual(refcycles, 0) @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") + @skipIfRocm def test_data_parallel_no_grad(self): test = self @@ -2924,6 +2963,7 @@ class TestNN(NNTestCase): self.assertRaises(AssertionError, lambda: dp.data_parallel(l, i, (0, 1))) @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") 
+ @skipIfRocm def test_data_parallel(self): l = nn.Linear(10, 5).float().cuda() i = Variable(torch.randn(20, 10).float().cuda(1)) @@ -2952,6 +2992,7 @@ class TestNN(NNTestCase): out = dp.data_parallel(l, i) @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") + @skipIfRocm def test_data_parallel_sparse(self): l = nn.Embedding(10, 5, sparse=True).to("cuda:1") i = torch.randint(10, (20, 5), device="cuda:1", dtype=torch.long) @@ -2979,6 +3020,7 @@ class TestNN(NNTestCase): out = dp.data_parallel(l, i) @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") + @skipIfRocm def test_data_parallel_nested_output(self): def fn(input): return [ @@ -3010,6 +3052,7 @@ class TestNN(NNTestCase): self.assertIsInstance(output[3]['b'][0], torch.Tensor) @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") + @skipIfRocm def test_data_parallel_nested_input(self): def fn(input): return input[1][0] @@ -3358,6 +3401,7 @@ class TestNN(NNTestCase): @unittest.skipIf(not TEST_CUDA, 'CUDA not available') @unittest.skipIf(not TEST_CUDNN, 'CUDNN not available') + @skipIfRocm def test_Conv2d_inconsistent_types_on_GPU_with_cudnn(self): inputs = Variable(torch.randn(4, 1, 7, 7).float().cuda()) weights = Variable(torch.randn(1, 1, 3, 3).double().cuda()) @@ -3374,6 +3418,7 @@ class TestNN(NNTestCase): @unittest.skipIf(not TEST_CUDA, 'CUDA not available') @unittest.skipIf(not TEST_CUDNN, 'CUDNN not available') @repeat_test_for_types(ALL_TENSORTYPES) + @skipIfRocm def test_Conv2d_deterministic_cudnn(self, dtype=torch.float): inputs = torch.randn(2, 3, 5, 5, device="cuda", dtype=dtype, requires_grad=True) with cudnn.flags(enabled=True, benchmark=True, deterministic=True): @@ -3510,6 +3555,7 @@ class TestNN(NNTestCase): # For https://github.com/pytorch/pytorch/pull/1273 # Almost identical to the above `test_Conv2d_naive_groups` + @skipIfRocm def test_Conv2d_groups_nobias(self): dev_dtypes = [("cpu", torch.float)] if TEST_CUDA: @@ -3544,6 +3590,7 @@ class TestNN(NNTestCase): # Very similar to test_Conv2d_naive_groups but with special care to handle # the number of groups == number of input channels @unittest.skipIf(not TEST_CUDA, 'CUDA not available') + @skipIfRocm @repeat_test_for_types(ALL_TENSORTYPES) def test_Conv2d_depthwise_naive_groups_cuda(self, dtype=torch.float): for depth_multiplier in [1, 2]: @@ -3891,6 +3938,7 @@ class TestNN(NNTestCase): @unittest.skipIf(not TEST_CUDA, 'CUDA not available') @repeat_test_for_types(ALL_TENSORTYPES) + @skipIfRocm def test_variable_sequence_cuda(self, dtype=torch.float): self._test_variable_sequence("cuda", dtype) @@ -3908,6 +3956,7 @@ class TestNN(NNTestCase): (hx + cx).sum().backward() @unittest.skipIf(not (TEST_CUDNN and TEST_MULTIGPU), 'CUDNN or multi-gpu not available') + @skipIfRocm def test_cudnn_rnn_dropout_states_device(self): rnn = nn.RNN(10, 20, num_layers=2, dropout=.5) device = 1 @@ -3917,6 +3966,7 @@ class TestNN(NNTestCase): output = rnn(input, hx) @unittest.skipIf(not TEST_CUDNN, 'CUDNN not available') + @skipIfRocm def test_cudnn_weight_format(self): rnns = [ nn.LSTM(10, 20, batch_first=True), @@ -3964,6 +4014,7 @@ class TestNN(NNTestCase): self.assertEqual(weight_data, all_vars[4].data) @unittest.skipIf(not TEST_CUDNN, 'CUDNN not available') + @skipIfRocm def test_cudnn_weight_tying(self): rnns = [ nn.LSTM(10, 20, batch_first=True, bidirectional=True), @@ -3997,6 +4048,7 @@ class TestNN(NNTestCase): @unittest.skipIf(not TEST_CUDA, 'CUDA not available') @repeat_test_for_types(NO_HALF_TENSORTYPES) + @skipIfRocm def test_cuda_rnn_fused(self, 
dtype=torch.float): def copy_rnn(rnn1, rnn2): @@ -4161,6 +4213,7 @@ class TestNN(NNTestCase): @unittest.skipIf(not TEST_CUDA, 'CUDA not available') @repeat_test_for_types(ALL_TENSORTYPES) + @skipIfRocm def test_rnn_retain_variables_cuda(self, dtype=torch.float): with torch.backends.cudnn.flags(enabled=False): self._test_rnn_retain_variables("cuda", dtype) @@ -4308,6 +4361,7 @@ class TestNN(NNTestCase): @unittest.skipIf(not TEST_CUDNN, "needs cudnn") @default_tensor_type(torch.FloatTensor) # FIXME: just until torch.cuda.DoubleTensor.sum() implemented + @skipIfRocm def test_RNN_cpu_vs_cudnn_no_dropout(self): self._test_RNN_cpu_vs_cudnn(0) @@ -4462,6 +4516,7 @@ class TestNN(NNTestCase): @unittest.skipIf(not TEST_CUDA, 'CUDA not available') @repeat_test_for_types(ALL_TENSORTYPES) + @skipIfRocm def test_noncontig_conv_grad_cuda(self, dtype=torch.float): # FIXME: remove after adding non-contiguous grad tests for all modules module = nn.Conv2d(3, 5, kernel_size=3, padding=1).to("cuda", dtype) @@ -4661,6 +4716,7 @@ class TestNN(NNTestCase): gradgradcheck(func, [v]) @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") + @skipIfRocm def test_batchnorm_cudnn_half(self): # THNN input = torch.randint(1, 10, (2, 3, 2, 2), dtype=torch.half, device="cuda", requires_grad=True) @@ -4708,6 +4764,7 @@ class TestNN(NNTestCase): self._test_batchnorm_update_stats() @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") + @skipIfRocm def test_batchnorm_update_stats_cuda(self): self._test_batchnorm_update_stats("cuda", torch.float) @@ -5002,6 +5059,7 @@ class TestNN(NNTestCase): with self.assertRaisesRegex(RuntimeError, "expected input and grid to be on same device"): F.grid_sample(input.cuda(), grid) + @skipIfRocm def test_grid_sample(self): def test(N, C, H, W, mode, padding_mode): def test_shape(N, C, IH, IW, H, W, mode, padding_mode): @@ -5149,6 +5207,7 @@ class TestNN(NNTestCase): with cudnn.flags(enabled=False): test(N, C, H, W, mode, padding_mode) + @skipIfRocm def test_grid_sample_3d(self): def test(N, C, D, H, W, mode, padding_mode): def test_shape(N, C, ID, IH, IW, D, H, W, mode, padding_mode): @@ -5262,6 +5321,7 @@ class TestNN(NNTestCase): test(N, C, D, H, W, mode, padding_mode) + @skipIfRocm def test_affine_grid(self): # test known input on CPU input = torch.arange(1., 7).view(1, 2, 3) @@ -5760,6 +5820,7 @@ class TestNN(NNTestCase): self._test_conv_noncontig_weights(self, torch.device('cpu')) @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") + @skipIfRocm def test_conv_noncontig_weights_cuda(self): self._test_conv_noncontig_weights(self, torch.device('cuda')) @@ -5886,6 +5947,7 @@ class TestNN(NNTestCase): no_weight) @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") + @skipIfRocm def test_cudnn_noncontiguous_weight(self): # Noncontiguous weights must be contiguous() before being # passed to cuDNN @@ -6494,9 +6556,12 @@ def add_test(test, decorator=None): test=test, kwargs=kwargs: test.test_cuda(self, dtype=torch.float, **kwargs)) add(cuda_test_name + '_double', lambda self, test=test, kwargs=kwargs: test.test_cuda(self, dtype=torch.double, **kwargs)) + + @skipIfRocm + def test_half(self, test=test, kwargs=kwargs): + test.test_cuda(self, dtype=torch.half, **kwargs) if getattr(test, 'check_half', True): - add(cuda_test_name + '_half', lambda self, - test=test: test.test_cuda(self, dtype=torch.half, **kwargs)) + add(cuda_test_name + '_half', test_half) else: add(cuda_test_name, lambda self, test=test, kwargs=kwargs: test.test_cuda(self, **kwargs)) @@ -6512,14 +6577,16 @@ new_criterion_tests = [ dict( 
module_name='BCEWithLogitsLoss', input_fn=lambda: torch.rand(15, 10).clamp_(1e-2, 1 - 1e-2), - target_fn=lambda: torch.randn(15, 10).gt(0).double() + target_fn=lambda: torch.randn(15, 10).gt(0).double(), + decorator=skipIfRocm, ), dict( module_name='BCEWithLogitsLoss', constructor_args=(torch.rand(10),), input_fn=lambda: torch.rand(15, 10).clamp_(1e-2, 1 - 1e-2), target_fn=lambda: torch.randn(15, 10).gt(0).double(), - desc='weights' + desc='weights', + decorator=skipIfRocm, ), dict( module_name='BCEWithLogitsLoss', @@ -6535,7 +6602,8 @@ new_criterion_tests = [ reference_fn=lambda i, t, m: loss_reference_fns['NLLLossNd'](i, t, reduction=get_reduction(m)), check_sum_reduction=True, - desc='2d' + desc='2d', + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='NLLLoss', @@ -6545,6 +6613,7 @@ new_criterion_tests = [ reference_fn=lambda i, t, m: loss_reference_fns['NLLLossNd'](i, t, weight=get_weight(m)), desc='2d_weights', + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='NLLLoss', @@ -6554,6 +6623,7 @@ new_criterion_tests = [ reference_fn=lambda i, t, m: loss_reference_fns['NLLLossNd'](i, t, ignore_index=1), desc='2d_ignore_index', + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='NLLLoss', @@ -6562,7 +6632,8 @@ new_criterion_tests = [ reference_fn=lambda i, t, m: loss_reference_fns['NLLLossNd'](i, t, reduction=get_reduction(m)), check_sum_reduction=True, - desc='higher_dim' + desc='higher_dim', + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='NLLLoss', @@ -6571,13 +6642,15 @@ new_criterion_tests = [ reference_fn=lambda i, t, m: loss_reference_fns['NLLLossNd'](i, t, reduction=get_reduction(m)), check_sum_reduction=True, - desc='dim_is_3' + desc='dim_is_3', + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='PoissonNLLLoss', input_size=(2, 3, 4, 5), target_fn=lambda: torch.randn(2, 3, 4, 5).floor_().abs_(), desc='no_full_loss', # without sterling approx + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='PoissonNLLLoss', @@ -6585,6 +6658,7 @@ new_criterion_tests = [ input_fn=lambda: torch.randn(2, 3, 4, 5).abs_().add_(0.001), target_fn=lambda: torch.randn(2, 3, 4, 5).floor_().abs_(), desc='full_loss', # with sterling approx + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='L1Loss', @@ -6657,6 +6731,7 @@ new_criterion_tests = [ desc='weights', check_sum_reduction=True, check_gradgrad=False, + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='CTCLoss', @@ -6669,6 +6744,7 @@ new_criterion_tests = [ check_sum_reduction=True, check_gradgrad=False, check_half=False, + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='CTCLoss', @@ -6682,6 +6758,7 @@ new_criterion_tests = [ check_sum_reduction=True, check_gradgrad=False, check_half=False, + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='CTCLoss', @@ -6696,6 +6773,7 @@ new_criterion_tests = [ check_gradgrad=False, check_half=False, convert_target=False, + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='CTCLoss', @@ -6710,6 +6788,7 @@ new_criterion_tests = [ check_gradgrad=False, check_half=False, convert_target=False, + test_cuda=(not TEST_WITH_ROCM), ), ] @@ -6721,7 +6800,7 @@ def poissonnllloss_no_reduce_test(): constructor=wrap_functional( lambda i: F.poisson_nll_loss(i, t.type_as(i), reduction='none')), input_fn=lambda: torch.rand(10, 10), - pickle=False) + pickle=False, test_cuda=(not TEST_WITH_ROCM)) def bceloss_no_reduce_test(): @@ -6733,7 +6812,7 @@ def bceloss_no_reduce_test(): input_fn=lambda: torch.rand(15, 10).clamp_(2.8e-2, 1 - 2.8e-2), reference_fn=lambda i, m: -(t * i.log() + (1 - t) * (1 
- i).log()), check_gradgrad=False, - pickle=False) + pickle=False, test_cuda=(not TEST_WITH_ROCM)) def bceloss_no_reduce_scalar_test(): @@ -6759,7 +6838,7 @@ def bceloss_weights_no_reduce_test(): input_fn=lambda: torch.rand(15, 10).clamp_(2.8e-2, 1 - 2.8e-2), reference_fn=lambda i, m: -(t * i.log() + (1 - t) * (1 - i).log()) * weights, check_gradgrad=False, - pickle=False) + pickle=False, test_cuda=(not TEST_WITH_ROCM)) def bceloss_weights_no_reduce_scalar_test(): @@ -6786,7 +6865,8 @@ def bce_with_logistic_no_reduce_test(): input_fn=lambda: torch.rand(15, 10).clamp_(2.8e-2, 1 - 2.8e-2), reference_fn=lambda i, m: -(t * sigmoid(i).log() + (1 - t) * (1 - sigmoid(i)).log()), check_gradgrad=False, - pickle=False) + pickle=False, + decorator=skipIfRocm) def bce_with_logistic_no_reduce_scalar_test(): @@ -6799,7 +6879,8 @@ def bce_with_logistic_no_reduce_scalar_test(): input_fn=lambda: torch.rand(()).clamp_(2.8e-2, 1 - 2.8e-2), reference_fn=lambda i, m: -(t * sigmoid(i).log() + (1 - t) * (1 - sigmoid(i)).log()), check_gradgrad=False, - pickle=False) + pickle=False, + decorator=skipIfRocm) def kldivloss_with_target_no_reduce_test(): @@ -6811,7 +6892,7 @@ def kldivloss_with_target_no_reduce_test(): input_fn=lambda: torch.rand(10, 10), reference_fn=lambda t, _: loss_reference_fns['KLDivLoss'](i.type_as(t), t, reduction='none'), - pickle=False) + pickle=False, test_cuda=(not TEST_WITH_ROCM)) def kldivloss_no_reduce_test(): @@ -6823,7 +6904,8 @@ def kldivloss_no_reduce_test(): input_fn=lambda: torch.rand(10, 10).log(), reference_fn=lambda i, _: loss_reference_fns['KLDivLoss'](i, t.type_as(i), reduction='none'), - pickle=False) + pickle=False, + decorator=skipIfRocm) def kldivloss_no_reduce_scalar_test(): @@ -6846,7 +6928,7 @@ def l1loss_no_reduce_test(): lambda i: F.l1_loss(i, t.type_as(i), reduction='none')), input_fn=lambda: torch.randn(2, 3, 4), reference_fn=lambda i, m: (i - t.type_as(i)).abs(), - pickle=False) + pickle=False, test_cuda=(not TEST_WITH_ROCM)) def l1loss_no_reduce_scalar_test(): @@ -7058,7 +7140,7 @@ def smoothl1loss_no_reduce_test(): input_fn=lambda: torch.randn(2, 3, 4), reference_fn=lambda i, _: loss_reference_fns['SmoothL1Loss'](i, t.type_as(i), reduction='none'), - pickle=False) + pickle=False, test_cuda=(not TEST_WITH_ROCM)) def smoothl1loss_no_reduce_scalar_test(): @@ -7150,7 +7232,7 @@ def softmarginloss_no_reduce_test(): input_fn=lambda: torch.randn(5, 5), reference_fn=lambda i, _: loss_reference_fns['SoftMarginLoss'](i, t.type_as(i), reduction='none'), - pickle=False) + pickle=False, test_cuda=(not TEST_WITH_ROCM)) def multilabelsoftmarginloss_no_reduce_test(): @@ -7163,7 +7245,7 @@ def multilabelsoftmarginloss_no_reduce_test(): reference_fn=lambda i, m: (-(t * i.sigmoid().log() + (1 - t) * (-i).sigmoid().log())).sum(dim=1) / i.size(1), check_gradgrad=False, - pickle=False) + pickle=False, decorator=skipIfRocm) def multilabelsoftmarginloss_weights_no_reduce_test(): @@ -7179,7 +7261,7 @@ def multilabelsoftmarginloss_weights_no_reduce_test(): (-(t * i.sigmoid().log() + (1 - t) * (-i).sigmoid().log()) * weights).sum(dim=1) / i.size(1), check_sum_reduction=True, check_gradgrad=False, - pickle=False) + pickle=False, decorator=skipIfRocm) def multimarginloss_no_reduce_test(): @@ -7304,6 +7386,8 @@ new_module_tests = [ cudnn=True, check_eval=True, desc='affine', + skip_double=TEST_WITH_ROCM, + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='BatchNorm1d', @@ -7312,6 +7396,8 @@ new_module_tests = [ cudnn=True, check_eval=True, desc='3d_input', + skip_double=TEST_WITH_ROCM, + 
test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='BatchNorm1d', @@ -7320,6 +7406,8 @@ new_module_tests = [ cudnn=True, check_eval=True, desc='affine_simple_average', + skip_double=TEST_WITH_ROCM, + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='BatchNorm1d', @@ -7328,6 +7416,8 @@ new_module_tests = [ cudnn=True, check_eval=True, desc='not_affine', + skip_double=TEST_WITH_ROCM, + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='BatchNorm1d', @@ -7336,6 +7426,8 @@ new_module_tests = [ cudnn=True, check_eval=True, desc='not_tracking_stats', + skip_double=TEST_WITH_ROCM, + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='BatchNorm1d', @@ -7344,6 +7436,8 @@ new_module_tests = [ cudnn=True, check_eval=True, desc='3d_input_not_affine', + skip_double=TEST_WITH_ROCM, + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='BatchNorm2d', @@ -7351,6 +7445,8 @@ new_module_tests = [ input_size=(2, 3, 6, 6), cudnn=True, check_eval=True, + skip_double=TEST_WITH_ROCM, + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='BatchNorm2d', @@ -7359,6 +7455,8 @@ new_module_tests = [ cudnn=True, check_eval=True, desc='2d_simple_average', + skip_double=TEST_WITH_ROCM, + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='BatchNorm2d', @@ -7367,6 +7465,8 @@ new_module_tests = [ cudnn=True, check_eval=True, desc='momentum', + skip_double=TEST_WITH_ROCM, + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='BatchNorm2d', @@ -7375,6 +7475,8 @@ new_module_tests = [ cudnn=True, check_eval=True, desc='not_affine', + skip_double=TEST_WITH_ROCM, + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='BatchNorm2d', @@ -7383,6 +7485,8 @@ new_module_tests = [ cudnn=True, check_eval=True, desc='not_tracking_stats', + skip_double=TEST_WITH_ROCM, + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='BatchNorm3d', @@ -7390,6 +7494,7 @@ new_module_tests = [ input_size=(2, 3, 4, 4, 4), cudnn=True, check_eval=True, + decorator=skipIfRocm, ), dict( module_name='BatchNorm3d', @@ -7398,6 +7503,7 @@ new_module_tests = [ cudnn=True, check_eval=True, desc='3d_simple_average', + decorator=skipIfRocm, ), dict( module_name='BatchNorm3d', @@ -7406,6 +7512,7 @@ new_module_tests = [ cudnn=True, check_eval=True, desc='momentum', + decorator=skipIfRocm, ), dict( module_name='BatchNorm3d', @@ -7414,6 +7521,7 @@ new_module_tests = [ cudnn=True, check_eval=True, desc='not_affine', + decorator=skipIfRocm, ), dict( module_name='BatchNorm3d', @@ -7422,6 +7530,7 @@ new_module_tests = [ cudnn=True, check_eval=True, desc='not_tracking_stats', + decorator=skipIfRocm, ), dict( module_name='InstanceNorm1d', @@ -7429,6 +7538,7 @@ new_module_tests = [ input_size=(4, 3, 15), cudnn=True, check_eval=True, + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='InstanceNorm1d', @@ -7437,6 +7547,7 @@ new_module_tests = [ cudnn=True, check_eval=True, desc='tracking_stats', + decorator=skipIfRocm ), dict( module_name='InstanceNorm2d', @@ -7444,6 +7555,7 @@ new_module_tests = [ input_size=(2, 3, 6, 6), cudnn=True, check_eval=True, + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='InstanceNorm2d', @@ -7452,6 +7564,7 @@ new_module_tests = [ cudnn=True, check_eval=True, desc='tracking_stats', + decorator=skipIfRocm ), dict( module_name='InstanceNorm3d', @@ -7459,6 +7572,7 @@ new_module_tests = [ input_size=(2, 3, 4, 4, 4), cudnn=True, check_eval=True, + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='InstanceNorm3d', @@ -7467,6 +7581,7 @@ new_module_tests = [ cudnn=True, check_eval=True, desc='tracking_stats', + decorator=skipIfRocm 
), dict( module_name='LayerNorm', @@ -7475,6 +7590,7 @@ new_module_tests = [ cudnn=True, check_eval=True, desc='1d_elementwise_affine', + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='LayerNorm', @@ -7483,6 +7599,7 @@ new_module_tests = [ cudnn=True, check_eval=True, desc='1d_no_elementwise_affine', + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='LayerNorm', @@ -7491,6 +7608,7 @@ new_module_tests = [ cudnn=True, check_eval=True, desc='3d_elementwise_affine', + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='LayerNorm', @@ -7499,6 +7617,7 @@ new_module_tests = [ cudnn=True, check_eval=True, desc='3d_no_elementwise_affine', + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='GroupNorm', @@ -7507,6 +7626,7 @@ new_module_tests = [ cudnn=True, check_eval=True, desc='1d_affine', + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='GroupNorm', @@ -7514,7 +7634,8 @@ new_module_tests = [ input_size=(4, 5, 5), cudnn=True, check_eval=True, - desc='1d_no_affine_IN', # this setting is equivalent with InstanceNorm + desc='1d_no_affine_IN', # this setting is equivalent with InstanceNormi + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='GroupNorm', @@ -7523,6 +7644,7 @@ new_module_tests = [ cudnn=True, check_eval=True, desc='1d_no_affine_LN', # this setting is equivalent with LayerNorm + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='GroupNorm', @@ -7531,6 +7653,7 @@ new_module_tests = [ cudnn=True, check_eval=True, desc='2d_affine', + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='GroupNorm', @@ -7539,6 +7662,7 @@ new_module_tests = [ cudnn=True, check_eval=True, desc='2d_no_affine_IN', # this setting is equivalent with InstanceNorm + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='GroupNorm', @@ -7547,12 +7671,15 @@ new_module_tests = [ cudnn=True, check_eval=True, desc='2d_no_affine_LN', # this setting is equivalent with LayerNorm + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='Conv1d', constructor_args=(4, 5, 3), input_size=(2, 4, 10), cudnn=True, + skip_double=TEST_WITH_ROCM, + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='Conv1d', @@ -7560,51 +7687,66 @@ new_module_tests = [ input_size=(2, 4, 10), cudnn=True, desc='stride', + skip_double=TEST_WITH_ROCM, + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='Conv1d', constructor_args=(4, 5, 3, 1, 1), input_size=(2, 4, 10), cudnn=True, - desc='pad1' + desc='pad1', + skip_double=TEST_WITH_ROCM, + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='Conv1d', constructor_args=(4, 5, 5, 1, 2), input_size=(2, 4, 10), cudnn=True, - desc='pad2' + desc='pad2', + skip_double=TEST_WITH_ROCM, + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='Conv1d', constructor_args=(4, 4, 3, 1, 1), input_size=(1, 4, 1), cudnn=True, - desc='pad1size1' + desc='pad1size1', + skip_double=TEST_WITH_ROCM, + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='Conv1d', constructor_args=(4, 4, 5, 1, 2), input_size=(1, 4, 1), cudnn=True, - desc='pad2size1' + desc='pad2size1', + skip_double=TEST_WITH_ROCM, + test_cuda=(not TEST_WITH_ROCM), ), dict( fullname='Conv1d_dilated', constructor=lambda: nn.Conv1d(4, 5, kernel_size=3, dilation=2), input_size=(2, 4, 10), + skip_double=TEST_WITH_ROCM, + test_cuda=(not TEST_WITH_ROCM), ), dict( fullname='Conv1d_groups', constructor=lambda: nn.Conv1d(4, 6, kernel_size=3, groups=2), input_size=(2, 4, 6), cudnn=True, + skip_double=TEST_WITH_ROCM, + test_cuda=(not TEST_WITH_ROCM), ), dict( fullname='ConvTranspose1d', constructor=lambda: nn.ConvTranspose1d(3, 4, kernel_size=3, 
stride=(3,), padding=1, output_padding=(1,)), cudnn=True, input_size=(1, 3, 7), + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='ConvTranspose1d', @@ -7612,6 +7754,7 @@ new_module_tests = [ input_size=(1, 3, 6), cudnn=True, desc='no_bias', + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='ConvTranspose1d', @@ -7619,12 +7762,14 @@ new_module_tests = [ input_size=(1, 3, 6), cudnn=True, desc='dilated', + test_cuda=(not TEST_WITH_ROCM), ), dict( fullname='ConvTranspose1d_groups', constructor=lambda: nn.ConvTranspose1d(4, 6, 3, stride=(3,), padding=1, output_padding=(1,), groups=2), cudnn=True, input_size=(2, 4, 7), + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='MaxPool1d', @@ -7642,6 +7787,7 @@ new_module_tests = [ constructor_args=(3, 4, (3, 2)), input_size=(2, 3, 7, 5), cudnn=True, + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='Conv2d', @@ -7649,6 +7795,7 @@ new_module_tests = [ input_size=(2, 3, 6, 6), cudnn=True, desc='strided', + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='Conv2d', @@ -7656,6 +7803,7 @@ new_module_tests = [ input_size=(2, 3, 6, 6), cudnn=True, desc='padding', + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='Conv2d', @@ -7663,6 +7811,7 @@ new_module_tests = [ input_size=(2, 3, 8, 8), cudnn=True, desc='dilated', + decorator=skipIfRocm, ), dict( module_name='Conv2d', @@ -7670,23 +7819,27 @@ new_module_tests = [ input_size=(2, 3, 6, 5), cudnn=True, desc='no_bias', + test_cuda=(not TEST_WITH_ROCM), ), dict( fullname='Conv2d_groups', constructor=lambda: nn.Conv2d(4, 6, (3, 2), groups=2), input_size=(2, 4, 6, 5), cudnn=True, + decorator=skipIfRocm, ), dict( fullname='Conv2d_groups_thnn', constructor=lambda: nn.Conv2d(4, 6, (3, 2), groups=2), input_size=(2, 4, 6, 5), + decorator=skipIfRocm, ), dict( module_name='ConvTranspose2d', constructor_args=(3, 4, 3, (3, 2), 1, (1, 1)), cudnn=True, input_size=(1, 3, 7, 6), + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='ConvTranspose2d', @@ -7694,6 +7847,7 @@ new_module_tests = [ input_size=(1, 3, 6, 7), cudnn=True, desc='dilated', + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='ConvTranspose2d', @@ -7701,37 +7855,44 @@ new_module_tests = [ input_size=(1, 3, 6, 7), cudnn=True, desc='no_bias', + test_cuda=(not TEST_WITH_ROCM), ), dict( fullname='ConvTranspose2d_groups', constructor=lambda: nn.ConvTranspose2d(2, 4, (2, 3), groups=2), input_size=(1, 2, 4, 5), cudnn=True, + test_cuda=(not TEST_WITH_ROCM), ), dict( fullname='Conv2d_depthwise', constructor=lambda: nn.Conv2d(4, 4, (3, 3), groups=4), input_size=(2, 4, 6, 6), + test_cuda=(not TEST_WITH_ROCM), ), dict( fullname='Conv2d_depthwise_with_multiplier', constructor=lambda: nn.Conv2d(4, 8, (3, 3), groups=4), input_size=(2, 4, 6, 6), + test_cuda=(not TEST_WITH_ROCM), ), dict( fullname='Conv2d_depthwise_strided', constructor=lambda: nn.Conv2d(4, 4, (3, 3), stride=(2, 2), groups=4), input_size=(2, 4, 6, 6), + test_cuda=(not TEST_WITH_ROCM), ), dict( fullname='Conv2d_depthwise_padded', constructor=lambda: nn.Conv2d(4, 4, (3, 3), padding=(1, 1), groups=4), input_size=(2, 4, 6, 6), + test_cuda=(not TEST_WITH_ROCM), ), dict( fullname='Conv2d_depthwise_dilated', constructor=lambda: nn.Conv2d(4, 4, (2, 2), dilation=(2, 2), groups=4), input_size=(2, 4, 5, 5), + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='MaxPool2d', @@ -7798,19 +7959,22 @@ new_module_tests = [ module_name='LocalResponseNorm', constructor_args=(3, ), input_size=(1, 5, 7), - desc='1d' + desc='1d', + test_cuda=(not TEST_WITH_ROCM) ), dict( 
module_name='LocalResponseNorm', constructor_args=(2, ), input_size=(1, 5, 7, 7), - desc='2d_uneven_pad' + desc='2d_uneven_pad', + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='LocalResponseNorm', constructor_args=(1, 1, 0.5, 2), input_size=(1, 5, 7, 7, 7), - desc='3d_custom_params' + desc='3d_custom_params', + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='ReflectionPad1d', @@ -7863,6 +8027,7 @@ new_module_tests = [ constructor_args=(3, 4, (2, 3, 4)), input_size=(2, 3, 3, 4, 5), cudnn=True, + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='Conv3d', @@ -7870,6 +8035,7 @@ new_module_tests = [ input_size=(2, 3, 3, 4, 5), cudnn=True, desc='no_bias', + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='Conv3d', @@ -7877,6 +8043,7 @@ new_module_tests = [ input_size=(2, 3, 5, 5, 5), cudnn=True, desc='stride', + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='Conv3d', @@ -7884,28 +8051,33 @@ new_module_tests = [ input_size=(2, 3, 5, 5, 5), cudnn=True, desc='stride_padding', + test_cuda=(not TEST_WITH_ROCM), ), dict( fullname='Conv3d_groups', constructor=lambda: nn.Conv3d(4, 6, kernel_size=3, groups=2), input_size=(2, 4, 4, 5, 4), cudnn=True, + test_cuda=(not TEST_WITH_ROCM), ), dict( fullname='Conv3d_dilated', constructor=lambda: nn.Conv3d(3, 4, kernel_size=2, dilation=2), input_size=(2, 3, 5, 5, 5), + test_cuda=(not TEST_WITH_ROCM), ), dict( fullname='Conv3d_dilated_strided', constructor=lambda: nn.Conv3d(3, 4, kernel_size=2, dilation=2, stride=2), input_size=(2, 3, 5, 5, 5), + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='ConvTranspose3d', constructor_args=(2, 3, (2, 3, 2)), cudnn=True, input_size=(1, 2, 4, 5, 4), + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='ConvTranspose3d', @@ -7913,6 +8085,7 @@ new_module_tests = [ cudnn=True, input_size=(1, 2, 4, 5, 4), desc='dilated', + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='MaxPool3d', @@ -7991,6 +8164,8 @@ new_module_tests = [ jacobian_input=False, check_gradgrad=False, desc='mean', + test_cuda=(not TEST_WITH_ROCM), + decorator=skipIfRocm ), dict( module_name='EmbeddingBag', @@ -7999,6 +8174,8 @@ new_module_tests = [ jacobian_input=False, check_gradgrad=False, desc='sum', + test_cuda=(not TEST_WITH_ROCM), + decorator=skipIfRocm ), dict( module_name='EmbeddingBag', @@ -8014,6 +8191,8 @@ new_module_tests = [ input_fn=lambda: torch.randperm(2).repeat(1, 2), jacobian_input=False, check_gradgrad=False, + decorator=skipIfRocm, + test_cuda=(not TEST_WITH_ROCM) ), dict( constructor=lambda: nn.Embedding(4, 3, sparse=True), @@ -8021,6 +8200,7 @@ new_module_tests = [ jacobian_input=False, fullname='Embedding_sparse', check_gradgrad=False, + test_cuda=(not TEST_WITH_ROCM) ), dict( constructor=lambda: nn.FractionalMaxPool2d( @@ -8310,7 +8490,8 @@ new_module_tests = [ input_size=(3, 2, 5), constructor_args=(2.,), check_inplace=True, - reference_fn=lambda x, _: torch.where(x >= 0, x, 2. * ((.5 * x).exp() - 1)) + reference_fn=lambda x, _: torch.where(x >= 0, x, 2. 
* ((.5 * x).exp() - 1)), + test_cuda=(not TEST_WITH_ROCM), ), dict( module_name='CELU', @@ -8323,30 +8504,35 @@ new_module_tests = [ dict( module_name='GLU', input_size=(5, 6), + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='GLU', constructor_args=(1,), input_size=(5, 6, 7), - desc='dim' + desc='dim', + test_cuda=(not TEST_WITH_ROCM) ), dict( constructor=wrap_functional(F.softmax, dim=-1), input_size=(2, 128), # trigger the last-dim algo in CUDA fullname='softmax_lastdim', pickle=False, + test_cuda=(not TEST_WITH_ROCM) ), dict( constructor=wrap_functional(F.softmax, dim=1), input_size=(2, 128, 2, 2), # trigger special case of spatial CUDA algo fullname='softmax_spatial_special', pickle=False, + test_cuda=(not TEST_WITH_ROCM) ), dict( constructor=wrap_functional(F.softmax, dim=1), input_size=(2, 2, 4, 4), # regular spatial algorithm fullname='softmax_spatial', pickle=False, + test_cuda=(not TEST_WITH_ROCM) ), dict( constructor=wrap_functional(F.softmax, dim=0), @@ -8374,30 +8560,35 @@ new_module_tests = [ input_size=(2, 128), # trigger the last-dim algo in CUDA fullname='log_softmax_lastdim', pickle=False, + test_cuda=(not TEST_WITH_ROCM) ), dict( constructor=wrap_functional(F.log_softmax, dim=1), input_size=(2, 128, 2, 2), # trigger special case of spatial CUDA algo fullname='log_softmax_spatial_special', pickle=False, + test_cuda=(not TEST_WITH_ROCM) ), dict( constructor=wrap_functional(F.log_softmax, dim=1), input_size=(2, 2, 4, 4), # regular spatial algorithm fullname='log_softmax_spatial', pickle=False, + test_cuda=(not TEST_WITH_ROCM) ), dict( constructor=wrap_functional(F.log_softmax, dim=0), input_size=(2, 3, 4, 5), fullname='log_softmax_dim0', pickle=False, + test_cuda=(not TEST_WITH_ROCM) ), dict( constructor=wrap_functional(F.log_softmax, dim=3), input_size=(2, 3, 4, 5), fullname='log_softmax_dim3', pickle=False, + test_cuda=(not TEST_WITH_ROCM) ), dict( constructor=wrap_functional(F.log_softmax, dim=0), @@ -8489,6 +8680,7 @@ new_module_tests = [ input_size=(), reference_fn=lambda i, _: torch.exp(i).div_(torch.exp(i).sum(0, False)).log_(), desc='multiparam_scalar', + test_cuda=(not TEST_WITH_ROCM) ), dict( module_name='ELU', @@ -8638,11 +8830,10 @@ class _AdaptiveLogSoftmaxWithLoss(nn.AdaptiveLogSoftmaxWithLoss): t = torch.tensor([0, 1, 4, 8]).to(input.device) return nn.AdaptiveLogSoftmaxWithLoss.__call__(self, input, t).output - add_test(NewModuleTest( constructor=lambda: _AdaptiveLogSoftmaxWithLoss(16, 10, [2, 6]), input_size=(4, 16), - fullname='AdaptiveLogSoftmax')) + fullname='AdaptiveLogSoftmax'), decorator=skipIfRocm) # The following are helpers for TestNN.test_affine_* diff --git a/test/test_torch.py b/test/test_torch.py index 8592e0b03e3..5747407f86a 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -3444,7 +3444,6 @@ class TestTorch(TestCase): self.assertRaises(TypeError, lambda: q.topk(4, True)) @unittest.skipIf(not torch.cuda.is_available(), 'no CUDA') - @skipIfRocm def test_topk_noncontiguous_gpu(self): t = torch.randn(20, device="cuda")[::2] top1, idx1 = t.topk(5) diff --git a/tools/amd_build/disabled_features.yaml b/tools/amd_build/disabled_features.yaml index 55f327243f0..74eb2669300 100644 --- a/tools/amd_build/disabled_features.yaml +++ b/tools/amd_build/disabled_features.yaml @@ -75,6 +75,7 @@ "struct curandStateMtgp32*": "curandStateMtgp32*", "__host__ void THCRandom_getRNGState": "extern \"C\" __host__ void THCRandom_getRNGState", "__host__ void THCRandom_setRNGState": "extern \"C\" __host__ void THCRandom_setRNGState", + 
"state[threadIdx.x].k = kernel;" : "state[threadIdx.x].set_params(kernel);" } }, { @@ -96,27 +97,6 @@ "struct mtgp32_kernel_params": "mtgp32_kernel_params" } }, - { - "path": "aten/src/ATen/native/cuda/CuFFTUtils.h", - "s_constants": { - "#include ": "", - "#include ": "" - } - }, - { - "path": "aten/src/ATen/native/cuda/CuFFTPlanCache.h", - "s_constants": { - "#include ": "", - "#include ": "" - } - }, - { - "path": "aten/src/ATen/native/cuda/SpectralOps.cu", - "s_constants": { - "#include ": "", - "#include ": "" - } - }, { "path": "aten/src/ATen/native/cuda/RoiPooling.cu", "s_constants": { @@ -141,9 +121,6 @@ } ], "disabled_modules": [ - "aten/src/ATen/native/cuda/CuFFTUtils.h", - "aten/src/ATen/native/cuda/CuFFTPlanCache.h", - "aten/src/ATen/native/cuda/SpectralOps.cu", ], "disabled_functions": [ { @@ -205,13 +182,6 @@ "functions": [ "THCTensor_(getTextureObject)" ] - }, - { - "path": "aten/src/THC/THCTensorRandom.cu", - "functions": [ - "THCRandom_setRNGState", - "set_rngstate_kernel" - ] } ] } diff --git a/tools/amd_build/pyHIPIFY/constants.py b/tools/amd_build/pyHIPIFY/constants.py index 092de16cff7..9dc4b5ee348 100644 --- a/tools/amd_build/pyHIPIFY/constants.py +++ b/tools/amd_build/pyHIPIFY/constants.py @@ -50,7 +50,8 @@ API_BLAS = 39 API_SPARSE = 40 API_RAND = 41 API_LAST = 42 +API_FFT = 43 HIP_UNSUPPORTED = 43 API_PYTORCH = 1337 -API_CAFFE2 = 1338 \ No newline at end of file +API_CAFFE2 = 1338 diff --git a/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py b/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py index b1ace1b44d1..6bf931c5893 100644 --- a/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py +++ b/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py @@ -271,8 +271,8 @@ CUDA_INCLUDE_MAP = { "curand_precalc.h": ("hiprand_kernel.h", CONV_INCLUDE, API_RAND), "curand_uniform.h": ("hiprand_kernel.h", CONV_INCLUDE, API_RAND), "cusparse.h": ("hipsparse.h", CONV_INCLUDE, API_RAND), - "#include ": ("", CONV_INCLUDE, API_RAND, HIP_UNSUPPORTED), - "#include ": ("", CONV_INCLUDE, API_RAND, HIP_UNSUPPORTED), + "cufft.h": ("hipfft.h", CONV_INCLUDE, API_BLAS), + "cufftXt.h": ("hipfft.h", CONV_INCLUDE, API_BLAS), "#include ": ("", CONV_INCLUDE, API_RAND, HIP_UNSUPPORTED), } @@ -2095,7 +2095,77 @@ CUDA_IDENTIFIER_MAP = { "curand_poisson": ("hiprand_poisson", CONV_DEVICE_FUNC, API_RAND), "curand_poisson4": ("hiprand_poisson4", CONV_DEVICE_FUNC, API_RAND), "curand_Philox4x32_10": ("hiprand_Philox4x32_10", CONV_DEVICE_FUNC, API_RAND, HIP_UNSUPPORTED), - "mtgp32_kernel_params": ("mtgp32_kernel_params_t", CONV_MATH_FUNC, API_RAND) + "mtgp32_kernel_params": ("mtgp32_kernel_params_t", CONV_MATH_FUNC, API_RAND), + "CUFFT_FORWARD": ("HIPFFT_FORWARD", CONV_NUMERIC_LITERAL, API_BLAS), + "CUFFT_INVERSE": ("HIPFFT_BACKWARD", CONV_NUMERIC_LITERAL, API_BLAS), + "CUFFT_COMPATIBILITY_DEFAULT": ("HIPFFT_COMPATIBILITY_DEFAULT", CONV_NUMERIC_LITERAL, API_BLAS, HIP_UNSUPPORTED), + "cufftResult_t": ("hipfftResult_t", CONV_TYPE, API_FFT), + "cufftResult": ("hipfftResult", CONV_TYPE, API_FFT), + "CUFFT_SUCCESS": ("HIPFFT_SUCCESS", CONV_NUMERIC_LITERAL, API_FFT), + "CUFFT_INVALID_PLAN": ("HIPFFT_INVALID_PLAN", CONV_NUMERIC_LITERAL, API_FFT), + "CUFFT_ALLOC_FAILED": ("HIPFFT_ALLOC_FAILED", CONV_NUMERIC_LITERAL, API_FFT), + "CUFFT_INVALID_TYPE": ("HIPFFT_INVALID_TYPE", CONV_NUMERIC_LITERAL, API_FFT), + "CUFFT_INVALID_VALUE": ("HIPFFT_INVALID_VALUE", CONV_NUMERIC_LITERAL, API_FFT), + "CUFFT_INTERNAL_ERROR": ("HIPFFT_INTERNAL_ERROR", CONV_NUMERIC_LITERAL, API_FFT), + "CUFFT_EXEC_FAILED": ("HIPFFT_EXEC_FAILED", 
CONV_NUMERIC_LITERAL, API_FFT), + "CUFFT_SETUP_FAILED": ("HIPFFT_SETUP_FAILED", CONV_NUMERIC_LITERAL, API_FFT), + "CUFFT_INVALID_SIZE": ("HIPFFT_INVALID_SIZE", CONV_NUMERIC_LITERAL, API_FFT), + "CUFFT_UNALIGNED_DATA": ("HIPFFT_UNALIGNED_DATA", CONV_NUMERIC_LITERAL, API_FFT), + "CUFFT_INCOMPLETE_PARAMETER_LIST": ("HIPFFT_INCOMPLETE_PARAMETER_LIST", CONV_NUMERIC_LITERAL, API_FFT), + "CUFFT_INVALID_DEVICE": ("HIPFFT_INVALID_DEVICE", CONV_NUMERIC_LITERAL, API_FFT), + "CUFFT_PARSE_ERROR": ("HIPFFT_PARSE_ERROR", CONV_NUMERIC_LITERAL, API_FFT), + "CUFFT_NO_WORKSPACE": ("HIPFFT_NO_WORKSPACE", CONV_NUMERIC_LITERAL, API_FFT), + "CUFFT_NOT_IMPLEMENTED": ("HIPFFT_NOT_IMPLEMENTED", CONV_NUMERIC_LITERAL, API_FFT), + "CUFFT_LICENSE_ERROR": ("HIPFFT_LICENSE_ERROR", CONV_NUMERIC_LITERAL, API_FFT, HIP_UNSUPPORTED), + "CUFFT_NOT_SUPPORTED": ("HIPFFT_NOT_SUPPORTED", CONV_NUMERIC_LITERAL, API_FFT), + "cufftType_t": ("hipfftType_t", CONV_TYPE, API_FFT), + "cufftType": ("hipfftType", CONV_TYPE, API_FFT), + "CUFFT_R2C": ("HIPFFT_R2C", CONV_NUMERIC_LITERAL, API_FFT), + "CUFFT_C2R": ("HIPFFT_C2R", CONV_NUMERIC_LITERAL, API_FFT), + "CUFFT_C2C": ("HIPFFT_C2C", CONV_NUMERIC_LITERAL, API_FFT), + "CUFFT_D2Z": ("HIPFFT_D2Z", CONV_NUMERIC_LITERAL, API_FFT), + "CUFFT_Z2D": ("HIPFFT_Z2D", CONV_NUMERIC_LITERAL, API_FFT), + "CUFFT_Z2Z": ("HIPFFT_Z2Z", CONV_NUMERIC_LITERAL, API_FFT), + "cufftCompatibility_t": ("hipfftCompatibility_t", CONV_TYPE, API_FFT, HIP_UNSUPPORTED), + "cufftCompatibility": ("hipfftCompatibility", CONV_TYPE, API_FFT, HIP_UNSUPPORTED), + "CUFFT_COMPATIBILITY_FFTW_PADDING": ("HIPFFT_COMPATIBILITY_FFTW_PADDING", CONV_NUMERIC_LITERAL, API_FFT, HIP_UNSUPPORTED), + "cufftReal": ("hipfftReal", CONV_TYPE, API_FFT), + "cufftDoubleReal": ("hipfftDoubleReal", CONV_TYPE, API_FFT), + "cufftComplex": ("hipfftComplex", CONV_TYPE, API_FFT), + "cufftDoubleComplex": ("hipfftDoubleComplex", CONV_TYPE, API_FFT), + "cufftHandle": ("hipfftHandle", CONV_TYPE, API_FFT), + "cufftPlan1d": ("hipfftPlan1d", CONV_MATH_FUNC, API_FFT), + "cufftPlan2d": ("hipfftPlan2d", CONV_MATH_FUNC, API_FFT), + "cufftPlan3d": ("hipfftPlan3d", CONV_MATH_FUNC, API_FFT), + "cufftPlanMany": ("hipfftPlanMany", CONV_MATH_FUNC, API_FFT), + "cufftMakePlan1d": ("hipfftMakePlan1d", CONV_MATH_FUNC, API_FFT), + "cufftMakePlan2d": ("hipfftMakePlan2d", CONV_MATH_FUNC, API_FFT), + "cufftMakePlan3d": ("hipfftMakePlan3d", CONV_MATH_FUNC, API_FFT), + "cufftMakePlanMany": ("hipfftMakePlanMany", CONV_MATH_FUNC, API_FFT), + "cufftMakePlanMany64": ("hipfftMakePlanMany64", CONV_MATH_FUNC, API_FFT), + "cufftGetSizeMany64": ("hipfftGetSizeMany64", CONV_MATH_FUNC, API_FFT), + "cufftEstimate1d": ("hipfftEstimate1d", CONV_MATH_FUNC, API_FFT), + "cufftEstimate2d": ("hipfftEstimate2d", CONV_MATH_FUNC, API_FFT), + "cufftEstimate3d": ("hipfftEstimate3d", CONV_MATH_FUNC, API_FFT), + "cufftEstimateMany": ("hipfftEstimateMany", CONV_MATH_FUNC, API_FFT), + "cufftCreate": ("hipfftCreate", CONV_MATH_FUNC, API_FFT), + "cufftGetSize1d": ("hipfftGetSize1d", CONV_MATH_FUNC, API_FFT), + "cufftGetSize2d": ("hipfftGetSize2d", CONV_MATH_FUNC, API_FFT), + "cufftGetSize3d": ("hipfftGetSize3d", CONV_MATH_FUNC, API_FFT), + "cufftGetSizeMany": ("hipfftGetSizeMany", CONV_MATH_FUNC, API_FFT), + "cufftGetSize": ("hipfftGetSize", CONV_MATH_FUNC, API_FFT), + "cufftSetWorkArea": ("hipfftSetWorkArea", CONV_MATH_FUNC, API_FFT), + "cufftSetAutoAllocation": ("hipfftSetAutoAllocation", CONV_MATH_FUNC, API_FFT), + "cufftExecC2C": ("hipfftExecC2C", CONV_MATH_FUNC, API_FFT), + "cufftExecR2C": ("hipfftExecR2C", 
CONV_MATH_FUNC, API_FFT),
+    "cufftExecC2R": ("hipfftExecC2R", CONV_MATH_FUNC, API_FFT),
+    "cufftExecZ2Z": ("hipfftExecZ2Z", CONV_MATH_FUNC, API_FFT),
+    "cufftExecD2Z": ("hipfftExecD2Z", CONV_MATH_FUNC, API_FFT),
+    "cufftExecZ2D": ("hipfftExecZ2D", CONV_MATH_FUNC, API_FFT),
+    "cufftSetStream": ("hipfftSetStream", CONV_MATH_FUNC, API_FFT),
+    "cufftDestroy": ("hipfftDestroy", CONV_MATH_FUNC, API_FFT),
+    "cufftGetVersion": ("hipfftGetVersion", CONV_MATH_FUNC, API_FFT),
+    "cufftGetProperty": ("hipfftGetProperty", CONV_MATH_FUNC, API_FFT, HIP_UNSUPPORTED),
 }
 
 CUDA_SPARSE_MAP = {
diff --git a/tools/amd_build/pyHIPIFY/hipify-python.py b/tools/amd_build/pyHIPIFY/hipify-python.py
index 2e2c44d4c36..8fec20a1aae 100755
--- a/tools/amd_build/pyHIPIFY/hipify-python.py
+++ b/tools/amd_build/pyHIPIFY/hipify-python.py
@@ -760,8 +760,8 @@ def preprocessor(filepath, stats, hipify_caffe2):
     output_source = processKernelLaunches(output_source, stats)
 
     # Disable asserts
-    if not filepath.endswith("THCGeneral.h.in"):
-        output_source = disable_asserts(output_source)
+    # if not filepath.endswith("THCGeneral.h.in"):
+    #     output_source = disable_asserts(output_source)
 
     # Replace std:: with non-std:: versions
     output_source = replace_math_functions(output_source)
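
The new `API_FFT` entries added to `CUDA_IDENTIFIER_MAP` follow the pyHIPIFY convention of mapping a CUDA identifier to a `(hip_name, conversion_kind, api, [HIP_UNSUPPORTED])` tuple. As a rough illustration of how a table in that shape is consumed, here is a minimal substitution sketch; the real `hipify-python.py` preprocessor does token-aware replacement and statistics bookkeeping, and `CONV_MATH_FUNC` below is a placeholder value rather than the real constant:

```python
# Placeholder constants mirroring the shape of pyHIPIFY/constants.py.
# CONV_MATH_FUNC's real value is not shown in this patch; API_FFT and
# HIP_UNSUPPORTED are both 43 after the constants.py hunk above.
CONV_MATH_FUNC = 0
API_FFT = 43
HIP_UNSUPPORTED = 43

# A two-entry excerpt of the CUDA_IDENTIFIER_MAP additions, in the same
# (hip_name, conversion_kind, api, [HIP_UNSUPPORTED]) tuple format.
CUDA_IDENTIFIER_MAP = {
    "cufftExecC2C": ("hipfftExecC2C", CONV_MATH_FUNC, API_FFT),
    "cufftGetProperty": ("hipfftGetProperty", CONV_MATH_FUNC, API_FFT, HIP_UNSUPPORTED),
}


def hipify_fft_calls(source):
    """Naive pass: rename supported cuFFT identifiers to their hipFFT names."""
    for cuda_name, meta in CUDA_IDENTIFIER_MAP.items():
        hip_name = meta[0]
        if len(meta) > 3:
            # A fourth tuple element marks HIP_UNSUPPORTED entries; the CUDA
            # spelling is left alone so the gap surfaces at compile time.
            continue
        source = source.replace(cuda_name, hip_name)
    return source


print(hipify_fft_calls("cufftExecC2C(plan, idata, odata, CUFFT_FORWARD);"))
# -> "hipfftExecC2C(plan, idata, odata, CUFFT_FORWARD);"
```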
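
On the test side, the patch gates CUDA coverage behind `TEST_WITH_ROCM` and a `skipIfRocm` decorator imported from `common`. Those helpers are not part of this diff; the sketch below shows one plausible way such a decorator is wired up (the environment-variable name and skip message are assumptions, not taken from this patch):

```python
import os
import unittest
from functools import wraps

# Assumption: the ROCm CI job sets an environment variable; the real flag and
# decorator live in test/common.py and may differ in detail.
TEST_WITH_ROCM = os.environ.get('PYTORCH_TEST_WITH_ROCM', '0') == '1'


def skipIfRocm(fn):
    """Skip the wrapped test when the suite runs against the ROCm/HIP backend."""
    @wraps(fn)
    def wrapper(*args, **kwargs):
        if TEST_WITH_ROCM:
            raise unittest.SkipTest("test does not currently pass on ROCm")
        return fn(*args, **kwargs)
    return wrapper
```

Passing `test_cuda=(not TEST_WITH_ROCM)` into the module and criterion test dicts has the same effect one level earlier: the `_cuda` variant is never generated in the first place rather than being generated and then skipped.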
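
The `add_test` change in test_nn.py swaps the inline half-precision lambda for a named `test_half` function so that a decorator can be attached to it. A condensed, stand-alone sketch of that registration pattern, reusing the `skipIfRocm` helper sketched above (`GeneratedCudaTests` and `add_cuda_variants` are illustrative stand-ins, not the real `common_nn` machinery):

```python
import unittest

import torch


class GeneratedCudaTests(unittest.TestCase):
    pass


def add(name, fn):
    setattr(GeneratedCudaTests, name, fn)


def add_cuda_variants(test, cuda_test_name, **kwargs):
    # The float and double variants stay as plain lambdas ...
    add(cuda_test_name + '_float', lambda self, test=test, kwargs=kwargs:
        test.test_cuda(self, dtype=torch.float, **kwargs))
    add(cuda_test_name + '_double', lambda self, test=test, kwargs=kwargs:
        test.test_cuda(self, dtype=torch.double, **kwargs))

    # ... while the half variant becomes a named function so @skipIfRocm can wrap it.
    @skipIfRocm
    def test_half(self, test=test, kwargs=kwargs):
        test.test_cuda(self, dtype=torch.half, **kwargs)

    if getattr(test, 'check_half', True):
        add(cuda_test_name + '_half', test_half)
```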
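
Many of the CUDA tests touched here are driven by `@repeat_test_for_types(ALL_TENSORTYPES)`, so dropping `torch.half` from that list under ROCm removes every half-precision pass in one place instead of per test. A simplified stand-in for that decorator (the real helper lives in the shared test utilities and may differ, for example by reporting each dtype separately):

```python
from functools import wraps

import torch


def repeat_test_for_types(dtypes):
    """Run the decorated test once per dtype in the given list."""
    def decorator(fn):
        @wraps(fn)
        def wrapper(self, *args, **kwargs):
            for dtype in dtypes:
                fn(self, *args, dtype=dtype, **kwargs)
        return wrapper
    return decorator


# With torch.half excluded under ROCm, a decorated test such as
# test_embedding_bag_cuda only performs its float and double passes.
ALL_TENSORTYPES = [torch.float, torch.double]
```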