Revert "Remove outdated CUDA 11 conditions (#154313)"
This reverts commit 3936e6141c.
Reverted https://github.com/pytorch/pytorch/pull/154313 on behalf of https://github.com/izaitsevfb due to breaks internal builds ([comment](https://github.com/pytorch/pytorch/pull/154313#issuecomment-2914230005))
This commit is contained in:
parent 6be829535f
commit 241f8dc84d
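
For readers skimming the diff below: every hunk restores a compile-time guard of the form CUDA_VERSION >= 11xxx (or a matching ROCM_VERSION check) that #154313 had removed. A minimal sketch of that guard pattern, written as a standalone illustration rather than an excerpt of any file touched here (the name kHasFp8 is made up):

// Illustrative only. CUDA_VERSION comes from cuda.h and encodes major*1000 + minor*10,
// so 11080 means CUDA 11.8.
#include <cstdio>

#if defined(CUDA_VERSION) && CUDA_VERSION >= 11080
// Toolkit is new enough for the FP8-era APIs; compile the full path.
constexpr bool kHasFp8 = true;
#else
// Older toolkit (or a build without CUDA headers): fall back.
constexpr bool kHasFp8 = false;
#endif

int main() {
  std::printf("FP8 path compiled in: %d\n", kHasFp8);
  return 0;
}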
@@ -78,7 +78,7 @@ inline cudaDataType ScalarTypeToCudaDataType(const c10::ScalarType& scalar_type)
return CUDA_R_64I;
case c10::ScalarType::BFloat16:
return CUDA_R_16BF;
#if defined(CUDA_VERSION) || (defined(USE_ROCM) && ROCM_VERSION >= 60300)
#if (defined(CUDA_VERSION) && CUDA_VERSION >= 11080) || (defined(USE_ROCM) && ROCM_VERSION >= 60300)
case c10::ScalarType::Float8_e4m3fn:
return CUDA_R_8F_E4M3;
case c10::ScalarType::Float8_e5m2:
@@ -139,7 +139,7 @@ void CUDAGraph::capture_end() {
// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH_1g1accfe1da0c605a577c22d9751a09597
// cudaGraphInstantiateWithFlags
// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH_1ga2c652a24ba93e52b99a47bec0888233
#if (defined(CUDA_VERSION) || (defined(USE_ROCM) && ROCM_VERSION >= 60200))
#if ((defined(CUDA_VERSION) && CUDA_VERSION >= 11040) || (defined(USE_ROCM) && ROCM_VERSION >= 60200))
int version = 0;
AT_CUDA_CHECK(cudaDriverGetVersion(&version));
if (version < 11040) {

@@ -154,7 +154,7 @@ void CUDAGraph::capture_end() {
#endif
//Since ROCm 6.2, we want to go down this path as hipGraphExecDestroy in the destructor will not immediately free the memory.
//It will wait for the next sync operation. cudaGraphInstantiateFlagAutoFreeOnLaunch will add async frees after graph launch.
#if (defined(CUDA_VERSION) || (defined(USE_ROCM) && ROCM_VERSION >= 60200))
#if ((defined(CUDA_VERSION) && CUDA_VERSION >= 11040) || (defined(USE_ROCM) && ROCM_VERSION >= 60200))
} else {
AT_CUDA_CHECK(cudaGraphInstantiateWithFlags(&graph_exec_,
graph_,

@@ -216,7 +216,7 @@ void CUDAGraph::enable_debug_mode() {
}

void CUDAGraph::debug_dump(const std::string& debug_path) {
#if defined(CUDA_VERSION) || defined(USE_ROCM)
#if (defined(CUDA_VERSION) && CUDA_VERSION >= 11030)|| defined(USE_ROCM)
if (_cuda_graphs_debug) {
TORCH_WARN("DEBUG: calling debug_dump()");
if (has_graph_) {
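
The two capture_end() hunks above restore a pair of checks: a compile-time CUDA_VERSION >= 11040 guard plus a runtime cudaDriverGetVersion() query before calling cudaGraphInstantiateWithFlags. A self-contained sketch of that pattern, with made-up names and simplified error handling (not the actual CUDAGraph code):

// Hypothetical helper: prefer the flags API (auto-free-on-launch) only when both
// the toolkit used at build time and the driver found at run time are >= 11.4.
#include <cuda_runtime.h>

cudaError_t instantiate_graph(cudaGraphExec_t* exec, cudaGraph_t graph) {
#if defined(CUDA_VERSION) && CUDA_VERSION >= 11040
  int driver_version = 0;
  cudaError_t err = cudaDriverGetVersion(&driver_version);
  if (err != cudaSuccess) {
    return err;
  }
  if (driver_version >= 11040) {
    // Device memory owned by the graph is freed asynchronously after each launch.
    return cudaGraphInstantiateWithFlags(
        exec, graph, cudaGraphInstantiateFlagAutoFreeOnLaunch);
  }
#endif
  // Toolkit or driver too old for the flags API; let the caller decide what to do.
  return cudaErrorNotSupported;
}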
@@ -117,11 +117,15 @@ constexpr const char* _cusolver_backend_suggestion = \
"linear algebra operators with other supported backends. " \
"See https://pytorch.org/docs/stable/backends.html#torch.backends.cuda.preferred_linalg_library";

// When cuda < 11.5, cusolver raises CUSOLVER_STATUS_EXECUTION_FAILED when input contains nan.
// When cuda >= 11.5, cusolver normally finishes execution and sets info array indicating convergence issue.
#define TORCH_CUSOLVER_CHECK(EXPR) \
do { \
cusolverStatus_t __err = EXPR; \
if (__err == CUSOLVER_STATUS_INVALID_VALUE) { \
if ((CUDA_VERSION < 11500 && \
__err == CUSOLVER_STATUS_EXECUTION_FAILED) || \
(CUDA_VERSION >= 11500 && \
__err == CUSOLVER_STATUS_INVALID_VALUE)) { \
TORCH_CHECK_LINALG( \
false, \
"cusolver error: ", \
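
The restored branch above makes the error check depend on which status code cusolver reports for bad input on a given toolkit. For reference, a typical call site simply wraps each raw cusolver call in the macro; this is a hypothetical snippet assuming a TORCH_CUSOLVER_CHECK-style macro is in scope, not an excerpt of the file:

// Hypothetical usage of the version-aware check macro.
#include <cusolverDn.h>

void create_and_destroy_handle() {
  cusolverDnHandle_t handle = nullptr;
  TORCH_CUSOLVER_CHECK(cusolverDnCreate(&handle));   // raises a readable error on failure
  // ... enqueue cusolverDn* work on `handle` here ...
  TORCH_CUSOLVER_CHECK(cusolverDnDestroy(handle));
}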
@@ -291,7 +291,7 @@ inline void inclusive_scan(InputIteratorT input, OutputIteratorT output, ScanOpT
#endif
}

# if defined(CUDA_VERSION) || defined(USE_ROCM)
# if (defined(CUDA_VERSION) && CUDA_VERSION > 11040) || defined(USE_ROCM)

template<typename T>
struct BlockPrefixCallbackOp
@@ -146,8 +146,10 @@ nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog,
NVRTC_STUB1(nvrtcDestroyProgram, nvrtcProgram *)
NVRTC_STUB2(nvrtcGetPTXSize, nvrtcProgram, size_t *)
NVRTC_STUB2(nvrtcGetPTX, nvrtcProgram, char *)
#if defined(CUDA_VERSION) && CUDA_VERSION >= 11010
NVRTC_STUB2(nvrtcGetCUBINSize, nvrtcProgram, size_t *)
NVRTC_STUB2(nvrtcGetCUBIN, nvrtcProgram, char *)
#endif
NVRTC_STUB3(nvrtcCompileProgram, nvrtcProgram, int, const char * const *)
_STUB_1(NVRTC, nvrtcGetErrorString, const char *, nvrtcResult)
NVRTC_STUB2(nvrtcGetProgramLogSize,nvrtcProgram, size_t*)

@@ -76,7 +76,7 @@ namespace at::cuda {
AT_FORALL_NVRTC_BASE(_)
#endif

#if defined(CUDA_VERSION)
#if defined(CUDA_VERSION) && CUDA_VERSION >= 11010
#define AT_FORALL_NVRTC(_) \
AT_FORALL_NVRTC_EXTENDED(_) \
_(nvrtcGetCUBINSize) \
@@ -359,7 +359,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
bool is_float_output_with_half_input = (scalar_type == at::ScalarType::Half || scalar_type == at::ScalarType::BFloat16) && result.scalar_type() == at::ScalarType::Float;
c10::MaybeOwned<Tensor> self_;
if (&result != &self) {
#if defined(CUDA_VERSION) || defined(USE_ROCM)
#if (defined(CUDA_VERSION) && (CUDA_VERSION >= 11040)) || defined(USE_ROCM)
// Strangely, if mat2 has only 1 row or column, we get
// CUBLAS_STATUS_INVALID_VALUE error from cublasLtMatmulAlgoGetHeuristic.
// self.dim() == 1 && result.dim() == 2 && self.sizes()[0] == mat2_sizes[1]

@@ -495,6 +495,15 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
}
#else
auto activation_epilogue = activation_to_gemm_and_blas_arg(activation);
#if (defined(CUDA_VERSION) && (CUDA_VERSION < 11080))
// GELU is not supported (and does not compile!) prior
// to CUDA 11.4. Have observed accuracy issues with
// GELU epilogue in 11.4; disabling the GELU epilogue
// path for CUDA version < 11.8.
if (activation == Activation::GELU)
activation_epilogue = cuda::blas::GEMMAndBiasActivationEpilogue::None;
#endif

bool okay = true;
if (is_float_output_with_half_input) {
AT_DISPATCH_REDUCED_FLOATING_TYPES(

@@ -637,7 +646,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
// gating activation_to_gemm_and_blas_arg above; here we are manually
// performing a post-GELU because we weren't able to use the GELU
// epilogue above.
#if !defined(CUDA_VERSION) && !defined(USE_ROCM)
#if !(defined(CUDA_VERSION) && CUDA_VERSION >= 11080) && !defined(USE_ROCM)
if (useLtInterface && activation == Activation::GELU) {
at::gelu_(const_cast<Tensor&>(*args.result), "tanh");
}

@@ -1008,7 +1017,7 @@ Tensor& _int_mm_out_cuda(const Tensor& self, const Tensor& mat2, Tensor& result)

TORCH_CHECK(result.is_contiguous(), "Expected result to be contiguous.");

#if defined(CUDA_VERSION) || defined(USE_ROCM)
#if (defined(CUDA_VERSION) && (CUDA_VERSION >= 11070)) || defined(USE_ROCM)
cublasCommonArgs args(self, mat2, result);

at::cuda::blas::int8_gemm(
@@ -2,7 +2,7 @@
#include <ATen/core/Tensor.h>
#include <ATen/cuda/CUDAUtils.h>

#if defined(USE_ROCM) || defined(_MSC_VER)
#if defined(USE_ROCM) || defined(_MSC_VER) || (defined(CUDA_VERSION) && CUDA_VERSION < 11080)
// Doesn't work on ROCm or Windows yet
// TODO: Add compiler warning? Add PyTorch config flag?
#else

@@ -20,7 +20,7 @@
#include <ATen/native/cuda/cutlass_extensions/gemm/threadblock/default_mma.h>
#endif

#if defined(USE_ROCM) || defined(_MSC_VER)
#if defined(USE_ROCM) || defined(_MSC_VER) || (defined(CUDA_VERSION) && CUDA_VERSION < 11080)
// Doesn't work on ROCm or Windows yet
#else
#define CUTLASS_STATUS_CHECK(status) \

@@ -32,7 +32,7 @@

namespace at::native {

#if defined(USE_ROCM) || defined(_MSC_VER)
#if defined(USE_ROCM) || defined(_MSC_VER) || (defined(CUDA_VERSION) && CUDA_VERSION < 11080)
// Doesn't work on ROCm or Windows yet or old compiler
#else
template<typename ElementInputA, typename ElementInputB, typename EpilogueTag>

@@ -198,7 +198,7 @@ _mixed_dtypes_linear(const Tensor& input, const Tensor& weight,
const Tensor& scale,
const std::optional<Tensor>& bias_opt,
const std::optional<std::string_view> activation_opt) {
#if defined(USE_ROCM) || defined(_MSC_VER)
#if defined(USE_ROCM) || defined(_MSC_VER) || (defined(CUDA_VERSION) && CUDA_VERSION < 11080)
TORCH_CHECK(false, "_mixed_dtypes_linear: not compiled for this platform");
return Tensor{};
#else
@@ -300,7 +300,7 @@ void nonzero_static_cuda_out_impl(
int64_t size,
int64_t fill_value,
Tensor& out) {
#if defined(CUDA_VERSION) || defined(USE_ROCM)
# if (defined(CUDA_VERSION) && CUDA_VERSION > 11040) || defined(USE_ROCM)

Tensor self_contiguous_ = self.contiguous();
// see comment in nonzero_cuda_out_impl on reqs for out
@@ -17,7 +17,7 @@ void addcmul_cuda_scalar_tensor2_kernel(
const Scalar& value
);

#if AT_USE_JITERATOR()
#if AT_USE_JITERATOR() && CUDA_VERSION >= 11050
constexpr char addcmul_name[] = "addcmul";
#endif
void addcmul_cuda_kernel(TensorIteratorBase& iter, const Scalar& value) {

@@ -37,7 +37,10 @@ void addcmul_cuda_kernel(TensorIteratorBase& iter, const Scalar& value) {

auto dtype = iter.common_dtype();
if (at::isComplexType(dtype)) {
#if AT_USE_JITERATOR()
// When using Jiterator, addcmul and addcdiv kernels get stuck during a
// promotion test on CUDA 11.3, so only enable that from CUDA 11.5:
// https://github.com/pytorch/pytorch/pull/74234#issuecomment-1100932209
#if AT_USE_JITERATOR() && CUDA_VERSION >= 11050
AT_DISPATCH_COMPLEX_TYPES(dtype, "addcmul_cuda", [&]() {
auto alpha = value.to<scalar_t>();
static const auto addcmul_string = jiterator_stringify(

@@ -90,14 +93,17 @@ void addcmul_cuda_kernel(TensorIteratorBase& iter, const Scalar& value) {
}
}

#if AT_USE_JITERATOR()
#if AT_USE_JITERATOR() && CUDA_VERSION >= 11050
constexpr char addcmul_scalar_tensor2_name[] = "addcmul_scalar_tensor2";
#endif
void addcmul_cuda_scalar_tensor2_kernel(TensorIteratorBase& iter, const Scalar& scalar_tensor2, const Scalar& value) {
auto dtype = iter.common_dtype();

if (at::isComplexType(dtype)) {
#if AT_USE_JITERATOR()
// When using Jiterator, addcmul and addcdiv kernels get stuck during a
// promotion test on CUDA 11.3, so only enable that from CUDA 11.5:
// https://github.com/pytorch/pytorch/pull/74234#issuecomment-1100932209
#if AT_USE_JITERATOR() && CUDA_VERSION >= 11050
AT_DISPATCH_COMPLEX_TYPES(dtype, "addcmul_cuda", [&]() {
auto c = scalar_tensor2.to<scalar_t>();
auto alpha = value.to<scalar_t>();

@@ -139,14 +145,17 @@ void addcmul_cuda_scalar_tensor2_kernel(TensorIteratorBase& iter, const Scalar&
}
}

#if AT_USE_JITERATOR()
#if AT_USE_JITERATOR() && CUDA_VERSION >= 11050
// return a + alpha * (b / static_cast<accscalar_t>(c));
constexpr char addcdiv_name[] = "addcdiv";
#endif
void addcdiv_cuda_kernel(TensorIteratorBase& iter, const Scalar& value) {
auto dtype = iter.common_dtype();
if (at::isComplexType(dtype)) {
#if AT_USE_JITERATOR()
// When using Jiterator, addcmul and addcdiv kernels get stuck during a
// promotion test on CUDA 11.3, so only enable that from CUDA 11.5:
// https://github.com/pytorch/pytorch/pull/74234#issuecomment-1100932209
#if AT_USE_JITERATOR() && CUDA_VERSION >= 11050
AT_DISPATCH_COMPLEX_TYPES(dtype, "addcdiv_cuda", [&]() {
auto alpha = value.to<scalar_t>();
static const auto addcdiv_string =
@@ -453,7 +453,7 @@ void scan_dim(const TensorBase& self, const TensorBase& result,
if (self.numel() == self.size(dim)) {
if constexpr (std::is_same_v<BinaryFunction, std::plus<scalar_t>>) {
if (C10_UNLIKELY(at::globalContext().deterministicAlgorithms()) && (self.is_floating_point() || self.is_complex())) {
#if defined(CUDA_VERSION) || defined(USE_ROCM)
# if (defined(CUDA_VERSION) && CUDA_VERSION > 11040) || defined(USE_ROCM)
cuda::cub::inclusive_deterministic_scan(self_->const_data_ptr<scalar_t>(), result.mutable_data_ptr<scalar_t>(), binary_op, self.numel());
#else
globalContext().alertNotDeterministic("cumsum_cuda_kernel");
@@ -193,7 +193,9 @@ __device__ inline void bitonicSortKeys(
// dimension as the innermost dim, such that we can get the particular slice for
// a Tensor via its linear block dimension * the slice size.
template <typename T, unsigned int Power2Size>
#if defined(CUDA_VERSION) && CUDA_VERSION >= 11070
__launch_bounds__(1024, 1)
#endif
__global__ void compute_mode(
const T* input,
at::cuda::detail::TensorInfo<T, unsigned int> values,
@@ -912,6 +912,10 @@ void codegenOutputQuery(
compile_to_sass = true;
}

#if defined(CUDA_VERSION) && CUDA_VERSION < 11010
// compile to sass is not allowed prior to CUDA 11.1
compile_to_sass = false;
#endif
#endif
}

@@ -1620,7 +1624,7 @@ NvrtcFunction jit_pwise_function(

size_t ptx_size = 0;
std::vector<char> ptx;
#if !defined(USE_ROCM)
#if defined(CUDA_VERSION) && CUDA_VERSION >= 11010
// compile_to_sass determines whether we are generating SASS or PTX, hence
// the different API.
const auto getSize = compile_to_sass
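
The jit_pwise_function hunk above restores the CUDA 11.1 gate around the SASS (CUBIN) retrieval path; the surrounding code picks the size/fetch entry points based on compile_to_sass. A standalone sketch of that selection using the public NVRTC API directly rather than PyTorch's stubs (function and variable names are made up, and nvrtcResult error handling is omitted for brevity):

// Hypothetical sketch: fetch either SASS (CUBIN) or PTX from a compiled program.
#include <cuda.h>
#include <nvrtc.h>
#include <vector>

std::vector<char> fetch_compiled_code(nvrtcProgram prog, bool compile_to_sass) {
  size_t size = 0;
  std::vector<char> code;
#if defined(CUDA_VERSION) && CUDA_VERSION >= 11010
  if (compile_to_sass) {
    // nvrtcGetCUBIN/nvrtcGetCUBINSize only exist from CUDA 11.1 onward.
    nvrtcGetCUBINSize(prog, &size);
    code.resize(size);
    nvrtcGetCUBIN(prog, code.data());
    return code;
  }
#endif
  // Older toolkits (or compile_to_sass == false): fall back to PTX.
  nvrtcGetPTXSize(prog, &size);
  code.resize(size);
  nvrtcGetPTX(prog, code.data());
  return code;
}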
@@ -4,7 +4,7 @@
#include <ATen/autocast_mode.h>
#include <c10/cuda/CUDAGuard.h>

#if defined(USE_ROCM) || defined(_MSC_VER)
#if defined(USE_ROCM) || defined(_MSC_VER) || (defined(CUDA_VERSION) && CUDA_VERSION < 11080)
#else
#include <ATen/native/sparse/cuda/ComputeSparseTile.h>
#include <ATen/native/sparse/cuda/SparseSemiStructuredPack.h>

@@ -12,7 +12,7 @@

namespace at::native {

#if defined(USE_ROCM) || defined(_MSC_VER)
#if defined(USE_ROCM) || defined(_MSC_VER) || (defined(CUDA_VERSION) && CUDA_VERSION < 11080)
#else
struct Params {
uint64_t const* threads_masks;

@@ -123,7 +123,7 @@ Tensor _sparse_semi_structured_apply_dense(
const Tensor& input,
const Tensor& threads_masks) {

#if defined(USE_ROCM) || defined(_MSC_VER)
#if defined(USE_ROCM) || defined(_MSC_VER) || (defined(CUDA_VERSION) && CUDA_VERSION < 11080)
TORCH_CHECK(false, "_sparse_semi_structured_apply_dense: not supported");
return Tensor{};
#else
@@ -3,7 +3,7 @@
#include <ATen/cuda/CUDAUtils.h>
#include <ATen/Dispatch.h>

#if defined(USE_ROCM) || defined(_MSC_VER)
#if defined(USE_ROCM) || defined(_MSC_VER) || (defined(CUDA_VERSION) && CUDA_VERSION < 11080)
#else
#include <cuda_runtime.h>
#include <cutlass/cutlass.h>

@@ -16,7 +16,7 @@
#include <type_traits>
#include <tuple>

#if defined(USE_ROCM) || defined(_MSC_VER)
#if defined(USE_ROCM) || defined(_MSC_VER) || (defined(CUDA_VERSION) && CUDA_VERSION < 11080)
#else
#define CUTLASS_STATUS_CHECK(status) \
{ \

@@ -31,7 +31,7 @@ namespace {

namespace at::native {

#if defined(USE_ROCM) || defined(_MSC_VER)
#if defined(USE_ROCM) || defined(_MSC_VER) || (defined(CUDA_VERSION) && CUDA_VERSION < 11080)
#else
// Wrapper function for CUTLASS sparse GEMM implementation, used
// solely to simplify dispatching from

@@ -613,7 +613,7 @@ Tensor _sparse_semi_structured_linear(
"removed in a future PyTorch release. Please use "
"_sparse_semi_structured_mm/_sparse_semi_structured_addmm "
"instead.");
#if defined(USE_ROCM) || defined(_MSC_VER)
#if defined(USE_ROCM) || defined(_MSC_VER) || (defined(CUDA_VERSION) && CUDA_VERSION < 11080)
TORCH_CHECK(false, "_sparse_semi_structured_linear: CUTLASS not supported");
return Tensor{};
#else
@@ -3,7 +3,7 @@
#include <ATen/cuda/CUDAUtils.h>
#include <ATen/Dispatch.h>

#if defined(USE_ROCM) || defined(_MSC_VER)
#if defined(USE_ROCM) || defined(_MSC_VER) || (defined(CUDA_VERSION) && CUDA_VERSION < 11080)
#else
#include <cuda_runtime.h>
#include <cutlass/cutlass.h>

@@ -16,7 +16,7 @@
#include <type_traits>
#include <tuple>

#if defined(USE_ROCM) || defined(_MSC_VER)
#if defined(USE_ROCM) || defined(_MSC_VER) || (defined(CUDA_VERSION) && CUDA_VERSION < 11080)
#else
#define CUTLASS_STATUS_CHECK(status) \
{ \

@@ -28,7 +28,7 @@

namespace at::native {

#if defined(USE_ROCM) || defined(_MSC_VER)
#if defined(USE_ROCM) || defined(_MSC_VER) || (defined(CUDA_VERSION) && CUDA_VERSION < 11080)
#else
// Wrapper function for CUTLASS sparse GEMM implementation, used
// solely to simplify dispatching from

@@ -526,7 +526,7 @@ Tensor sparse_semi_structured_mad_op(
const Tensor& mat1, const Tensor& mat1_meta, const Tensor& mat2,
const std::optional<Tensor>& input_opt, const Scalar& alpha,
const Scalar& beta, const std::optional<c10::ScalarType> out_dtype_opt) {
#if defined(USE_ROCM) || defined(_MSC_VER)
#if defined(USE_ROCM) || defined(_MSC_VER) || (defined(CUDA_VERSION) && CUDA_VERSION < 11080)
TORCH_CHECK(false, __func__, " : CUTLASS not supported");
return Tensor{};
#else

@@ -818,7 +818,7 @@ Tensor _sparse_semi_structured_addmm(
// Following is just for testing purposes.
namespace at::native {

#if defined(USE_ROCM) || defined(_MSC_VER)
#if defined(USE_ROCM) || defined(_MSC_VER) || (defined(CUDA_VERSION) && CUDA_VERSION < 11080)
#else
// Copied from tools/util/include/host_reorder.h, from CUTLASS source
// tree. This is for simplicity - namely, this file is not under

@@ -856,7 +856,7 @@ static void reorder_meta(cutlass::TensorRef<Element, LayoutDest> dest,

std::tuple<Tensor, Tensor>
_to_sparse_semi_structured(const Tensor& dense) {
#if defined(USE_ROCM) || defined(_MSC_VER)
#if defined(USE_ROCM) || defined(_MSC_VER) || (defined(CUDA_VERSION) && CUDA_VERSION < 11080)
TORCH_CHECK(false, __func__, " : CUTLASS not supported");
return std::make_tuple(Tensor{}, Tensor{});
#else
@@ -8,7 +8,7 @@
#include <ATen/cuda/CUDAUtils.h>
#include <ATen/Dispatch.h>

#if defined(USE_ROCM) || defined(_MSC_VER)
#if defined(USE_ROCM) || defined(_MSC_VER) || (defined(CUDA_VERSION) && CUDA_VERSION < 11080)
#else
#include <ATen/native/sparse/cuda/ComputeSparseTile.h>
#include <ATen/native/sparse/cuda/SparseSemiStructuredPack.h>

@@ -17,7 +17,7 @@

namespace at::native {

#if defined(USE_ROCM) || defined(_MSC_VER)
#if defined(USE_ROCM) || defined(_MSC_VER) || (defined(CUDA_VERSION) && CUDA_VERSION < 11080)
#else
struct MetadataCuSparseLt {
// Format used by cuSparseLt

@@ -280,7 +280,7 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor> _sparse_semi_structured_tile(
std::string_view algorithm,
bool use_cutlass)
{
#if defined(USE_ROCM) || defined(_MSC_VER)
#if defined(USE_ROCM) || defined(_MSC_VER) || (defined(CUDA_VERSION) && CUDA_VERSION < 11080)
TORCH_CHECK(false, "_sparse_semi_structured_tile: not supported");
return std::make_tuple(Tensor{}, Tensor{}, Tensor{}, Tensor{}, Tensor{});
#else
@@ -5,14 +5,14 @@
#include <c10/cuda/CUDAGuard.h>
#include <c10/util/accumulate.h>

#if defined(USE_ROCM) || defined(_MSC_VER)
#if defined(USE_ROCM) || defined(_MSC_VER) || (defined(CUDA_VERSION) && CUDA_VERSION < 11080)
#else
#include <ATen/native/sparse/cuda/SparseSemiStructuredPack.h>
#endif

namespace at::native {

#if defined(USE_ROCM) || defined(_MSC_VER)
#if defined(USE_ROCM) || defined(_MSC_VER) || (defined(CUDA_VERSION) && CUDA_VERSION < 11080)
#else
template <typename KT>
__global__ void __launch_bounds__(32 /* num_threads */)

@@ -89,7 +89,7 @@ std::tuple<Tensor, Tensor> _sparse_semi_structured_apply_typed(Tensor input, Ten

std::tuple<Tensor, Tensor> _sparse_semi_structured_apply(const Tensor& input, const Tensor& threads_masks) // Returned by `_sparse_semi_structured_tile`
{
#if defined(USE_ROCM) || defined(_MSC_VER)
#if defined(USE_ROCM) || defined(_MSC_VER) || (defined(CUDA_VERSION) && CUDA_VERSION < 11080)
TORCH_CHECK(false, "_sparse_semi_structured_apply: not supported");
return std::make_tuple(Tensor{}, Tensor{});
#else
@@ -125,7 +125,7 @@ FusedKernelCUDA::FusedKernelCUDA(
args.push_back("-hip-pch");
#else
const std::string compute = std::string("--gpu-architecture=") +
#if defined(CUDA_VERSION)
#if defined(CUDA_VERSION) && CUDA_VERSION >= 11010
// CUDA 11.1 allows going directly to SASS (sm_) instead of PTX (compute_)
// which gives better backwards compatibility to work on older driver,
// (since older driver doesn't necessrily recognize PTX emitted by new

@@ -156,7 +156,7 @@ FusedKernelCUDA::FusedKernelCUDA(
[&] { AT_CUDA_NVRTC_CHECK(nvrtc().nvrtcDestroyProgram(&program)); });
AT_CUDA_NVRTC_CHECK(result);
size_t ptx_size = 0;
#if !defined(USE_ROCM)
#if defined(CUDA_VERSION) && CUDA_VERSION >= 11010
// compile_to_sass determines whether we are generating SASS or PTX, hence
// the different API.
const auto getSize = compile_to_sass
|
@ -1286,7 +1286,7 @@ void CudaCodeGen::CompileToNVRTC(
|
|||
args.push_back("-hip-pch");
|
||||
#else
|
||||
const std::string compute = std::string("--gpu-architecture=") +
|
||||
#if defined(CUDA_VERSION)
|
||||
#if defined(CUDA_VERSION) && CUDA_VERSION >= 11010
|
||||
// CUDA 11.1 allows going directly to SASS (sm_) instead of PTX (compute_)
|
||||
// which gives better backwards compatibility to work on older driver,
|
||||
// (since older driver doesn't necessarily recognize PTX emitted by new
|
||||
|
|
@ -1321,7 +1321,7 @@ void CudaCodeGen::CompileToNVRTC(
|
|||
AT_CUDA_NVRTC_CHECK(result);
|
||||
size_t ptx_size = 0;
|
||||
std::vector<char> ptx;
|
||||
#if !defined(USE_ROCM)
|
||||
#if defined(CUDA_VERSION) && CUDA_VERSION >= 11010
|
||||
// compile_to_sass determines whether we are generating SASS or PTX, hence
|
||||
// the different API.
|
||||
auto getSize = compile_to_sass
|
||||
|
|
|
|||