Mirror of https://github.com/zebrajr/pytorch.git, synced 2025-12-06 12:20:52 +01:00
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/57034

Resolves gh-38915

For the example given in the issue, BatchNorm1d on cuDNN is around 12x slower than BatchNorm2d. Internally, cuDNN expects at least a 4d tensor (N, C, H, W), so these two modules actually call the same cuDNN code. My assumption is that cuDNN just isn't optimized for H=W=1.

Instead, this disables cuDNN for 2d batch_norm inputs and improves the CUDA implementation of `native_batch_norm` to be competitive with cuDNN. For the example in the issue, `BatchNorm1d` now takes 335 us compared to 6.3 ms before, an 18x speedup.

Before this change, nvprof shows:

```
            Type  Time(%)      Time  Calls       Avg       Min       Max  Name
 GPU activities:  99.64%  630.95ms    100  6.3095ms  5.6427ms  8.8800ms  void cudnn::bn_fw_tr_1C11_kernel_NCHW<float, float, int=512, bool=0, int=2>(cudnnTensorStruct, float const *, cudnn::bn_fw_tr_1C11_kernel_NCHW<float, float, int=512, bool=0, int=2>, cudnnTensorStruct*, float const *, float const , cudnnTensorStruct*, cudnnTensorStruct*, cudnnTensorStruct**, float const *, float const *, float const *, cudnnTensorStruct*, cudnnTensorStruct*)
```

But after, it shows:

```
            Type  Time(%)      Time  Calls       Avg       Min       Max  Name
 GPU activities:  54.76%  14.352ms    100  143.52us  123.52us  756.28us  _ZN2at6native27unrolled_elementwise_kernelIZZZNS0_72_GLOBAL__N__48_tmpxft_001e82d0_00000000_7_Normalization_cpp1_ii_db66e07022batch_norm_elementwiseERKNS_6TensorES5_RKN3c108optionalIS3_EESA_S5_S5_ENKUlvE_clEvENKUlvE2_clEvEUlfffffE_NS_6detail5ArrayIPcLi6EEE16OffsetCalculatorILi5EjESI_ILi1EjENS0_6memory15LoadWithoutCastENSL_16StoreWithoutCastEEEviT_T0_T1_T2_T3_T4_
                  35.09%  9.1951ms    100  91.950us  84.415us  362.17us  void at::native::reduce_kernel<int=256, int=2, at::native::ReduceOp<float, at::native::WelfordOps<float, float, int, float, thrust::pair<float, float>>, unsigned int, float, int=2>>(float)
                   0.71%  186.14us    100  1.8610us  1.8240us  1.9840us  _ZN2at6native72_GLOBAL__N__48_tmpxft_001e82d0_00000000_7_Normalization_cpp1_ii_db66e07045unrolled_elementwise_kernel_for_multi_outputsILi3EZZZNS1_34batch_norm_update_stats_and_invertERKNS_6TensorES5_S5_S5_ddlENKUlvE_clEvENKUlvE2_clEvEUlffffE_NS_6detail5ArrayIPcLi7EEE23TrivialOffsetCalculatorILi4EjESD_ILi3EjEEEviT0_T1_T2_T3_
                   0.59%  153.37us    100  1.5330us  1.4720us  2.6240us  void at::native::vectorized_elementwise_kernel<int=4, at::native::BUnaryFunctor<at::native::AddFunctor<long>>, at::detail::Array<char*, int=2>>(int, long, at::native::AddFunctor<long>)
```

I think there is similar scope to improve the backward implementation.

Test Plan: Imported from OSS

Reviewed By: anjali411

Differential Revision: D28142447

Pulled By: ngimel

fbshipit-source-id: c70109780e206fa85e50a31e90a1cb4c533199da
436 lines · 15 KiB · C++
#ifndef C10_MACROS_MACROS_H_
#define C10_MACROS_MACROS_H_

/* Main entry for c10/macros.
 *
 * In your code, include c10/macros/Macros.h directly, instead of individual
 * files in this folder.
 */

// For build systems that do not directly depend on CMake and directly build
// from the source directory (such as Buck), one may not have a cmake_macros.h
// file at all. In this case, the build system is responsible for providing
// correct macro definitions corresponding to the cmake_macros.h.in file.
//
// In such scenarios, one should define the macro
// C10_USING_CUSTOM_GENERATED_MACROS
// to inform this header that it does not need to include the cmake_macros.h
// file.

#ifndef C10_USING_CUSTOM_GENERATED_MACROS
#include <c10/macros/cmake_macros.h>
#endif // C10_USING_CUSTOM_GENERATED_MACROS

#include <c10/macros/Export.h>

#if defined(__clang__)
#define __ubsan_ignore_float_divide_by_zero__ \
  __attribute__((no_sanitize("float-divide-by-zero")))
#define __ubsan_ignore_undefined__ __attribute__((no_sanitize("undefined")))
#define __ubsan_ignore_signed_int_overflow__ \
  __attribute__((no_sanitize("signed-integer-overflow")))
#else
#define __ubsan_ignore_float_divide_by_zero__
#define __ubsan_ignore_undefined__
#define __ubsan_ignore_signed_int_overflow__
#endif

// Detect address sanitizer as some stuff doesn't work with it
#undef C10_ASAN_ENABLED

// for clang
#if defined(__has_feature)
#if ((__has_feature(address_sanitizer)))
#define C10_ASAN_ENABLED 1
#endif
#endif

// for gcc
#if defined(__SANITIZE_ADDRESS__)
#if __SANITIZE_ADDRESS__
#if !defined(C10_ASAN_ENABLED)
#define C10_ASAN_ENABLED 1
#endif
#endif
#endif

#if !defined(C10_ASAN_ENABLED)
#define C10_ASAN_ENABLED 0
#endif

// Disable the copy and assignment operator for a class. Note that this will
// disable the usage of the class in std containers.
#define C10_DISABLE_COPY_AND_ASSIGN(classname) \
  classname(const classname&) = delete;        \
  classname& operator=(const classname&) = delete
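
// Usage sketch (illustrative, not part of the upstream header; `Registry`
// is a hypothetical class):
//
//   class Registry {
//    public:
//     Registry() = default;
//     C10_DISABLE_COPY_AND_ASSIGN(Registry);
//   };
//
//   Registry a;
//   Registry b = a; // error: use of deleted copy constructor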

#define C10_CONCATENATE_IMPL(s1, s2) s1##s2
#define C10_CONCATENATE(s1, s2) C10_CONCATENATE_IMPL(s1, s2)

#define C10_MACRO_EXPAND(args) args

#define C10_STRINGIZE_IMPL(x) #x
#define C10_STRINGIZE(x) C10_STRINGIZE_IMPL(x)
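
// Usage sketch (illustrative): the _IMPL indirection forces the argument to
// be macro-expanded before it is stringized or pasted:
//
//   C10_STRINGIZE(__LINE__)      // yields the expanded line number, e.g. "42"
//   C10_STRINGIZE_IMPL(__LINE__) // would yield the literal "__LINE__"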

/**
 * C10_ANONYMOUS_VARIABLE(str) introduces an identifier starting with
 * str and ending with a number that varies with the line.
 */
#ifdef __COUNTER__
#define C10_UID __COUNTER__
#define C10_ANONYMOUS_VARIABLE(str) C10_CONCATENATE(str, __COUNTER__)
#else
#define C10_UID __LINE__
#define C10_ANONYMOUS_VARIABLE(str) C10_CONCATENATE(str, __LINE__)
#endif
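
// Usage sketch (illustrative; `ScopedTimer` and MY_SCOPED_TIMER are
// hypothetical): anonymous variables suit RAII guards whose name is never
// referenced again:
//
//   #define MY_SCOPED_TIMER(name) \
//     ScopedTimer C10_ANONYMOUS_VARIABLE(timer)(name)
//
//   // Two uses in one scope expand to distinct identifiers
//   // (e.g. timer0 and timer1 via __COUNTER__), so they do not collide.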

#ifdef __has_attribute
#define C10_HAS_ATTRIBUTE(x) __has_attribute(x)
#else
#define C10_HAS_ATTRIBUTE(x) (0)
#endif

#ifdef __has_cpp_attribute
#define C10_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x)
#else
#define C10_HAS_CPP_ATTRIBUTE(x) (0)
#endif

/// C10_NODISCARD - Warn if a type or return value is discarded.

// Technically, we should check if __cplusplus > 201402L here, because
// [[nodiscard]] is only defined in C++17. However, some compilers
// we care about don't advertise being C++17 (e.g., clang), but
// support the attribute anyway. In fact, this is not just a good idea,
// it's the law: clang::warn_unused_result doesn't work on nvcc + clang
// and the best workaround for this case is to use [[nodiscard]]
// instead; see https://github.com/pytorch/pytorch/issues/13118
//
// Note to future editors: some compilers misbehave here (e.g., they
// advertise support, but the support doesn't actually work, or they
// emit warnings). Compilers which are strict about the matter include
// MSVC, which will complain:
//
//   error C2429: attribute 'nodiscard' requires compiler flag '/std:c++latest'
//
// Exhibits:
// - MSVC 19.14: https://godbolt.org/z/Dzd7gn (requires /std:c++latest)
// - Clang 8.0.0: https://godbolt.org/z/3PYL4Z (always advertises support)
// - gcc 8.3: https://godbolt.org/z/4tLMQS (always advertises support)
#define C10_NODISCARD
#if C10_HAS_CPP_ATTRIBUTE(nodiscard)
#undef C10_NODISCARD
#define C10_NODISCARD [[nodiscard]]
// Workaround for llvm.org/PR23435, since clang 3.6 and below emit a spurious
// error when __has_cpp_attribute is given a scoped attribute in C mode.
#elif __cplusplus && C10_HAS_CPP_ATTRIBUTE(clang::warn_unused_result)
// TODO: It's possible this is still triggering
// https://github.com/pytorch/pytorch/issues/13118 on Windows; if it is, better
// fix it.
#undef C10_NODISCARD
#define C10_NODISCARD [[clang::warn_unused_result]]
#endif
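
// Usage sketch (illustrative; `acquire_handle` is hypothetical):
//
//   C10_NODISCARD int acquire_handle();
//
//   void f() {
//     acquire_handle(); // warning: return value should not be discarded
//   }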

// suppress an unused variable.
#if defined(_MSC_VER) && !defined(__clang__)
#define C10_UNUSED __pragma(warning(suppress : 4100 4101))
#else
#define C10_UNUSED __attribute__((__unused__))
#endif //_MSC_VER

#define C10_RESTRICT __restrict
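
// Usage sketch (illustrative):
//
//   C10_UNUSED static int g_debug_flag = 0; // no unused-variable warning
//
//   // C10_RESTRICT promises the optimizer that pointers do not alias:
//   void axpy(float a, const float* C10_RESTRICT x, float* C10_RESTRICT y);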

// Simply define the namespace, in case a dependent library wants to refer to
// the c10 namespace but not any nontrivial files.
namespace c10 {} // namespace c10
namespace c10 {
namespace cuda {}
} // namespace c10
namespace c10 {
namespace hip {}
} // namespace c10

// Since C10 is the core library for caffe2 (and aten), we will simply reroute
// all abstractions defined in c10 to be available in caffe2 as well.
// This is only for backwards compatibility. Please use the symbols from the
// c10 namespace where possible.
namespace caffe2 {
using namespace c10;
}
namespace at {
using namespace c10;
}
namespace at {
namespace cuda {
using namespace c10::cuda;
}
} // namespace at

// WARNING!!! THIS IS A GIANT HACK!!!
// This line means you cannot simultaneously include c10/hip
// and c10/cuda and then use them from the at::cuda namespace.
// This is true in practice, because HIPIFY works inplace on
// files in ATen/cuda, so it assumes that c10::hip is available
// from at::cuda. This namespace makes that happen. When
// HIPIFY is no longer out-of-place, we can switch the cuda
// here to hip and everyone is happy.
namespace at {
namespace cuda {
using namespace c10::hip;
}
} // namespace at

// C10_LIKELY/C10_UNLIKELY
//
// These macros provide parentheses, so you can use these macros as:
//
//    if C10_LIKELY(some_expr) {
//      ...
//    }
//
// NB: static_cast to boolean is mandatory in C++, because __builtin_expect
// takes a long argument, which means you may trigger the wrong conversion
// without it.
//
#if defined(__GNUC__) || defined(__ICL) || defined(__clang__)
#define C10_LIKELY(expr) (__builtin_expect(static_cast<bool>(expr), 1))
#define C10_UNLIKELY(expr) (__builtin_expect(static_cast<bool>(expr), 0))
#else
#define C10_LIKELY(expr) (expr)
#define C10_UNLIKELY(expr) (expr)
#endif

/// C10_NOINLINE - Functions whose declaration is annotated with this will not
/// be inlined.
#ifdef __GNUC__
#define C10_NOINLINE __attribute__((noinline))
#elif _MSC_VER
#define C10_NOINLINE __declspec(noinline)
#else
#define C10_NOINLINE
#endif

#if defined(_MSC_VER)
#define C10_ALWAYS_INLINE __forceinline
#elif __has_attribute(always_inline) || defined(__GNUC__)
#define C10_ALWAYS_INLINE __attribute__((__always_inline__)) inline
#else
#define C10_ALWAYS_INLINE inline
#endif

#if C10_HAS_CPP_ATTRIBUTE(fallthrough)
#define C10_FALLTHROUGH [[fallthrough]]
#elif C10_HAS_ATTRIBUTE(fallthrough)
#define C10_FALLTHROUGH __attribute__((fallthrough))
#else
#define C10_FALLTHROUGH
#endif
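
// Usage sketch (illustrative; the handlers are hypothetical): mark an
// intentional switch fallthrough so -Wimplicit-fallthrough stays quiet:
//
//   switch (ndim) {
//     case 2:
//       handle_rows();
//       C10_FALLTHROUGH;
//     case 1:
//       handle_cols();
//       break;
//   }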

#include <sstream>
#include <string>

#ifdef __HIPCC__
// Unlike CUDA, HIP requires a HIP header to be included for __host__ to work.
// We do this #include here so that C10_HOST_DEVICE and friends will Just Work.
// See https://github.com/ROCm-Developer-Tools/HIP/issues/441
#include <hip/hip_runtime.h>
#endif

#if defined(__CUDACC__) || defined(__HIPCC__)
// Designates functions callable from the host (CPU) and the device (GPU)
#define C10_HOST_DEVICE __host__ __device__
#define C10_DEVICE __device__
#define C10_HOST __host__
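// Usage sketch (illustrative): a helper compiled for both host and device;
// in plain CPU builds the qualifier expands to nothing (see the #else branch
// at the end of this section):
//
//   C10_HOST_DEVICE inline float sigmoid(float x) {
//     return 1.0f / (1.0f + expf(-x));
//   }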
// constants from
// (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications)
// The maximum number of threads per multiprocessor is 1024 for Turing
// architecture (7.5), 1536 for GeForce Ampere (8.6), and 2048 for all other
// architectures. You'll get warnings if you exceed these constants. Hence, the
// following macros adjust the input values from the user to resolve potential
// warnings.
#if __CUDA_ARCH__ == 750
constexpr uint32_t CUDA_MAX_THREADS_PER_SM = 1024;
#elif __CUDA_ARCH__ == 860
constexpr uint32_t CUDA_MAX_THREADS_PER_SM = 1536;
#else
constexpr uint32_t CUDA_MAX_THREADS_PER_SM = 2048;
#endif
// CUDA_MAX_THREADS_PER_BLOCK is the same for all architectures currently
constexpr uint32_t CUDA_MAX_THREADS_PER_BLOCK = 1024;
// CUDA_THREADS_PER_BLOCK_FALLBACK is the "canonical fallback" choice of block
// size. 256 is a good number for this fallback and should give good occupancy
// and versatility across all architectures.
constexpr uint32_t CUDA_THREADS_PER_BLOCK_FALLBACK = 256;
// NOTE: if you are thinking of constexpr-ifying the inputs to launch bounds,
// it turns out that although __launch_bounds__ can take constexpr, it
// can't take a constexpr that has anything to do with templates.
// Currently we use launch_bounds that depend on template arguments in
// Loops.cuh, Reduce.cuh and LossCTC.cuh. Hence, C10_MAX_THREADS_PER_BLOCK
// and C10_MIN_BLOCKS_PER_SM are kept as macros.
// Suppose you were planning to write __launch_bounds__(a, b), based on your
// performance tuning on a modern GPU. Instead, you should write
// __launch_bounds__(C10_MAX_THREADS_PER_BLOCK(a), C10_MIN_BLOCKS_PER_SM(a, b)),
// which will also properly respect limits on old architectures.
#define C10_MAX_THREADS_PER_BLOCK(val)           \
  (((val) <= CUDA_MAX_THREADS_PER_BLOCK) ? (val) \
                                         : CUDA_THREADS_PER_BLOCK_FALLBACK)
#define C10_MIN_BLOCKS_PER_SM(threads_per_block, blocks_per_sm)        \
  ((((threads_per_block) * (blocks_per_sm) <= CUDA_MAX_THREADS_PER_SM) \
        ? (blocks_per_sm)                                              \
        : ((CUDA_MAX_THREADS_PER_SM + (threads_per_block)-1) /         \
           (threads_per_block))))
// C10_LAUNCH_BOUNDS is analogous to __launch_bounds__
#define C10_LAUNCH_BOUNDS_0 \
  __launch_bounds__(        \
      256, 4) // default launch bounds that should give good occupancy and
              // versatility across all architectures.
#define C10_LAUNCH_BOUNDS_1(max_threads_per_block) \
  __launch_bounds__((C10_MAX_THREADS_PER_BLOCK((max_threads_per_block))))
#define C10_LAUNCH_BOUNDS_2(max_threads_per_block, min_blocks_per_sm) \
  __launch_bounds__(                                                  \
      (C10_MAX_THREADS_PER_BLOCK((max_threads_per_block))),           \
      (C10_MIN_BLOCKS_PER_SM((max_threads_per_block), (min_blocks_per_sm))))
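// Worked example (illustrative; `my_kernel` is hypothetical): suppose tuning
// on a modern GPU suggested __launch_bounds__(512, 4). Writing
//
//   C10_LAUNCH_BOUNDS_2(512, 4)
//   __global__ void my_kernel(float* out, const float* in, int n);
//
// keeps (512, 4) where it is legal, but when compiling for sm_75 (Turing,
// CUDA_MAX_THREADS_PER_SM == 1024), C10_MIN_BLOCKS_PER_SM(512, 4) clamps the
// minimum resident blocks to (1024 + 511) / 512 == 2, since 512 * 4 == 2048
// threads would exceed the SM limit and trigger a compiler warning.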
#else
#define C10_HOST_DEVICE
#define C10_HOST
#define C10_DEVICE
#endif

#ifdef __HIP_PLATFORM_HCC__
#define C10_HIP_HOST_DEVICE __host__ __device__
#else
#define C10_HIP_HOST_DEVICE
#endif

#ifdef __HIP_PLATFORM_HCC__
#define C10_WARP_SIZE 64
#else
#define C10_WARP_SIZE 32
#endif

#if defined(_MSC_VER) && _MSC_VER <= 1900
#define __func__ __FUNCTION__
#endif

// CUDA_KERNEL_ASSERT checks the assertion
// even when NDEBUG is defined. This is useful for important assertions in CUDA
// code that would otherwise be suppressed when building Release.
#if defined(__ANDROID__) || defined(__APPLE__) || defined(__HIP_PLATFORM_HCC__)
// Those platforms do not support assert()
#define CUDA_KERNEL_ASSERT(cond)
#elif defined(_MSC_VER)
#if defined(NDEBUG)
extern "C" {
C10_IMPORT
#if defined(__CUDA_ARCH__) || defined(__HIP_ARCH__) || defined(__HIP__)
__host__ __device__
#endif // __CUDA_ARCH__
    void
    _wassert(wchar_t const* _Message, wchar_t const* _File, unsigned _Line);
}
#endif
#define CUDA_KERNEL_ASSERT(cond)                 \
  if (C10_UNLIKELY(!(cond))) {                   \
    (void)(_wassert(                             \
               _CRT_WIDE(#cond),                 \
               _CRT_WIDE(__FILE__),              \
               static_cast<unsigned>(__LINE__)), \
           0);                                   \
  }
#else // __APPLE__, _MSC_VER
#if defined(NDEBUG)
extern "C" {
#if (defined(__CUDA_ARCH__) && !(defined(__clang__) && defined(__CUDA__))) || \
    defined(__HIP_ARCH__) || defined(__HIP__)
__host__ __device__
#endif // __CUDA_ARCH__
    void
    __assert_fail(
        const char* assertion,
        const char* file,
        unsigned int line,
        const char* function) throw();
}
#endif // NDEBUG
#define CUDA_KERNEL_ASSERT(cond)                                         \
  if (C10_UNLIKELY(!(cond))) {                                           \
    __assert_fail(                                                       \
        #cond, __FILE__, static_cast<unsigned int>(__LINE__), __func__); \
  }
#endif // __APPLE__
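
// Usage sketch (illustrative; `gather` and its arguments are hypothetical):
// bounds-check an index inside a kernel even in Release builds:
//
//   __global__ void gather(
//       float* dst, const float* src, const int64_t* idx,
//       int64_t bound, int n) {
//     int i = blockIdx.x * blockDim.x + threadIdx.x;
//     if (i < n) {
//       CUDA_KERNEL_ASSERT(idx[i] >= 0 && idx[i] < bound);
//       dst[i] = src[idx[i]];
//     }
//   }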

#ifdef __APPLE__
#include <TargetConditionals.h>
#endif

#if defined(__ANDROID__)
#define C10_ANDROID 1
#define C10_MOBILE 1
#elif (                   \
    defined(__APPLE__) && \
    (TARGET_IPHONE_SIMULATOR || TARGET_OS_SIMULATOR || TARGET_OS_IPHONE))
#define C10_IOS 1
#define C10_MOBILE 1
#endif // ANDROID / IOS

// Portable determination of whether type T is trivially copyable.
// Warning: __has_trivial_copy for GCC may not always detect non-POD types
// correctly. For example, T = std::unique_ptr may evaluate to true and be
// treated as POD. This can cause unexpected behavior.
#if defined(__GNUG__) && __GNUC__ < 5
#define C10_IS_TRIVIALLY_COPYABLE(T) __has_trivial_copy(T)
#else
#define C10_IS_TRIVIALLY_COPYABLE(T) std::is_trivially_copyable<T>::value
#endif
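
// Usage sketch (illustrative): gate a memcpy-based fast path on trivial
// copyability, keeping the GCC < 5 caveat above in mind:
//
//   static_assert(C10_IS_TRIVIALLY_COPYABLE(float), "memcpy path requires it");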

// We need --expt-relaxed-constexpr in CUDA because of Eigen. This flag allows
// device code in CUDA to call host constexpr functions. Unfortunately,
// the CUDA compiler (at least for CUDA 9.0, 9.1 and 9.2) isn't compatible
// with many of the constexpr things we'd like to do and the device code
// compiler crashes when it sees one of these host-only functions.
// It works when nvcc builds host code, but not when it builds device code
// and notices it can call these constexpr functions from device code.
// As a workaround, we use C10_HOST_CONSTEXPR instead of constexpr for these
// functions. This enables constexpr when compiled on the host and applies
// __host__ when it is compiled on the device in an attempt to stop it from
// being called from device functions. Not sure if the latter works, but
// even if not, it not being constexpr anymore should be enough to stop
// it from being called from device code.
// TODO This occurred in CUDA 9 (9.0 to 9.2). Test if this is fixed in CUDA 10.
#if defined(__CUDA_ARCH__)
#define C10_HOST_CONSTEXPR __host__
#define C10_HOST_CONSTEXPR_VAR
#else
#define C10_HOST_CONSTEXPR constexpr
#define C10_HOST_CONSTEXPR_VAR constexpr
#endif
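
// Usage sketch (illustrative; `div_up` is hypothetical): constexpr on the
// host pass, demoted to a plain __host__ function on the device pass:
//
//   C10_HOST_CONSTEXPR int div_up(int a, int b) {
//     return (a + b - 1) / b;
//   }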

#if !defined(__clang__) && !defined(_MSC_VER) && defined(__GNUC__) && \
    __GNUC__ < 6
#define CONSTEXPR_EXCEPT_GCC5
#define IS_NOT_GCC5_CONSTEXPR 0
#else
#define CONSTEXPR_EXCEPT_GCC5 constexpr
#define IS_NOT_GCC5_CONSTEXPR 1
#endif

#if defined(__CUDA_ARCH__)
#if defined(_MSC_VER) && defined(__CUDACC__)
#define CONSTEXPR_EXCEPT_WIN_CUDA const
#define C10_HOST_CONSTEXPR_EXCEPT_WIN_CUDA __host__
#else
#define CONSTEXPR_EXCEPT_WIN_CUDA constexpr
#define C10_HOST_CONSTEXPR_EXCEPT_WIN_CUDA __host__
#endif
#else
#if defined(_MSC_VER) && defined(__CUDACC__)
#define CONSTEXPR_EXCEPT_WIN_CUDA const
#define C10_HOST_CONSTEXPR_EXCEPT_WIN_CUDA
#else
#define CONSTEXPR_EXCEPT_WIN_CUDA constexpr
#define C10_HOST_CONSTEXPR_EXCEPT_WIN_CUDA constexpr
#endif
#endif

#endif // C10_MACROS_MACROS_H_