[Reland] [5/N] Change static functions in headers to inline (#131010)
Reland of #130673
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131010
Approved by: https://github.com/Skylion007
parent d6ae8bbf16
commit 7c90a82970
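The change itself is mechanical: function definitions that live in headers drop their static specifier (or trade static for inline), so each function keeps a single definition that the linker can merge across translation units rather than one internal-linkage copy per file that includes the header. A minimal sketch of that distinction follows; the header and function names are hypothetical and not taken from this diff.

// example.h -- hypothetical header, not part of the PyTorch diff below
#pragma once

// Internal linkage: every translation unit that includes this header gets its
// own copy of the function, with a distinct address, duplicated object code,
// and possible -Wunused-function warnings in files that never call it.
static inline int answer_static() {
  return 42;
}

// External linkage + inline: the ODR allows an identical definition in every
// including translation unit, and the linker folds them into one entity.
inline int answer_inline() {
  return 42;
}

For namespace-scope functions defined in a header, inline alone already lets the definition appear in every including file, so dropping the redundant static avoids the per-translation-unit copies without changing what the functions in the diff do.
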
@@ -432,73 +432,73 @@ class TORCH_API Context {
 
 TORCH_API Context& globalContext();
 
-static inline void init() {
+inline void init() {
   globalContext();
 }
 
 TORCH_API Allocator* getCPUAllocator();
 
-static inline DeprecatedTypeProperties& getDeprecatedTypeProperties(
+inline DeprecatedTypeProperties& getDeprecatedTypeProperties(
     Backend p,
     ScalarType s) {
   return globalDeprecatedTypePropertiesRegistry().getDeprecatedTypeProperties(
       p, s);
 }
 
-static inline DeprecatedTypeProperties& CPU(ScalarType s) {
+inline DeprecatedTypeProperties& CPU(ScalarType s) {
   return globalDeprecatedTypePropertiesRegistry().getDeprecatedTypeProperties(
       Backend::CPU, s);
 }
 
-static inline DeprecatedTypeProperties& CUDA(ScalarType s) {
+inline DeprecatedTypeProperties& CUDA(ScalarType s) {
   return globalDeprecatedTypePropertiesRegistry().getDeprecatedTypeProperties(
       Backend::CUDA, s);
 }
 
-static inline DeprecatedTypeProperties& HIP(ScalarType s) {
+inline DeprecatedTypeProperties& HIP(ScalarType s) {
   return globalDeprecatedTypePropertiesRegistry().getDeprecatedTypeProperties(
       Backend::HIP, s);
 }
 
-static inline DeprecatedTypeProperties& MPS(ScalarType s) {
+inline DeprecatedTypeProperties& MPS(ScalarType s) {
   return globalDeprecatedTypePropertiesRegistry().getDeprecatedTypeProperties(
       Backend::MPS, s);
 }
 
-static inline bool hasCUDA() {
+inline bool hasCUDA() {
   return globalContext().hasCUDA();
 }
 
-static inline bool hasMTIA() {
+inline bool hasMTIA() {
   return globalContext().hasMTIA();
 }
 
-static inline bool hasHIP() {
+inline bool hasHIP() {
   return globalContext().hasHIP();
 }
 
-static inline bool hasIPU() {
+inline bool hasIPU() {
   return globalContext().hasIPU();
 }
 
-static inline bool hasXLA() {
+inline bool hasXLA() {
   return globalContext().hasXLA();
 }
 
-static inline bool hasMPS() {
+inline bool hasMPS() {
   return globalContext().hasMPS();
 }
 
-static inline bool hasMAIA() {
+inline bool hasMAIA() {
   return globalContext().hasMAIA();
 }
 
-static inline bool hasXPU() {
+inline bool hasXPU() {
   return globalContext().hasXPU();
 }
 
 // Despite its name, this function returns the number of *CUDA* GPUs.
-static inline size_t getNumGPUs() {
+inline size_t getNumGPUs() {
   // WARNING: DO NOT ADD LOGIC TO HANDLE OTHER DEVICE TYPES TO THIS
   // FUNCTION. If you are interested in interrogating the number of
   // devices for a specific device type, add that function to the

@@ -517,27 +517,27 @@ static inline size_t getNumGPUs() {
   }
 }
 
-static inline bool hasOpenMP() {
+inline bool hasOpenMP() {
   return globalContext().hasOpenMP();
 }
 
-static inline bool hasMKL() {
+inline bool hasMKL() {
   return globalContext().hasMKL();
 }
 
-static inline bool hasLAPACK() {
+inline bool hasLAPACK() {
   return globalContext().hasLAPACK();
 }
 
-static inline bool hasMAGMA() {
+inline bool hasMAGMA() {
   return globalContext().hasMAGMA();
 }
 
-static inline bool hasMKLDNN() {
+inline bool hasMKLDNN() {
   return globalContext().hasMKLDNN();
 }
 
-static inline void manual_seed(uint64_t seed) {
+inline void manual_seed(uint64_t seed) {
   auto gen = globalContext().defaultGenerator(c10::DeviceType::CPU);
   {
     // See Note [Acquire lock when using random generators]

@@ -499,7 +499,7 @@ inline Tensor sum_to(
   return _sum_to(std::move(tensor), shape, always_return_non_view);
 }
 
-static inline bool is_expandable_to(
+inline bool is_expandable_to(
     SymIntArrayRef shape,
     c10::SymIntArrayRef desired) {
   size_t ndim = shape.size();

@@ -517,7 +517,7 @@ static inline bool is_expandable_to(
   return true;
 }
 
-static inline bool is_expandable_to(IntArrayRef shape, IntArrayRef desired) {
+inline bool is_expandable_to(IntArrayRef shape, IntArrayRef desired) {
   auto sym_shape = c10::SymIntArrayRef(
       reinterpret_cast<const c10::SymInt*>(shape.data()), shape.size());
   auto sym_desired = c10::SymIntArrayRef(

@@ -34,13 +34,13 @@ namespace at {
 _(!=, x.ne(y), y.ne(x))
 
 #define DEFINE_OPERATOR(op, body, reverse_scalar_body) \
-  static inline Tensor operator op(const Tensor& x, const Tensor& y) { \
+  inline Tensor operator op(const Tensor& x, const Tensor& y) { \
     return body; \
   } \
-  static inline Tensor operator op(const Tensor& x, const Scalar& y) { \
+  inline Tensor operator op(const Tensor& x, const Scalar& y) { \
    return body; \
   } \
-  static inline Tensor operator op(const Scalar& x, const Tensor& y) { \
+  inline Tensor operator op(const Scalar& x, const Tensor& y) { \
     return reverse_scalar_body; \
   }
 

@@ -113,12 +113,12 @@
 
 namespace at::tracer::impl {
 
-static inline bool is_dispatch_enabled() {
+inline bool is_dispatch_enabled() {
   return c10::impl::tls_is_dispatch_key_included(at::DispatchKey::Tracer) &&
       !c10::impl::tls_is_dispatch_key_excluded(at::DispatchKey::Tracer);
 }
 
-static inline void set_dispatch_enabled(bool enabled) {
+inline void set_dispatch_enabled(bool enabled) {
   TORCH_INTERNAL_ASSERT(
       !c10::impl::tls_is_dispatch_key_excluded(at::DispatchKey::Tracer),
       "Cannot enable tracing within the scope of NoTracerDispatchMode!");

@@ -29,7 +29,7 @@ TORCH_API int _crash_if_asan(int);
 // Converts a TensorList (i.e. ArrayRef<Tensor> to vector of TensorImpl*)
 // NB: This is ONLY used by legacy TH bindings, and ONLY used by cat.
 // Once cat is ported entirely to ATen this can be deleted!
-static inline std::vector<TensorImpl*> checked_dense_tensor_list_unwrap(
+inline std::vector<TensorImpl*> checked_dense_tensor_list_unwrap(
     ArrayRef<Tensor> tensors,
     const char* name,
     int pos,

@@ -13,7 +13,7 @@ namespace at {
 
 constexpr size_t dim_bitset_size = 64;
 
-static inline std::bitset<dim_bitset_size> dim_list_to_bitset(
+inline std::bitset<dim_bitset_size> dim_list_to_bitset(
     OptionalIntArrayRef opt_dims,
     size_t ndims) {
   TORCH_CHECK(

@@ -18,7 +18,7 @@ TORCH_API std::ostream& print(
     std::ostream& stream,
     const Tensor& tensor,
     int64_t linesize);
-static inline std::ostream& operator<<(std::ostream & out, const Tensor & t) {
+inline std::ostream& operator<<(std::ostream & out, const Tensor & t) {
   return print(out,t,80);
 }
 TORCH_API void print(const Tensor & t, int64_t linesize=80);

@@ -93,7 +93,7 @@ torch::jit::Stack boxArgs(Args... args) {
 }
 
 template <class T>
-static inline constexpr size_t boxed_size_one() {
+inline constexpr size_t boxed_size_one() {
   static_assert(!std::is_same<std::decay_t<T>, c10::TensorOptions>::value, "need to patch this path to support TensorOptions passed by reference");
   return 1;
 }

@@ -159,7 +159,7 @@ struct Atomic##NAME##IntegerImpl<T, 8> {
 
 
 # define GPU_ATOMIC_INTEGER(NAME, OP, DTYPE) \
-static inline __device__ void gpuAtomic##NAME(DTYPE *address, DTYPE val) { \
+inline __device__ void gpuAtomic##NAME(DTYPE *address, DTYPE val) { \
   Atomic##NAME##IntegerImpl<DTYPE, sizeof(DTYPE)>()(address, \
       val, \
       [](DTYPE a, DTYPE b) { \

@@ -171,7 +171,7 @@ ATOMIC_INTEGER_IMPL(Add)
 GPU_ATOMIC_INTEGER(Add, a || b, bool)
 
 // Don't instantiate gpuAtomicAdd with the macro as it seems non-standard (see int32, int64)
-static inline __device__ void gpuAtomicAdd(uint8_t *address, uint8_t val) {
+inline __device__ void gpuAtomicAdd(uint8_t *address, uint8_t val) {
   AtomicAddIntegerImpl<uint8_t, sizeof(uint8_t)>()(address,
       val,
       [](uint8_t a, uint8_t b) {

@@ -179,7 +179,7 @@ static inline __device__ void gpuAtomicAdd(uint8_t *address, uint8_t val) {
       });
 }
 
-static inline __device__ void gpuAtomicAdd(int8_t *address, int8_t val) {
+inline __device__ void gpuAtomicAdd(int8_t *address, int8_t val) {
   AtomicAddIntegerImpl<int8_t, sizeof(int8_t)>()(address,
       val,
       [](int8_t a, int8_t b) {

@@ -187,7 +187,7 @@ static inline __device__ void gpuAtomicAdd(int8_t *address, int8_t val) {
       });
 }
 
-static inline __device__ void gpuAtomicAdd(int16_t *address, int16_t val) {
+inline __device__ void gpuAtomicAdd(int16_t *address, int16_t val) {
   AtomicAddIntegerImpl<int16_t, sizeof(int16_t)>()(address,
       val,
       [](int16_t a, int16_t b) {

@@ -195,11 +195,11 @@ static inline __device__ void gpuAtomicAdd(int16_t *address, int16_t val) {
       });
 }
 
-static inline __device__ int32_t gpuAtomicAdd(int32_t *address, int32_t val) {
+inline __device__ int32_t gpuAtomicAdd(int32_t *address, int32_t val) {
   return atomicAdd(address, val);
 }
 
-static inline __device__ void gpuAtomicAdd(int64_t *address, int64_t val) {
+inline __device__ void gpuAtomicAdd(int64_t *address, int64_t val) {
 #if defined(USE_ROCM)
   __atomic_fetch_add(address, val, __ATOMIC_RELAXED);
 #else

@@ -208,7 +208,7 @@ static inline __device__ void gpuAtomicAdd(int64_t *address, int64_t val) {
 #endif
 }
 
-static inline __device__ at::Half gpuAtomicAdd(at::Half *address, at::Half val) {
+inline __device__ at::Half gpuAtomicAdd(at::Half *address, at::Half val) {
 #if defined(USE_ROCM) || ((defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700)))
   return AtomicFPOp<at::Half>()(address, val,
       [](at::Half hsum, at::Half val) {

@@ -219,7 +219,7 @@ static inline __device__ at::Half gpuAtomicAdd(at::Half *address, at::Half val)
 #endif
 }
 
-static inline __device__ at::BFloat16 gpuAtomicAdd(at::BFloat16 *address, at::BFloat16 val) {
+inline __device__ at::BFloat16 gpuAtomicAdd(at::BFloat16 *address, at::BFloat16 val) {
 #if defined(USE_ROCM) || ((defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800)))
   return AtomicFPOp<at::BFloat16>()(address, val,
       [](at::BFloat16 bsum, at::BFloat16 val) {

@@ -233,7 +233,7 @@ return AtomicFPOp<at::BFloat16>()(address, val,
 
 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600)
 // from CUDA C Programmic Guide
-static inline __device__ double atomicAdd(double* address, double val)
+inline __device__ double atomicAdd(double* address, double val)
 #if defined(__clang__) && defined(__CUDA__)
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wgcc-compat"

@@ -261,20 +261,20 @@ static inline __device__ double atomicAdd(double* address, double val)
 
 #if defined(USE_ROCM) && __hcc_workweek__ < 18312 && !__HIP__
 // This needs to be defined for the host side pass
-static inline __device__ double atomicAdd(double *address, double val) { }
+inline __device__ double atomicAdd(double *address, double val) { }
 #endif
 #endif
 
-static inline __device__ double gpuAtomicAdd(double *address, double val) {
+inline __device__ double gpuAtomicAdd(double *address, double val) {
   return atomicAdd(address, val);
 }
 
-static inline __device__ float gpuAtomicAdd(float *address, float val) {
+inline __device__ float gpuAtomicAdd(float *address, float val) {
   return atomicAdd(address, val);
 }
 
 template<typename T>
-static inline __device__ void gpuAtomicAdd(c10::complex<T> *address, c10::complex<T> val) {
+inline __device__ void gpuAtomicAdd(c10::complex<T> *address, c10::complex<T> val) {
   gpuAtomicAdd(&address->real_, val.real_);
   gpuAtomicAdd(&address->imag_, val.imag_);
 }

@@ -285,31 +285,31 @@ static inline __device__ void gpuAtomicAdd(c10::complex<T> *address, c10::comple
  * directly and require non-library provided data type support. Only for these, we
  * continue to provide atomicAdd overloads.
  */
-static inline __device__ at::Half atomicAdd(at::Half *address, at::Half val) {
+inline __device__ at::Half atomicAdd(at::Half *address, at::Half val) {
   return gpuAtomicAdd(address, val);
 }
 
-static inline __device__ at::BFloat16 atomicAdd(at::BFloat16 *address, at::BFloat16 val) {
+inline __device__ at::BFloat16 atomicAdd(at::BFloat16 *address, at::BFloat16 val) {
   return gpuAtomicAdd(address, val);
 }
 
-static inline __device__ void atomicAdd(uint8_t *address, uint8_t val) {
+inline __device__ void atomicAdd(uint8_t *address, uint8_t val) {
   gpuAtomicAdd(address, val);
 }
 
-static inline __device__ void atomicAdd(int8_t *address, int8_t val) {
+inline __device__ void atomicAdd(int8_t *address, int8_t val) {
   gpuAtomicAdd(address, val);
 }
 
-static inline __device__ void atomicAdd(int16_t *address, int16_t val) {
+inline __device__ void atomicAdd(int16_t *address, int16_t val) {
   gpuAtomicAdd(address, val);
 }
 
-static inline __device__ void atomicAdd(int64_t *address, int64_t val) {
+inline __device__ void atomicAdd(int64_t *address, int64_t val) {
   gpuAtomicAdd(address, val);
 }
 
-static inline __device__ void atomicAdd(bool *address, bool val) {
+inline __device__ void atomicAdd(bool *address, bool val) {
   gpuAtomicAdd(address, val);
 }

@@ -321,20 +321,20 @@ static inline __device__ void atomicAdd(bool *address, bool val) {
  * therefore we need a new API 'gpuAtomicAddNoReturn'.
  */
 template<typename T>
-static inline __device__ void gpuAtomicAddNoReturn(c10::complex<T> *address, c10::complex<T> val) { gpuAtomicAdd(address, val); }
-static inline __device__ void gpuAtomicAddNoReturn(uint8_t *address, uint8_t val) { gpuAtomicAdd(address, val); }
-static inline __device__ void gpuAtomicAddNoReturn(int8_t *address, int8_t val) { gpuAtomicAdd(address, val); }
-static inline __device__ void gpuAtomicAddNoReturn(int16_t *address, int16_t val) { gpuAtomicAdd(address, val); }
-static inline __device__ void gpuAtomicAddNoReturn(int32_t *address, int32_t val) { gpuAtomicAdd(address, val); }
-static inline __device__ void gpuAtomicAddNoReturn(int64_t *address, int64_t val) { gpuAtomicAdd(address, val); }
-static inline __device__ void gpuAtomicAddNoReturn(bool *address, bool val) { gpuAtomicAdd(address, val); }
-static inline __device__ void gpuAtomicAddNoReturn(at::Half *address, at::Half val) { gpuAtomicAdd(address, val); }
-static inline __device__ void gpuAtomicAddNoReturn(at::BFloat16 *address, at::BFloat16 val) { gpuAtomicAdd(address, val); }
-static inline __device__ void gpuAtomicAddNoReturn(double *address, double val) { gpuAtomicAdd(address, val); }
+inline __device__ void gpuAtomicAddNoReturn(c10::complex<T> *address, c10::complex<T> val) { gpuAtomicAdd(address, val); }
+inline __device__ void gpuAtomicAddNoReturn(uint8_t *address, uint8_t val) { gpuAtomicAdd(address, val); }
+inline __device__ void gpuAtomicAddNoReturn(int8_t *address, int8_t val) { gpuAtomicAdd(address, val); }
+inline __device__ void gpuAtomicAddNoReturn(int16_t *address, int16_t val) { gpuAtomicAdd(address, val); }
+inline __device__ void gpuAtomicAddNoReturn(int32_t *address, int32_t val) { gpuAtomicAdd(address, val); }
+inline __device__ void gpuAtomicAddNoReturn(int64_t *address, int64_t val) { gpuAtomicAdd(address, val); }
+inline __device__ void gpuAtomicAddNoReturn(bool *address, bool val) { gpuAtomicAdd(address, val); }
+inline __device__ void gpuAtomicAddNoReturn(at::Half *address, at::Half val) { gpuAtomicAdd(address, val); }
+inline __device__ void gpuAtomicAddNoReturn(at::BFloat16 *address, at::BFloat16 val) { gpuAtomicAdd(address, val); }
+inline __device__ void gpuAtomicAddNoReturn(double *address, double val) { gpuAtomicAdd(address, val); }
 
 /* Special case fp32 atomic. */
 #if defined(USE_ROCM)
-static inline __device__ void gpuAtomicAddNoReturn(float *address, float val) {
+inline __device__ void gpuAtomicAddNoReturn(float *address, float val) {
 #if defined(__gfx908__)
   atomicAddNoRet(address, val);
 #else

@@ -342,7 +342,7 @@ static inline __device__ void gpuAtomicAddNoReturn(float *address, float val) {
 #endif
 }
 #else
-static inline __device__ void gpuAtomicAddNoReturn(float *address, float val) { gpuAtomicAdd(address, val); }
+inline __device__ void gpuAtomicAddNoReturn(float *address, float val) { gpuAtomicAdd(address, val); }
 #endif
 
 // Atomic multiplication implementation.

@@ -11,7 +11,7 @@ namespace {
 
 // checks whether index.dtype == int64
 // and self.dtype == src.dtype if src is a Tensor
-static void scatter_gather_dtype_check(
+inline void scatter_gather_dtype_check(
     const std::string& method_name,
     const Tensor& self,
     const Tensor& index,

@@ -38,7 +38,7 @@ static void scatter_gather_dtype_check(
 // Test:
 // 1. index.size(d) <= self.size(d) for all d != dim
 // 2. index.dim() == self.dim()
-static C10_UNUSED void gather_shape_check(const Tensor& self, int64_t dim,
+inline void gather_shape_check(const Tensor& self, int64_t dim,
     const Tensor& index
 ) {
   auto self_dims = ensure_nonempty_dim(self.dim());

@@ -64,7 +64,7 @@ static C10_UNUSED void gather_shape_check(const Tensor& self, int64_t dim,
 // 1. index.size(d) <= self.size(d) for all d != dim
 // 2. index.size(d) <= src.size(d) for all d if src is a Tensor
 // 3. index.dim() == self.dim() == src.dim()
-static C10_UNUSED void scatter_shape_check(
+inline void scatter_shape_check(
     const Tensor& self, int64_t dim, const Tensor& index,
     const std::optional<Tensor>& src_opt = std::nullopt
 ) {

@@ -18,7 +18,7 @@ using detail::GridSamplerPadding;
 // +1 --> (size - 1) + 0.5 == size - 0.5
 // scale_factor = size / 2
 template <typename scalar_t>
-static __forceinline__ __device__
+__forceinline__ __device__
 scalar_t grid_sampler_unnormalize(scalar_t coord, int size, bool align_corners) {
   if (align_corners) {
     // unnormalize coord from [-1, 1] to [0, size - 1]

@@ -34,7 +34,7 @@ scalar_t grid_sampler_unnormalize(scalar_t coord, int size, bool align_corners)
 // `grad_in`.
 // This is useful in the backward pass of grid_sampler.
 template <typename scalar_t>
-static __forceinline__ __device__
+__forceinline__ __device__
 scalar_t grid_sampler_unnormalize_set_grad(scalar_t coord, int size,
     bool align_corners, scalar_t *grad_in) {
   if (align_corners) {

@@ -50,7 +50,7 @@ scalar_t grid_sampler_unnormalize_set_grad(scalar_t coord, int size,
 
 // Clips coordinates to between 0 and clip_limit - 1
 template <typename scalar_t>
-static __forceinline__ __device__
+__forceinline__ __device__
 scalar_t clip_coordinates(scalar_t in, int clip_limit) {
   return ::min(static_cast<scalar_t>(clip_limit - 1), ::max(in, static_cast<scalar_t>(0)));
 }

@@ -59,7 +59,7 @@ scalar_t clip_coordinates(scalar_t in, int clip_limit) {
 // it also returns the `d output / d input` via pointer argument `grad_in`.
 // This is useful in the backward pass of grid_sampler.
 template <typename scalar_t>
-static __forceinline__ __device__
+__forceinline__ __device__
 scalar_t clip_coordinates_set_grad(scalar_t in, int clip_limit, scalar_t *grad_in) {
   // Note that it is important for the gradient calculation that borders
   // are considered out of bounds.

@@ -82,7 +82,7 @@ scalar_t clip_coordinates_set_grad(scalar_t in, int clip_limit, scalar_t *grad_i
 // The bounds are passed as twice their value so that half-integer values
 // can be represented as ints.
 template <typename scalar_t>
-static __forceinline__ __device__
+__forceinline__ __device__
 scalar_t reflect_coordinates(scalar_t in, int twice_low, int twice_high) {
   if (twice_low == twice_high) {
     return static_cast<scalar_t>(0);

@@ -105,7 +105,7 @@ scalar_t reflect_coordinates(scalar_t in, int twice_low, int twice_high) {
 // `grad_in`.
 // This is useful in the backward pass of grid_sampler.
 template <typename scalar_t>
-static __forceinline__ __device__
+__forceinline__ __device__
 scalar_t reflect_coordinates_set_grad(scalar_t in, int twice_low, int twice_high,
     scalar_t *grad_in) {
   if (twice_low == twice_high) {

@@ -135,7 +135,7 @@ scalar_t reflect_coordinates_set_grad(scalar_t in, int twice_low, int twice_high
 }
 
 template<typename scalar_t>
-static __forceinline__ __device__
+__forceinline__ __device__
 scalar_t safe_downgrade_to_int_range(scalar_t x){
   // -100.0 does not have special meaning. This is just to make sure
   // it's not within_bounds_2d or within_bounds_3d, and does not cause

@@ -146,7 +146,7 @@ scalar_t safe_downgrade_to_int_range(scalar_t x){
 }
 
 template<typename scalar_t>
-static __forceinline__ __device__
+__forceinline__ __device__
 scalar_t compute_coordinates(scalar_t coord, int size,
     GridSamplerPadding padding_mode,
     bool align_corners) {

@@ -170,7 +170,7 @@ scalar_t compute_coordinates(scalar_t coord, int size,
 
 // Computes the pixel source index value for a grid coordinate
 template <typename scalar_t>
-static __forceinline__ __device__
+__forceinline__ __device__
 scalar_t grid_sampler_compute_source_index(
     scalar_t coord,
     int size,

@@ -186,7 +186,7 @@ scalar_t grid_sampler_compute_source_index(
 // `d output / d input` via pointer argument `grad_in`.
 // This is useful in the backward pass of grid_sampler.
 template <typename scalar_t>
-static __forceinline__ __device__
+__forceinline__ __device__
 scalar_t grid_sampler_compute_source_index_set_grad(
     scalar_t coord,
     int size,

@@ -215,18 +215,18 @@ scalar_t grid_sampler_compute_source_index_set_grad(
   return coord;
 }
 
-static __forceinline__ __device__
+__forceinline__ __device__
 bool within_bounds_2d(int h, int w, int H, int W) {
   return h >= 0 && h < H && w >= 0 && w < W;
 }
 
-static __forceinline__ __device__
+__forceinline__ __device__
 bool within_bounds_3d(int d, int h, int w, int D, int H, int W) {
   return d >= 0 && d < D && h >= 0 && h < H && w >= 0 && w < W;
 }
 
 template<typename scalar_t>
-static __forceinline__ __device__
+__forceinline__ __device__
 scalar_t get_value_bounded(
     const scalar_t *data, scalar_t x, scalar_t y, int W, int H, int sW, int sH,
     GridSamplerPadding padding_mode,

@@ -245,7 +245,7 @@ scalar_t get_value_bounded(
 }
 
 template<typename scalar_t, typename index_t>
-static __forceinline__ __device__
+__forceinline__ __device__
 void safe_add_2d(scalar_t *data, int h, int w,
     int sH, int sW, int H, int W,
     scalar_t delta,

@@ -261,7 +261,7 @@ void safe_add_2d(scalar_t *data, int h, int w,
 }
 
 template<typename scalar_t, typename index_t>
-static __forceinline__ __device__
+__forceinline__ __device__
 void safe_add_3d(scalar_t *data, int d, int h, int w,
     int sD, int sH, int sW, int D, int H, int W,
     scalar_t delta,

@@ -277,7 +277,7 @@ void safe_add_3d(scalar_t *data, int d, int h, int w,
 }
 
 template<typename scalar_t, typename index_t>
-static __forceinline__ __device__
+__forceinline__ __device__
 void add_value_bounded(
     scalar_t* data, scalar_t x, scalar_t y, int W, int H, int sW, int sH,
     scalar_t delta,

@@ -297,7 +297,7 @@ void add_value_bounded(
 
 // Calculate the differential of the cubic convolution, i.e. `d coeff / d x`
 template<typename scalar_t>
-static __forceinline__ __device__
+__forceinline__ __device__
 void get_cubic_coefficients_grad(
     scalar_t coeffs[4],
     scalar_t t) {

@@ -21,7 +21,7 @@ constexpr int MAX_BLOCK_SIZE = 1024;
 // Maximum size per grid dimension that we assume (compute capability >= 2.0)
 constexpr int64_t MAX_GRID_SIZE = 65535LL;
 
-static bool getGridFromTiles(int64_t gridTiles, dim3& grid) {
+inline bool getGridFromTiles(int64_t gridTiles, dim3& grid) {
   if (gridTiles > MAX_GRID_SIZE * MAX_GRID_SIZE * MAX_GRID_SIZE) {
     return false;
   }

@@ -92,7 +92,7 @@ struct GlobalIndexToPerSliceIndex {
 };
 
 // Returns 2^(ceil(lg(n)) from Stanford bit twiddling hacks
-static uint64_t nextHighestPowerOf2(uint64_t n) {
+inline uint64_t nextHighestPowerOf2(uint64_t n) {
   n--;
   n |= n >> 1;
   n |= n >> 2;

@@ -72,7 +72,7 @@ __device__ inline scalar_t max(scalar_t a, scalar_t b) {
 
 // see NOTE [ Nearest neighbor upsampling kernel implementation ]
 template <typename accscalar_t>
-__host__ __forceinline__ static accscalar_t compute_scales_value(
+__host__ __forceinline__ accscalar_t compute_scales_value(
     const std::optional<double> scale,
     int64_t src_size,
     int64_t dst_size) {

@@ -83,7 +83,7 @@ __host__ __forceinline__ static accscalar_t compute_scales_value(
 
 // see NOTE [ Nearest neighbor upsampling kernel implementation ]
 template <typename accscalar_t>
-__host__ __forceinline__ static accscalar_t compute_scales_value_backwards(
+__host__ __forceinline__ accscalar_t compute_scales_value_backwards(
     const std::optional<double> scale,
     int64_t src_size,
     int64_t dst_size) {

@@ -93,7 +93,7 @@ __host__ __forceinline__ static accscalar_t compute_scales_value_backwards(
 }
 
 template <typename accscalar_t>
-__host__ __forceinline__ static accscalar_t area_pixel_compute_scale(
+__host__ __forceinline__ accscalar_t area_pixel_compute_scale(
     int input_size,
     int output_size,
     bool align_corners,

@@ -112,7 +112,7 @@ __host__ __forceinline__ static accscalar_t area_pixel_compute_scale(
 }
 
 template <typename accscalar_t>
-__device__ __forceinline__ static accscalar_t area_pixel_compute_source_index(
+__device__ __forceinline__ accscalar_t area_pixel_compute_source_index(
     accscalar_t scale,
     int dst_index,
     bool align_corners,

@@ -130,7 +130,7 @@ __device__ __forceinline__ static accscalar_t area_pixel_compute_source_index(
 }
 
 // see NOTE [ Nearest neighbor upsampling kernel implementation ]
-__device__ __forceinline__ static int nearest_neighbor_compute_source_index(
+__device__ __forceinline__ int nearest_neighbor_compute_source_index(
     const float scale,
     int dst_index,
     int input_size) {

@@ -144,7 +144,7 @@ __device__ __forceinline__ static int nearest_neighbor_compute_source_index(
   return src_index;
 }
 
-__device__ __forceinline__ static int nearest_neighbor_exact_compute_source_index(
+__device__ __forceinline__ int nearest_neighbor_exact_compute_source_index(
     const float scale,
     int dst_index,
     int input_size) {

@@ -157,7 +157,7 @@ __device__ __forceinline__ static int nearest_neighbor_exact_compute_source_inde
 }
 
 // see NOTE [ Nearest neighbor upsampling kernel implementation ]
-__device__ __forceinline__ static int nearest_neighbor_bw_compute_source_index(
+__device__ __forceinline__ int nearest_neighbor_bw_compute_source_index(
     const float scale,
     int dst_index,
     int output_size) {

@@ -170,7 +170,7 @@ __device__ __forceinline__ static int nearest_neighbor_bw_compute_source_index(
 }
 
 // see NOTE [ Nearest neighbor upsampling kernel implementation ]
-__device__ __forceinline__ static int nearest_neighbor_exact_bw_compute_source_index(
+__device__ __forceinline__ int nearest_neighbor_exact_bw_compute_source_index(
     const float scale,
     int dst_index,
     int output_size) {

@@ -182,7 +182,7 @@ __device__ __forceinline__ static int nearest_neighbor_exact_bw_compute_source_i
 
 /* Used by UpSampleBicubic2d.cu */
 template <typename scalar_t>
-__device__ __forceinline__ static scalar_t upsample_get_value_bounded(
+__device__ __forceinline__ scalar_t upsample_get_value_bounded(
     const PackedTensorAccessor64<const scalar_t, 4>& data,
     int batch,
     int channel,

@@ -197,7 +197,7 @@ __device__ __forceinline__ static scalar_t upsample_get_value_bounded(
 
 /* Used by UpSampleBicubic2d.cu */
 template <typename scalar_t, typename accscalar_t>
-__device__ __forceinline__ static void upsample_increment_value_bounded(
+__device__ __forceinline__ void upsample_increment_value_bounded(
     PackedTensorAccessor64<scalar_t, 4>& data,
     int batch,
     int channel,

@@ -218,21 +218,21 @@ __device__ __forceinline__ static void upsample_increment_value_bounded(
 // Based on
 // https://en.wikipedia.org/wiki/Bicubic_interpolation#Bicubic_convolution_algorithm
 template <typename accscalar_t>
-__device__ __forceinline__ static accscalar_t cubic_convolution1(
+__device__ __forceinline__ accscalar_t cubic_convolution1(
     accscalar_t x,
     accscalar_t A) {
   return ((A + 2) * x - (A + 3)) * x * x + 1;
 }
 
 template <typename accscalar_t>
-__device__ __forceinline__ static accscalar_t cubic_convolution2(
+__device__ __forceinline__ accscalar_t cubic_convolution2(
    accscalar_t x,
     accscalar_t A) {
   return ((A * x - 5 * A) * x + 8 * A) * x - 4 * A;
 }
 
 template <typename accscalar_t>
-__device__ __forceinline__ static void get_cubic_upsampling_coefficients(
+__device__ __forceinline__ void get_cubic_upsampling_coefficients(
     accscalar_t coeffs[4],
     accscalar_t t) {
   accscalar_t A = -0.75;

@@ -248,7 +248,7 @@ __device__ __forceinline__ static void get_cubic_upsampling_coefficients(
 }
 
 template <typename scalar_t, typename accscalar_t>
-__device__ __forceinline__ static accscalar_t cubic_interp1d(
+__device__ __forceinline__ accscalar_t cubic_interp1d(
     scalar_t x0,
     scalar_t x1,
     scalar_t x2,

@@ -306,7 +306,7 @@ struct BicubicFilterFunctor {
 };
 
 template <typename accscalar_t>
-__device__ __forceinline__ static void _compute_weights_span(
+__device__ __forceinline__ void _compute_weights_span(
     const int i,
     const int input_size,
     const accscalar_t scale,

@@ -320,7 +320,7 @@ __device__ __forceinline__ static void _compute_weights_span(
 }
 
 template <typename scalar_t, typename accscalar_t, typename interp_filter_t>
-__device__ __forceinline__ static void _compute_weights(
+__device__ __forceinline__ void _compute_weights(
     scalar_t* wt_ptr,
     const accscalar_t scale,
     int interp_size,

@@ -347,7 +347,7 @@ __device__ __forceinline__ static void _compute_weights(
 }
 
 template <typename scalar_t, typename accscalar_t>
-__device__ __forceinline__ static accscalar_t interpolate_aa_single_dim(
+__device__ __forceinline__ accscalar_t interpolate_aa_single_dim(
     const scalar_t* src,
     const scalar_t* weights,
     int size) {

@@ -14,7 +14,7 @@
 #ifdef __CUDACC_RTC__
 #define __MATH_FUNCTIONS_DECL__ C10_HOST_DEVICE
 #else /* __CUDACC_RTC__ */
-#define __MATH_FUNCTIONS_DECL__ static inline C10_HOST_DEVICE
+#define __MATH_FUNCTIONS_DECL__ inline C10_HOST_DEVICE
 #endif /* __CUDACC_RTC__ */
 #endif /* __HIPCC__ */
 