Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/44913
Pull Request resolved: https://github.com/pytorch/pytorch/pull/18322

Optimize Scale function

i-am-not-moving-c2-to-c10

Test Plan: buck test mode/dbg caffe2/caffe2/python/operator_test:weighted_sum_test

Reviewed By: BIT-silence

Differential Revision: D14575780

fbshipit-source-id: db333a7964581dcaff6e432ff1d6b517ba1a075f
#include "caffe2/utils/math/elementwise.h"

#include <type_traits>

#include <thrust/execution_policy.h>
#include <thrust/fill.h>
#include <thrust/functional.h>
#include <thrust/transform.h>

#include "caffe2/core/context_gpu.h"
#include "caffe2/utils/conversions.h"
#include "caffe2/utils/math/half_utils.h"
#include "caffe2/utils/math/utils.h"

namespace caffe2 {
namespace math {

namespace {

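// SinCosCUDAKernel: computes S[i] = sin(X[i]) and C[i] = cos(X[i]) in a
// single pass, one thread per element. On sm_35+ and on HIP the input is
// read via __ldg so it can go through the read-only cache.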
template <typename T>
__global__ void SinCosCUDAKernel(const int N, const T* X, T* S, T* C) {
  const int i = blockIdx.x * CAFFE_CUDA_NUM_THREADS + threadIdx.x;
  if (i < N) {
#if __CUDA_ARCH__ >= 350 || defined(__HIP_PLATFORM_HCC__)
    c10::cuda::compat::sincos(__ldg(X + i), S + i, C + i);
#else
    c10::cuda::compat::sincos(X[i], S + i, C + i);
#endif
  }
}

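// The Axpy kernels below are only compiled for HIP builds; CUDA builds use
// the cublasAxpyEx path defined later in this file instead. This first
// overload takes alpha by value and computes Y[i] += alpha * X[i], one
// thread per element.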
#ifdef __HIP_PLATFORM_HCC__

template <typename TAlpha, typename TData>
__global__ void AxpyCUDAKernel(
    const std::int64_t N,
    const TAlpha alpha,
    const TData* X,
    TData* Y) {
  const int64_t index = static_cast<int64_t>(blockIdx.x) *
          static_cast<int64_t>(CAFFE_CUDA_NUM_THREADS) +
      static_cast<int64_t>(threadIdx.x);
  if (index < N) {
    Y[index] += static_cast<TData>(alpha) * __ldg(X + index);
  }
}

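// Variant for alpha stored in device memory: thread 0 of each block loads it
// into shared memory once and the block synchronizes before use, so the
// global read is done once per block instead of once per thread.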
template <typename TAlpha, typename TData>
__global__ void AxpyCUDAKernel(
    const std::int64_t N,
    const TAlpha* alpha,
    const TData* X,
    TData* Y) {
  __shared__ TData a;
  if (threadIdx.x == 0) {
    a = static_cast<TData>(__ldg(alpha));
  }
  __syncthreads();
  const int64_t index = static_cast<int64_t>(blockIdx.x) *
          static_cast<int64_t>(CAFFE_CUDA_NUM_THREADS) +
      static_cast<int64_t>(threadIdx.x);
  if (index < N) {
    Y[index] += a * __ldg(X + index);
  }
}

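// at::Half has no direct arithmetic here, so the half specializations convert
// each operand to TAlpha (float when instantiated below), apply FMAFunc
// (fmaf), and convert the result back to half.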
#define DELEGATE_HALF_AXPY_CUDA_KERNEL(TAlpha, FMAFunc) \
  template <> \
  __global__ void AxpyCUDAKernel<TAlpha, at::Half>( \
      const std::int64_t N, \
      const TAlpha alpha, \
      const at::Half* X, \
      at::Half* Y) { \
    const int64_t index = static_cast<int64_t>(blockIdx.x) * \
            static_cast<int64_t>(CAFFE_CUDA_NUM_THREADS) + \
        static_cast<int64_t>(threadIdx.x); \
    if (index < N) { \
      Y[index] = convert::To<TAlpha, at::Half>(FMAFunc( \
          alpha, \
          convert::To<at::Half, TAlpha>(X[index]), \
          convert::To<at::Half, TAlpha>(Y[index]))); \
    } \
  } \
  template <> \
  __global__ void AxpyCUDAKernel<TAlpha, at::Half>( \
      const std::int64_t N, \
      const TAlpha* alpha, \
      const at::Half* X, \
      at::Half* Y) { \
    __shared__ TAlpha a; \
    if (threadIdx.x == 0) { \
      a = __ldg(alpha); \
    } \
    __syncthreads(); \
    const int64_t index = static_cast<int64_t>(blockIdx.x) * \
            static_cast<int64_t>(CAFFE_CUDA_NUM_THREADS) + \
        static_cast<int64_t>(threadIdx.x); \
    if (index < N) { \
      Y[index] = convert::To<TAlpha, at::Half>(FMAFunc( \
          a, \
          convert::To<at::Half, TAlpha>(X[index]), \
          convert::To<at::Half, TAlpha>(Y[index]))); \
    } \
  }
DELEGATE_HALF_AXPY_CUDA_KERNEL(float, fmaf)
#undef DELEGATE_HALF_AXPY_CUDA_KERNEL

#endif // __HIP_PLATFORM_HCC__

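// Axpby: Y = alpha * X + beta * Y. One overload takes host scalars, the
// other takes device pointers for alpha and beta; specializations are
// generated by the macros below, including half-precision versions that
// compute in TAlpha.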
template <typename TAlpha, typename TData>
__global__ void AxpbyCUDAKernel(
    const std::int64_t N,
    const TAlpha alpha,
    const TData* X,
    const TAlpha beta,
    TData* Y);

template <typename TAlpha, typename TData>
__global__ void AxpbyCUDAKernel(
    const std::int64_t N,
    const TAlpha* alpha,
    const TData* X,
    const TAlpha* beta,
    TData* Y);

#define DELEGATE_AXPBY_CUDA_KERNEL(TAlpha, TData, FMAFunc) \
  template <> \
  __global__ void AxpbyCUDAKernel<TAlpha, TData>( \
      const std::int64_t N, \
      const TAlpha alpha, \
      const TData* X, \
      const TAlpha beta, \
      TData* Y) { \
    const int64_t index = static_cast<int64_t>(blockIdx.x) * \
            static_cast<int64_t>(CAFFE_CUDA_NUM_THREADS) + \
        static_cast<int64_t>(threadIdx.x); \
    if (index < N) { \
      Y[index] = FMAFunc( \
          static_cast<TData>(alpha), \
          X[index], \
          static_cast<TData>(beta) * Y[index]); \
    } \
  } \
  template <> \
  __global__ void AxpbyCUDAKernel<TAlpha, TData>( \
      const std::int64_t N, \
      const TAlpha* alpha, \
      const TData* X, \
      const TAlpha* beta, \
      TData* Y) { \
    __shared__ TData a; \
    __shared__ TData b; \
    if (threadIdx.x == 0) { \
      a = static_cast<TData>(*alpha); \
      b = static_cast<TData>(*beta); \
    } \
    __syncthreads(); \
    const int64_t index = static_cast<int64_t>(blockIdx.x) * \
            static_cast<int64_t>(CAFFE_CUDA_NUM_THREADS) + \
        static_cast<int64_t>(threadIdx.x); \
    if (index < N) { \
      Y[index] = FMAFunc(a, X[index], b * Y[index]); \
    } \
  }
DELEGATE_AXPBY_CUDA_KERNEL(float, float, fmaf)
DELEGATE_AXPBY_CUDA_KERNEL(float, double, fma)
#undef DELEGATE_AXPBY_CUDA_KERNEL

#define DELEGATE_HALF_AXPBY_CUDA_KERNEL(TAlpha, FMAFunc) \
  template <> \
  __global__ void AxpbyCUDAKernel<TAlpha, at::Half>( \
      const std::int64_t N, \
      const TAlpha alpha, \
      const at::Half* X, \
      const TAlpha beta, \
      at::Half* Y) { \
    const int64_t index = static_cast<int64_t>(blockIdx.x) * \
            static_cast<int64_t>(CAFFE_CUDA_NUM_THREADS) + \
        static_cast<int64_t>(threadIdx.x); \
    if (index < N) { \
      Y[index] = convert::To<TAlpha, at::Half>(FMAFunc( \
          alpha, \
          convert::To<at::Half, TAlpha>(X[index]), \
          beta * convert::To<at::Half, TAlpha>(Y[index]))); \
    } \
  } \
  template <> \
  __global__ void AxpbyCUDAKernel<TAlpha, at::Half>( \
      const std::int64_t N, \
      const TAlpha* alpha, \
      const at::Half* X, \
      const TAlpha* beta, \
      at::Half* Y) { \
    __shared__ TAlpha a; \
    __shared__ TAlpha b; \
    if (threadIdx.x == 0) { \
      a = *alpha; \
      b = *beta; \
    } \
    __syncthreads(); \
    const int64_t index = static_cast<int64_t>(blockIdx.x) * \
            static_cast<int64_t>(CAFFE_CUDA_NUM_THREADS) + \
        static_cast<int64_t>(threadIdx.x); \
    if (index < N) { \
      Y[index] = convert::To<TAlpha, at::Half>(FMAFunc( \
          a, \
          convert::To<at::Half, TAlpha>(X[index]), \
          b * convert::To<at::Half, TAlpha>(Y[index]))); \
    } \
  }
DELEGATE_HALF_AXPBY_CUDA_KERNEL(float, fmaf)
#undef DELEGATE_HALF_AXPBY_CUDA_KERNEL

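// Scale: Y = alpha * X, again with one overload for a host-scalar alpha and
// one for a device-pointer alpha (broadcast through shared memory). These
// kernels are the fallback used whenever cuBLAS cannot be applied, e.g. for
// out-of-place scaling or integer types.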
template <typename TAlpha, typename TData>
__global__ void ScaleCUDAKernel(
    const std::int64_t N,
    const TAlpha alpha,
    const TData* X,
    TData* Y);

template <typename TAlpha, typename TData>
__global__ void ScaleCUDAKernel(
    const std::int64_t N,
    const TAlpha* alpha,
    const TData* X,
    TData* Y);

#define CAFFE2_SPECIALIZED_SCALE_CUDA_KERNEL(TAlpha, TData) \
  template <> \
  __global__ void ScaleCUDAKernel<TAlpha, TData>( \
      const std::int64_t N, const TAlpha alpha, const TData* X, TData* Y) { \
    const int64_t index = static_cast<int64_t>(blockIdx.x) * \
            static_cast<int64_t>(CAFFE_CUDA_NUM_THREADS) + \
        static_cast<int64_t>(threadIdx.x); \
    if (index < N) { \
      Y[index] = static_cast<TData>(alpha) * X[index]; \
    } \
  } \
  template <> \
  __global__ void ScaleCUDAKernel<TAlpha, TData>( \
      const std::int64_t N, const TAlpha* alpha, const TData* X, TData* Y) { \
    __shared__ TData a; \
    if (threadIdx.x == 0) { \
      a = static_cast<TData>(*alpha); \
    } \
    __syncthreads(); \
    const int64_t index = static_cast<int64_t>(blockIdx.x) * \
            static_cast<int64_t>(CAFFE_CUDA_NUM_THREADS) + \
        static_cast<int64_t>(threadIdx.x); \
    if (index < N) { \
      Y[index] = a * X[index]; \
    } \
  }
CAFFE2_SPECIALIZED_SCALE_CUDA_KERNEL(float, float)
CAFFE2_SPECIALIZED_SCALE_CUDA_KERNEL(double, double)
CAFFE2_SPECIALIZED_SCALE_CUDA_KERNEL(float, double)
CAFFE2_SPECIALIZED_SCALE_CUDA_KERNEL(std::int32_t, std::int32_t)
CAFFE2_SPECIALIZED_SCALE_CUDA_KERNEL(std::int64_t, std::int64_t)
#undef CAFFE2_SPECIALIZED_SCALE_CUDA_KERNEL

#define CAFFE2_SPECIALIZED_HALF_SCALE_CUDA_KERNEL(TAlpha) \
  template <> \
  __global__ void ScaleCUDAKernel<TAlpha, at::Half>( \
      const std::int64_t N, \
      const TAlpha alpha, \
      const at::Half* X, \
      at::Half* Y) { \
    const int64_t index = static_cast<int64_t>(blockIdx.x) * \
            static_cast<int64_t>(CAFFE_CUDA_NUM_THREADS) + \
        static_cast<int64_t>(threadIdx.x); \
    if (index < N) { \
      Y[index] = convert::To<TAlpha, at::Half>( \
          alpha * convert::To<at::Half, TAlpha>(X[index])); \
    } \
  } \
  template <> \
  __global__ void ScaleCUDAKernel<TAlpha, at::Half>( \
      const std::int64_t N, \
      const TAlpha* alpha, \
      const at::Half* X, \
      at::Half* Y) { \
    __shared__ TAlpha a; \
    if (threadIdx.x == 0) { \
      a = *alpha; \
    } \
    __syncthreads(); \
    const int64_t index = static_cast<int64_t>(blockIdx.x) * \
            static_cast<int64_t>(CAFFE_CUDA_NUM_THREADS) + \
        static_cast<int64_t>(threadIdx.x); \
    if (index < N) { \
      Y[index] = convert::To<TAlpha, at::Half>( \
          a * convert::To<at::Half, TAlpha>(X[index])); \
    } \
  }
CAFFE2_SPECIALIZED_HALF_SCALE_CUDA_KERNEL(float)
#undef CAFFE2_SPECIALIZED_HALF_SCALE_CUDA_KERNEL

} // namespace

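// Set fills Y with a constant. Zero is handled with cudaMemsetAsync (all of
// the instantiated types represent 0 as an all-zero bit pattern); any other
// value falls back to thrust::fill on the context's stream. Typical
// host-side use, with illustrative buffer/context names:
//   math::Set<float, CUDAContext>(N, 1.0f, data, &context);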
#define CAFFE2_SPECIALIZED_CUDA_SET(T) \
  template <> \
  CAFFE2_CUDA_EXPORT void Set<T, CUDAContext>( \
      const std::int64_t N, const T alpha, T* Y, CUDAContext* context) { \
    if (N == 0) { \
      return; \
    } \
    if (alpha == T(0)) { \
      cudaMemsetAsync(Y, 0, sizeof(T) * N, context->cuda_stream()); \
    } else { \
      thrust::fill( \
          thrust::cuda::par.on(context->cuda_stream()), Y, Y + N, alpha); \
    } \
  }
CAFFE2_SPECIALIZED_CUDA_SET(bool)
CAFFE2_SPECIALIZED_CUDA_SET(char)
CAFFE2_SPECIALIZED_CUDA_SET(std::int8_t)
CAFFE2_SPECIALIZED_CUDA_SET(std::int16_t)
CAFFE2_SPECIALIZED_CUDA_SET(std::int32_t)
CAFFE2_SPECIALIZED_CUDA_SET(std::int64_t)
CAFFE2_SPECIALIZED_CUDA_SET(std::uint8_t)
CAFFE2_SPECIALIZED_CUDA_SET(std::uint16_t)
CAFFE2_SPECIALIZED_CUDA_SET(float)
CAFFE2_SPECIALIZED_CUDA_SET(double)
CAFFE2_SPECIALIZED_CUDA_SET(at::Half)
CAFFE2_SPECIALIZED_CUDA_SET(at::BFloat16)
#undef CAFFE2_SPECIALIZED_CUDA_SET

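// Element-wise unary functions (Exp, Log, trig, Abs, Sqrt, Neg, Sign, ...)
// are all expressed as a thrust::transform on the context's stream with a
// device lambda wrapping the underlying device function; the call is a no-op
// when N <= 0.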
#define DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(T, Func, DeviceFunc) \
  template <> \
  CAFFE2_CUDA_EXPORT void Func<T, CUDAContext>( \
      const int N, const T* X, T* Y, CUDAContext* context) { \
    if (N > 0) { \
      thrust::transform( \
          thrust::cuda::par.on(context->cuda_stream()), \
          X, \
          X + N, \
          Y, \
          [] __device__(const T x) { return DeviceFunc(x); }); \
    } \
  }
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Exp, expf)
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Log, logf)
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sin, sinf)
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Asin, asinf)
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Cos, cosf)
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Acos, acosf)
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Tan, tanf)
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Atan, atanf)
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sinh, sinhf)
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Cosh, coshf)
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Tanh, tanhf)
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Abs, fabsf)
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Inv, utils::Inv<float>)
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(double, Inv, utils::Inv<double>)
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sqr, utils::Square<float>)
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sqrt, sqrtf)
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Rsqrt, rsqrtf)
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(
    std::int32_t,
    Cube,
    utils::Cube<std::int32_t>)
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(
    std::int64_t,
    Cube,
    utils::Cube<std::int64_t>)
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Cube, utils::Cube<float>)
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(double, Cube, utils::Cube<double>)
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Cbrt, cbrtf)
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Erf, erff)
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(double, Erf, erf)
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, CdfNorm, normcdff)
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(double, CdfNorm, normcdf)
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(bool, Not, utils::Not<bool>)
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(
    std::int32_t,
    Neg,
    utils::Negate<std::int32_t>)
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(
    std::int64_t,
    Neg,
    utils::Negate<std::int64_t>)
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Neg, utils::Negate<float>)
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(double, Neg, utils::Negate<double>)
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(
    std::int32_t,
    Sign,
    utils::Sign<std::int32_t>)
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(
    std::int64_t,
    Sign,
    utils::Sign<std::int64_t>)
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sign, utils::Sign<float>)
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(double, Sign, utils::Sign<double>)
#undef DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION

#define DELEGATE_CUDA_POWX(T, DeviceFunc) \
  template <> \
  CAFFE2_CUDA_EXPORT void Powx<T, CUDAContext>( \
      const int N, const T* A, const T b, T* Y, CUDAContext* context) { \
    thrust::transform( \
        thrust::cuda::par.on(context->cuda_stream()), \
        A, \
        A + N, \
        Y, \
        [b] __device__(const T x) { return DeviceFunc(x, b); }); \
  }
DELEGATE_CUDA_POWX(float, powf)
#undef DELEGATE_CUDA_POWX

#define CAFFE2_SPECIALIZED_CUDA_SINCOS(T) \
  template <> \
  CAFFE2_CUDA_EXPORT void SinCos<T, CUDAContext>( \
      const int N, const T* X, T* S, T* C, CUDAContext* context) { \
    if (N > 0) { \
      const int K = DivUp(N, CAFFE_CUDA_NUM_THREADS); \
      SinCosCUDAKernel<T> \
          <<<K, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>( \
              N, X, S, C); \
    } \
  }
CAFFE2_SPECIALIZED_CUDA_SINCOS(float)
CAFFE2_SPECIALIZED_CUDA_SINCOS(double)
#undef CAFFE2_SPECIALIZED_CUDA_SINCOS

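// Same-type Scale: when the operation is in place (Y == X) it is delegated
// to cuBLAS *scal with the pointer mode matching where alpha lives (host
// scalar vs. device pointer); the out-of-place case launches ScaleCUDAKernel
// instead, since the BLAS scal routine only works in place. A typical
// host-side call, with illustrative names:
//   math::Scale<float, float, CUDAContext>(N, 0.5f, src, dst, &context);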
#define DELEGATE_CUDA_SCALE(T, CuBLASFunc) \
  template <> \
  CAFFE2_CUDA_EXPORT void Scale<T, T, CUDAContext>( \
      const std::int64_t N, \
      const T alpha, \
      const T* X, \
      T* Y, \
      CUDAContext* context) { \
    if (N == 0) { \
      return; \
    } \
    if (Y == X) { \
      CUBLAS_ENFORCE(cublasSetPointerMode( \
          context->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); \
      CUBLAS_ENFORCE(CuBLASFunc(context->cublas_handle(), N, &alpha, Y, 1)); \
    } else { \
      const std::int64_t M = DivUp<std::int64_t>(N, CAFFE_CUDA_NUM_THREADS); \
      ScaleCUDAKernel<T, T> \
          <<<M, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>( \
              N, alpha, X, Y); \
    } \
  } \
  template <> \
  CAFFE2_CUDA_EXPORT void Scale<T, T, CUDAContext>( \
      const std::int64_t N, \
      const T* alpha, \
      const T* X, \
      T* Y, \
      CUDAContext* context) { \
    if (N == 0) { \
      return; \
    } \
    if (Y == X) { \
      CUBLAS_ENFORCE(cublasSetPointerMode( \
          context->cublas_handle(), CUBLAS_POINTER_MODE_DEVICE)); \
      CUBLAS_ENFORCE(CuBLASFunc(context->cublas_handle(), N, alpha, Y, 1)); \
    } else { \
      const std::int64_t M = DivUp<std::int64_t>(N, CAFFE_CUDA_NUM_THREADS); \
      ScaleCUDAKernel<T, T> \
          <<<M, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>( \
              N, alpha, X, Y); \
    } \
  }
DELEGATE_CUDA_SCALE(float, cublasSscal)
DELEGATE_CUDA_SCALE(double, cublasDscal)
#undef DELEGATE_CUDA_SCALE

#ifndef __HIP_PLATFORM_HCC__

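// Mixed-precision Scale (CUDA only): the in-place path uses cublasScalEx
// with explicit alpha, data, and execution types; the out-of-place path
// reuses ScaleCUDAKernel. HIP builds get the plain-kernel fallback defined
// after the #endif below.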
#define DELEGATE_CUDA_SCALE_EX( \
    TAlpha, TData, kAlphaType, kDataType, kExecutionType) \
  template <> \
  CAFFE2_CUDA_EXPORT void Scale<TAlpha, TData, CUDAContext>( \
      const std::int64_t N, \
      const TAlpha alpha, \
      const TData* X, \
      TData* Y, \
      CUDAContext* context) { \
    if (N == 0) { \
      return; \
    } \
    if (Y == X) { \
      CUBLAS_ENFORCE(cublasSetPointerMode( \
          context->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); \
      CUBLAS_ENFORCE(cublasScalEx( \
          context->cublas_handle(), \
          N, \
          &alpha, \
          kAlphaType, \
          Y, \
          kDataType, \
          1, \
          kExecutionType)); \
    } else { \
      const std::int64_t M = DivUp<std::int64_t>(N, CAFFE_CUDA_NUM_THREADS); \
      ScaleCUDAKernel<TAlpha, TData> \
          <<<M, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>( \
              N, alpha, X, Y); \
    } \
  } \
  template <> \
  CAFFE2_CUDA_EXPORT void Scale<TAlpha, TData, CUDAContext>( \
      const std::int64_t N, \
      const TAlpha* alpha, \
      const TData* X, \
      TData* Y, \
      CUDAContext* context) { \
    if (N == 0) { \
      return; \
    } \
    if (Y == X) { \
      CUBLAS_ENFORCE(cublasSetPointerMode( \
          context->cublas_handle(), CUBLAS_POINTER_MODE_DEVICE)); \
      CUBLAS_ENFORCE(cublasScalEx( \
          context->cublas_handle(), \
          N, \
          alpha, \
          kAlphaType, \
          Y, \
          kDataType, \
          1, \
          kExecutionType)); \
    } else { \
      const std::int64_t M = DivUp<std::int64_t>(N, CAFFE_CUDA_NUM_THREADS); \
      ScaleCUDAKernel<TAlpha, TData> \
          <<<M, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>( \
              N, alpha, X, Y); \
    } \
  }
DELEGATE_CUDA_SCALE_EX(float, double, CUDA_R_32F, CUDA_R_64F, CUDA_R_64F)
DELEGATE_CUDA_SCALE_EX(float, at::Half, CUDA_R_32F, CUDA_R_16F, CUDA_R_32F)
#undef DELEGATE_CUDA_SCALE_EX

#endif // __HIP_PLATFORM_HCC__

#define CAFFE2_SPECIALIZED_CUDA_SCALE(TAlpha, TData) \
  template <> \
  CAFFE2_CUDA_EXPORT void Scale<TAlpha, TData, CUDAContext>( \
      const std::int64_t N, \
      const TAlpha alpha, \
      const TData* X, \
      TData* Y, \
      CUDAContext* context) { \
    if (N > 0) { \
      const std::int64_t M = DivUp<std::int64_t>(N, CAFFE_CUDA_NUM_THREADS); \
      ScaleCUDAKernel<TAlpha, TData> \
          <<<M, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>( \
              N, alpha, X, Y); \
    } \
  } \
  template <> \
  CAFFE2_CUDA_EXPORT void Scale<TAlpha, TData, CUDAContext>( \
      const std::int64_t N, \
      const TAlpha* alpha, \
      const TData* X, \
      TData* Y, \
      CUDAContext* context) { \
    if (N > 0) { \
      const std::int64_t M = DivUp<std::int64_t>(N, CAFFE_CUDA_NUM_THREADS); \
      ScaleCUDAKernel<TAlpha, TData> \
          <<<M, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>( \
              N, *alpha, X, Y); \
    } \
  }
CAFFE2_SPECIALIZED_CUDA_SCALE(std::int32_t, std::int32_t)
CAFFE2_SPECIALIZED_CUDA_SCALE(std::int64_t, std::int64_t)

#ifdef __HIP_PLATFORM_HCC__
CAFFE2_SPECIALIZED_CUDA_SCALE(float, double)
CAFFE2_SPECIALIZED_CUDA_SCALE(float, at::Half)
#endif // __HIP_PLATFORM_HCC__
#undef CAFFE2_SPECIALIZED_CUDA_SCALE

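// Element-wise binary functions (Add/Sub/Mul/Div, Min/Max, logical and
// bitwise ops) are thrust::transform calls with the matching functor;
// at::Half arithmetic goes through the Half*Functor helpers from
// half_utils.h.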
#define DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(T, Func, DeviceFunc) \
  template <> \
  CAFFE2_CUDA_EXPORT void Func<T, CUDAContext>( \
      const int N, const T* A, const T* B, T* C, CUDAContext* context) { \
    if (N > 0) { \
      thrust::transform( \
          thrust::cuda::par.on(context->cuda_stream()), \
          A, \
          A + N, \
          B, \
          C, \
          DeviceFunc); \
    } \
  }
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
    std::int32_t,
    Add,
    thrust::plus<std::int32_t>())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
    std::int64_t,
    Add,
    thrust::plus<std::int64_t>())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float, Add, thrust::plus<float>())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(double, Add, thrust::plus<double>())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(at::Half, Add, utils::HalfAddFunctor())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
    std::int32_t,
    Sub,
    thrust::minus<std::int32_t>())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
    std::int64_t,
    Sub,
    thrust::minus<std::int64_t>())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float, Sub, thrust::minus<float>())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(double, Sub, thrust::minus<double>())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(at::Half, Sub, utils::HalfSubFunctor())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
    std::int32_t,
    Mul,
    thrust::multiplies<std::int32_t>())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
    std::int64_t,
    Mul,
    thrust::multiplies<std::int64_t>())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float, Mul, thrust::multiplies<float>())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(double, Mul, thrust::multiplies<double>())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(at::Half, Mul, utils::HalfMulFunctor())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
    std::int32_t,
    Div,
    thrust::divides<std::int32_t>())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
    std::int64_t,
    Div,
    thrust::divides<std::int64_t>())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float, Div, thrust::divides<float>())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(double, Div, thrust::divides<double>())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(at::Half, Div, utils::HalfDivFunctor())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float, Min, thrust::minimum<float>())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(double, Min, thrust::minimum<double>())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float, Max, thrust::maximum<float>())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(double, Max, thrust::maximum<double>())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, And, thrust::logical_and<bool>())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, Or, thrust::logical_or<bool>())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, Xor, thrust::bit_xor<bool>())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, BitwiseAnd, thrust::bit_and<bool>())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
    std::int32_t,
    BitwiseAnd,
    thrust::bit_and<std::int32_t>())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
    std::int64_t,
    BitwiseAnd,
    thrust::bit_and<std::int64_t>())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, BitwiseOr, thrust::bit_or<bool>())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
    std::int32_t,
    BitwiseOr,
    thrust::bit_or<std::int32_t>())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
    std::int64_t,
    BitwiseOr,
    thrust::bit_or<std::int64_t>())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, BitwiseXor, thrust::bit_xor<bool>())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
    std::int32_t,
    BitwiseXor,
    thrust::bit_xor<std::int32_t>())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
    std::int64_t,
    BitwiseXor,
    thrust::bit_xor<std::int64_t>())
#undef DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION

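// Element-wise comparisons (EQ, NE, LT, LE, GT, GE) follow the same
// thrust::transform pattern but write their results to a bool* output.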
#define DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(T, Func, DeviceComp) \
  template <> \
  CAFFE2_CUDA_EXPORT void Func<T, CUDAContext>( \
      const int N, const T* A, const T* B, bool* C, CUDAContext* context) { \
    if (N > 0) { \
      thrust::transform( \
          thrust::cuda::par.on(context->cuda_stream()), \
          A, \
          A + N, \
          B, \
          C, \
          DeviceComp); \
    } \
  }
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(bool, EQ, thrust::equal_to<bool>())
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
    std::int32_t,
    EQ,
    thrust::equal_to<std::int32_t>())
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
    std::int64_t,
    EQ,
    thrust::equal_to<std::int64_t>())
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(float, EQ, thrust::equal_to<float>())
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(double, EQ, thrust::equal_to<double>())
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(bool, NE, thrust::not_equal_to<bool>())
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
    std::int32_t,
    NE,
    thrust::not_equal_to<std::int32_t>())
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
    std::int64_t,
    NE,
    thrust::not_equal_to<std::int64_t>())
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(float, NE, thrust::not_equal_to<float>())
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
    double,
    NE,
    thrust::not_equal_to<double>())
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(bool, LT, thrust::less<bool>())
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
    std::int32_t,
    LT,
    thrust::less<std::int32_t>())
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
    std::int64_t,
    LT,
    thrust::less<std::int64_t>())
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(float, LT, thrust::less<float>())
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(double, LT, thrust::less<double>())
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(bool, LE, thrust::less_equal<bool>())
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
    std::int32_t,
    LE,
    thrust::less_equal<std::int32_t>())
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
    std::int64_t,
    LE,
    thrust::less_equal<std::int64_t>())
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(float, LE, thrust::less_equal<float>())
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(double, LE, thrust::less_equal<double>())
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(bool, GT, thrust::greater<bool>())
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
    std::int32_t,
    GT,
    thrust::greater<std::int32_t>())
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
    std::int64_t,
    GT,
    thrust::greater<std::int64_t>())
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(float, GT, thrust::greater<float>())
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(double, GT, thrust::greater<double>())
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(bool, GE, thrust::greater_equal<bool>())
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
    std::int32_t,
    GE,
    thrust::greater_equal<std::int32_t>())
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
    std::int64_t,
    GE,
    thrust::greater_equal<std::int64_t>())
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(float, GE, thrust::greater_equal<float>())
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
    double,
    GE,
    thrust::greater_equal<double>())
#undef DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION

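// Axpy: Y += alpha * X. The same-type float version goes through cuBLAS
// axpy, switching the pointer mode depending on whether alpha is a host
// scalar or a device pointer. A typical host-side call, with illustrative
// names:
//   math::Axpy<float, float, CUDAContext>(N, 2.0f, x, y, &context);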
#define DELEGATE_CUDA_AXPY(T, CuBLASFunc) \
  template <> \
  CAFFE2_CUDA_EXPORT void Axpy<T, T, CUDAContext>( \
      const std::int64_t N, \
      const T alpha, \
      const T* X, \
      T* Y, \
      CUDAContext* context) { \
    CUBLAS_ENFORCE(cublasSetPointerMode( \
        context->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); \
    CUBLAS_ENFORCE( \
        CuBLASFunc(context->cublas_handle(), N, &alpha, X, 1, Y, 1)); \
  } \
  template <> \
  CAFFE2_CUDA_EXPORT void Axpy<T, T, CUDAContext>( \
      const std::int64_t N, \
      const T* alpha, \
      const T* X, \
      T* Y, \
      CUDAContext* context) { \
    CUBLAS_ENFORCE(cublasSetPointerMode( \
        context->cublas_handle(), CUBLAS_POINTER_MODE_DEVICE)); \
    CUBLAS_ENFORCE( \
        cublasSaxpy(context->cublas_handle(), N, alpha, X, 1, Y, 1)); \
  }
DELEGATE_CUDA_AXPY(float, cublasSaxpy)
#undef DELEGATE_CUDA_AXPY

#ifndef __HIP_PLATFORM_HCC__

#define DELEGATE_CUDA_AXPY_EX( \
    TAlpha, TData, kAlphaType, kDataType, kExecutionType) \
  template <> \
  CAFFE2_CUDA_EXPORT void Axpy<TAlpha, TData, CUDAContext>( \
      const std::int64_t N, \
      const TAlpha alpha, \
      const TData* X, \
      TData* Y, \
      CUDAContext* context) { \
    CUBLAS_ENFORCE(cublasSetPointerMode( \
        context->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); \
    CUBLAS_ENFORCE(cublasAxpyEx( \
        context->cublas_handle(), \
        N, \
        &alpha, \
        kAlphaType, \
        X, \
        kDataType, \
        1, \
        Y, \
        kDataType, \
        1, \
        kExecutionType)); \
  } \
  template <> \
  CAFFE2_CUDA_EXPORT void Axpy<TAlpha, TData, CUDAContext>( \
      const std::int64_t N, \
      const TAlpha* alpha, \
      const TData* X, \
      TData* Y, \
      CUDAContext* context) { \
    CUBLAS_ENFORCE(cublasSetPointerMode( \
        context->cublas_handle(), CUBLAS_POINTER_MODE_DEVICE)); \
    CUBLAS_ENFORCE(cublasAxpyEx( \
        context->cublas_handle(), \
        N, \
        alpha, \
        kAlphaType, \
        X, \
        kDataType, \
        1, \
        Y, \
        kDataType, \
        1, \
        kExecutionType)); \
  }
DELEGATE_CUDA_AXPY_EX(float, double, CUDA_R_32F, CUDA_R_64F, CUDA_R_64F)
DELEGATE_CUDA_AXPY_EX(float, at::Half, CUDA_R_32F, CUDA_R_16F, CUDA_R_32F)
#undef DELEGATE_CUDA_AXPY_EX

#else // __HIP_PLATFORM_HCC__

#define CAFFE2_SPECIALIZED_CUDA_AXPY(TAlpha, TData) \
  template <> \
  CAFFE2_CUDA_EXPORT void Axpy<TAlpha, TData, CUDAContext>( \
      const std::int64_t N, \
      const TAlpha alpha, \
      const TData* X, \
      TData* Y, \
      CUDAContext* context) { \
    const std::int64_t M = DivUp<std::int64_t>(N, CAFFE_CUDA_NUM_THREADS); \
    AxpyCUDAKernel<TAlpha, TData> \
        <<<M, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>( \
            N, alpha, X, Y); \
  } \
  template <> \
  CAFFE2_CUDA_EXPORT void Axpy<TAlpha, TData, CUDAContext>( \
      const std::int64_t N, \
      const TAlpha* alpha, \
      const TData* X, \
      TData* Y, \
      CUDAContext* context) { \
    const std::int64_t M = DivUp<std::int64_t>(N, CAFFE_CUDA_NUM_THREADS); \
    AxpyCUDAKernel<TAlpha, TData> \
        <<<M, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>( \
            N, alpha, X, Y); \
  }
CAFFE2_SPECIALIZED_CUDA_AXPY(float, double)
CAFFE2_SPECIALIZED_CUDA_AXPY(float, at::Half)
#undef CAFFE2_SPECIALIZED_CUDA_AXPY

#endif // __HIP_PLATFORM_HCC__

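// Axpby always dispatches to AxpbyCUDAKernel; the device-pointer overload
// forwards the alpha and beta pointers directly to the kernel, which
// broadcasts them through shared memory.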
#define CAFFE2_SPECIALIZED_CUDA_AXPBY(TAlpha, TData) \
  template <> \
  CAFFE2_CUDA_EXPORT void Axpby<TAlpha, TData, CUDAContext>( \
      const std::int64_t N, \
      const TAlpha alpha, \
      const TData* X, \
      const TAlpha beta, \
      TData* Y, \
      CUDAContext* context) { \
    const std::int64_t M = DivUp<std::int64_t>(N, CAFFE_CUDA_NUM_THREADS); \
    AxpbyCUDAKernel<TAlpha, TData> \
        <<<M, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>( \
            N, alpha, X, beta, Y); \
  } \
  template <> \
  CAFFE2_CUDA_EXPORT void Axpby<TAlpha, TData, CUDAContext>( \
      const std::int64_t N, \
      const TAlpha* alpha, \
      const TData* X, \
      const TAlpha* beta, \
      TData* Y, \
      CUDAContext* context) { \
    const std::int64_t M = DivUp<std::int64_t>(N, CAFFE_CUDA_NUM_THREADS); \
    AxpbyCUDAKernel<TAlpha, TData> \
        <<<M, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>( \
            N, alpha, X, beta, Y); \
  }
CAFFE2_SPECIALIZED_CUDA_AXPBY(float, float)
CAFFE2_SPECIALIZED_CUDA_AXPBY(float, double)
CAFFE2_SPECIALIZED_CUDA_AXPBY(float, at::Half)
#undef CAFFE2_SPECIALIZED_CUDA_AXPBY

} // namespace math
} // namespace caffe2