Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/52240

Test Plan: Sandcastle tests

Reviewed By: xush6528

Differential Revision: D26408330

fbshipit-source-id: 60779ba0e38c8f90e0e341c8faa2661e631112dd
#include "caffe2/operators/half_float_ops.h"
|
|
|
|
#include "caffe2/core/context_gpu.h"
|
|
|
|
#ifdef CAFFE_HAS_CUDA_FP16
|
|
|
|
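// CAFFE_HAS_CUDA_FP16 guards the entire translation unit: when the build
// lacks native CUDA half support the macro is undefined and none of these
// CUDA ops are compiled or registered. (Hedged description; the defining
// condition lives in the caffe2 CUDA headers, e.g. caffe2/core/common_gpu.h.)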
namespace caffe2 {

namespace {

// Each thread converts a strided subset of the N input floats to half.
__global__ void FloatToHalfKernel(const int N, const float* X, half* Y) {
  CUDA_1D_KERNEL_LOOP(i, N) {
    Y[i] = __float2half(X[i]);
  }
}

// The inverse conversion, half back to float, over the same layout.
__global__ void HalfToFloatKernel(const int N, const half* X, float* Y) {
  CUDA_1D_KERNEL_LOOP(i, N) {
    Y[i] = __half2float(X[i]);
  }
}

} // namespace
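// CUDA_1D_KERNEL_LOOP is caffe2's grid-stride loop macro. As a sketch of
// its expansion (the canonical definition is in caffe2/core/common_gpu.h):
//
//   for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n);
//        i += blockDim.x * gridDim.x)
//
// Each thread therefore strides across the tensor, so the kernels above
// cover all N elements even when the launch grid is smaller than N.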
template <>
bool FloatToHalfOp<CUDAContext>::RunOnDevice() {
  auto& X = Input(0);

  // Allocate a half-precision output with the input's shape, then convert
  // the whole tensor in a single kernel launch.
  auto* Y = Output(0, X.sizes(), at::dtype<at::Half>());
  FloatToHalfKernel<<<
      CAFFE_GET_BLOCKS(X.numel()),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(
      X.numel(),
      X.data<float>(),
      reinterpret_cast<half*>(Y->template mutable_data<at::Half>()));
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  return true;
}
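// Launch-shape note: CAFFE_CUDA_NUM_THREADS is caffe2's fixed block size,
// and CAFFE_GET_BLOCKS(n) requests roughly ceil(n / block size) blocks,
// clamped to a maximum grid size (a sketch of the helpers in
// caffe2/core/common_gpu.h; exact constants vary by version). Any
// shortfall is absorbed by the grid-stride loop inside the kernel.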
template <>
bool HalfToFloatOp<CUDAContext>::RunOnDevice() {
  auto& X = Input(0);

  // Mirror image of FloatToHalfOp: a float output of the same shape,
  // filled by one HalfToFloatKernel launch.
  auto* Y = Output(0, X.sizes(), at::dtype<float>());
  HalfToFloatKernel<<<
      CAFFE_GET_BLOCKS(X.numel()),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(
      X.numel(),
      reinterpret_cast<const half*>(X.data<at::Half>()),
      Y->template mutable_data<float>());
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  return true;
}
template <>
bool Float16UniformFillOp<CUDAContext>::RunOnDevice() {
  auto* output = Output(0, shape_, at::dtype<at::Half>());
  at::Half* out = output->template mutable_data<at::Half>();

  auto leading_dim_sz = output->size(0);
  CAFFE_ENFORCE_GT(
      leading_dim_sz,
      0,
      "The input shape should have the first dimension greater than 0");
  int rowsz = output->numel() / output->size(0);

  // Staging buffer: uniform samples are drawn in float, one row at a
  // time, then converted into the corresponding half-precision slice.
  ReinitializeTensor(
      &temp_data_buffer_, {rowsz}, at::dtype<float>().device(CUDA));
  float* temp_data = temp_data_buffer_.template mutable_data<float>();

  for (uint64_t i = 0; i < leading_dim_sz; i++) {
    math::RandUniform<float, CUDAContext>(
        rowsz, min_, max_, temp_data, &context_);

    FloatToHalfKernel<<<
        CAFFE_GET_BLOCKS(rowsz),
        CAFFE_CUDA_NUM_THREADS,
        0,
        context_.cuda_stream()>>>(
        rowsz,
        temp_data,
        reinterpret_cast<half*>(out + i * rowsz));
    C10_CUDA_KERNEL_LAUNCH_CHECK();
  }

  return true;
}
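// Design note on Float16UniformFill (a plausible reading of the code
// above, not a documented rationale): sampling in float and converting
// row by row keeps the float staging buffer at one row (rowsz elements)
// instead of the full tensor, trading extra kernel launches for memory;
// caffe2's math::RandUniform generates float values, so a direct
// half-precision fill is not available.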
REGISTER_CUDA_OPERATOR(FloatToHalf, FloatToHalfOp<CUDAContext>);
REGISTER_CUDA_OPERATOR(HalfToFloat, HalfToFloatOp<CUDAContext>);
REGISTER_CUDA_OPERATOR(Float16UniformFill, Float16UniformFillOp<CUDAContext>);

} // namespace caffe2

#endif // CAFFE_HAS_CUDA_FP16
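// Minimal usage sketch (hedged: this uses the generic caffe2 C++ frontend,
// nothing defined in this file; a CUDA-enabled build and a float32 CUDA
// tensor already fed into blob "X" are assumed):
//
//   #include "caffe2/core/operator.h"
//   #include "caffe2/core/workspace.h"
//
//   caffe2::Workspace ws;
//   // ... feed a float32 CUDA tensor into blob "X" ...
//   caffe2::OperatorDef def;
//   def.set_type("FloatToHalf");
//   def.add_input("X");
//   def.add_output("Y");
//   def.mutable_device_option()->set_device_type(caffe2::PROTO_CUDA);
//   std::unique_ptr<caffe2::OperatorBase> op =
//       caffe2::CreateOperator(def, &ws);
//   op->Run();  // "Y" now holds the half-precision copy of "X".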