#include "caffe2/operators/utility_ops.h"
|
|
|
|
#include <thrust/device_vector.h>
|
|
#include <thrust/sequence.h>
|
|
#include <thrust/sort.h>
|
|
#include <thrust/system/cuda/execution_policy.h>
|
|
#include <thrust/unique.h>
|
|
|
|
#include "caffe2/core/context_gpu.h"
|
|
#include "caffe2/operators/flatten_op.h"
|
|
#include "caffe2/utils/math.h"
|
|
|
|
namespace caffe2 {
|
|
|
|
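// CUDA implementations of the utility operators. WeightedSum and Sum below
// dispatch on the element type of their first input and currently support
// float and at::Half only; any other type raises "Unsupported inputs".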
template <>
bool WeightedSumOp<CUDAContext>::RunOnDevice() {
  if (Input(0).IsType<float>()) {
    return DoRunWithType<float>();
  } else if (Input(0).IsType<at::Half>()) {
    return DoRunWithType<at::Half>();
  } else {
    CAFFE_THROW("Unsupported inputs");
  }
  return false;
}

template <>
bool SumOp<CUDAContext>::RunOnDevice() {
  if (Input(0).IsType<float>()) {
    return DoRunWithType<float, float>();
  } else if (Input(0).IsType<at::Half>()) {
    return DoRunWithType<at::Half, at::Half>();
  } else {
    CAFFE_THROW("Unsupported inputs");
  }
  return false;
}

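// Register the CUDA implementations under the same operator names as their
// CPU counterparts.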
REGISTER_CUDA_OPERATOR(Print, PrintOp<CUDAContext>);
REGISTER_CUDA_OPERATOR(Flatten, FlattenOp<CUDAContext>);
REGISTER_CUDA_OPERATOR(FlattenToVec, FlattenToVecOp<CUDAContext>);
REGISTER_CUDA_OPERATOR(Alias, AliasOp<CUDAContext>);
REGISTER_CUDA_OPERATOR(ResizeLike, ResizeLikeOp<CUDAContext>);
REGISTER_CUDA_OPERATOR(Sum, SumOp<CUDAContext>);
REGISTER_CUDA_OPERATOR(WeightedSum, WeightedSumOp<CUDAContext>);

CAFFE_KNOWN_TYPE(const float*);

REGISTER_CUDA_OPERATOR(EnsureDense, EnsureDenseOp<CUDAContext>);

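// NanCheckKernel: each thread strides over the input, ORs together
// isnan/isinf of the elements it visits, and sets result[0] to true if it saw
// any bad value. result[0] is expected to be initialized to false by the
// caller.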
__global__ void NanCheckKernel(int N, const float* X, bool* result) {
  bool has_nan = false;
  CUDA_1D_KERNEL_LOOP(i, N) {
    // Note: there is no need to return early. Only the failing case would
    // benefit from inspecting fewer elements, and the failing case is not
    // worth optimizing.
    has_nan = has_nan || isnan(X[i]) || isinf(X[i]);
  }
  __syncthreads();
  if (has_nan) {
    result[0] = true;
  }
}

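// NanCheckOp::RunOnDevice clears a one-element boolean scratch tensor, runs
// NanCheckKernel over the input, copies the flag back to the host, and
// synchronizes before inspecting it. On failure it dumps all inputs (and the
// offending indices of input 0) to stderr and returns false; otherwise the op
// behaves as an identity, copying input to output when not run in-place.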
template <>
bool NanCheckOp<CUDAContext>::RunOnDevice() {
  auto& X = Input(0);
  auto* Y = Output(0);
  const size_t N = X.numel();
  const float* data_ptr = X.data<float>();

  ReinitializeTensor(&scratch_, {1}, at::dtype<bool>().device(CUDA));
  math::Set<bool, CUDAContext>(
      1, false, scratch_.mutable_data<bool>(), &context_);
  NanCheckKernel<<<
      CAFFE_GET_BLOCKS(N),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(
      N, X.data<float>(), scratch_.mutable_data<bool>());

  bool result = false;
  {
    std::lock_guard<std::mutex> lock(CUDAContext::mutex());
    CUDA_ENFORCE(cudaMemcpyAsync(
        &result,
        scratch_.raw_data(),
        1,
        cudaMemcpyDefault,
        context_.cuda_stream()));
  }
  // Note: we must synchronize here so we can inspect the result
  context_.FinishDeviceComputation();

  // Print out diagnostic info if we have a NaN or inf
  if (result) {
    std::cerr << "Tensor contained NaN or inf: " << this->debug_def().input(0)
              << std::endl;

    for (int j = 0; j < InputSize(); j++) {
      Tensor cpu_X(CPU);
      cpu_X.ResizeLike(Input(j));
      // Hack to cause the allocation to happen here, so it won't happen
      // when we do CopyFrom. We need the mutex then because host->gpu
      // copies seem to possibly lock with NCCL.
      cpu_X.mutable_data<float>();

      {
        std::lock_guard<std::mutex> lock(CUDAContext::mutex());
        cpu_X.CopyFrom(Input(j)); // sync copy
      }
      std::cerr << "Input tensor: " << j << ": [" << this->debug_def().input(j)
                << "]" << std::endl;
      tensorPrinter_.Print<float>(cpu_X);

      if (j == 0) {
        std::cerr << "NaN idxs:" << std::endl;
        auto* cpu_X_data = cpu_X.data<float>();
        for (size_t i = 0; i < cpu_X.numel(); ++i) {
          if (std::isnan(cpu_X_data[i]) || std::isinf(cpu_X_data[i])) {
            std::cerr << i << " ";
          }
        }
      }
      std::cerr << std::endl;
    }
    return false;
  }

  // This op should act as an identity op if we don't find any NaNs/infs.
  // Copy over the data if we are not doing this in-place.
  if (&X != Y) {
    Y->CopyFrom(X, true /*async*/);
  }
  return true;
}

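// A minimal usage sketch (illustrative only; the exact calls depend on how
// the net is constructed and on the caffe2.proto version in use):
//
//   OperatorDef def;
//   def.set_type("NanCheck");
//   def.add_input("X");
//   def.add_output("X"); // in-place: passes X through when it is clean
//   def.mutable_device_option()->set_device_type(PROTO_CUDA);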
REGISTER_CUDA_OPERATOR(NanCheck, NanCheckOp<CUDAContext>);

/**
 * @brief Update slices of Y in-place with a batch of weighted X's.
 *        Y[idx] = alpha[b] * X[b][i] + Y[idx]
 *        i = 0, ..., N-1
 *        b = 0, ..., B-1
 *        idx = Indices[i]
 */
template <typename T_INDEX>
__global__ void AxpySliceKernel(
    const float* weight0,
    const int64_t N,
    const int64_t B,
    const int64_t slice_size,
    const float** alpha,
    const float** X,
    const T_INDEX* Indices,
    float* Y,
    const int64_t M) {
  // This implementation requires that the first weight is 1.0
  CUDA_KERNEL_ASSERT(weight0[0] == 1.0);
  for (int i = blockIdx.x; i < N; i += gridDim.x) {
    T_INDEX idx = Indices[i];
    float* y_offset = Y + (idx * slice_size);
    for (int b = 0; b < B; b++) {
      float a = *alpha[b];
      const float* x_offset = X[b] + (i * slice_size);
      for (int j = threadIdx.x; j < slice_size; j += blockDim.x) {
        atomicAdd(&y_offset[j], a * x_offset[j]);
      }
    }
  }
}

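// Worked example (illustrative): with B = 1, slice_size = 2, Indices = {2, 0}
// and alpha[0] pointing at the value 0.5, row i = 0 of X[0] is scaled by 0.5
// and atomically added into slice 2 of Y, and row i = 1 into slice 0.
// atomicAdd is used because Indices may contain duplicates, in which case
// several rows accumulate into the same output slice.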
template <>
bool ScatterWeightedSumOp<float, CUDAContext>::RunOnDevice() {
  return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(this, Input(2));
}

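// Input(2) holds the indices; DispatchHelper picks the int32_t or int64_t
// specialization of DoRunWithType below based on its element type.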
template <>
template <typename Index>
bool ScatterWeightedSumOp<float, CUDAContext>::DoRunWithType() {
  CAFFE_ENFORCE_EQ(InputSize() % 2, 1);
  auto& X0 = Input(0);
  auto& weight0 = Input(1);
  auto& indices = Input(2);
  auto* output = Output(0);

  CAFFE_ENFORCE_EQ(&X0, output, "In place operation is required");
  CAFFE_ENFORCE_GT(X0.numel(), 0);
  CAFFE_ENFORCE_GT(X0.dim(), 0, "X0 has to be at least a vector");
  CAFFE_ENFORCE_EQ(weight0.numel(), 1);

  int64_t M = X0.numel();
  int64_t N = X0.dim(0);
  int64_t K = indices.numel();
  int64_t block_size = M / N;

  float* data = output->template mutable_data<float>();

  // In order to have all device pointers of x_i (and weight_i similarly)
  // consecutively in device memory, copy pointers to a host vector and then
  // copy back into a device array.
  const int64_t B = (InputSize() - 3) / 2;
  ReinitializeTensor(&x_data_host_, {B}, at::dtype<const float*>().device(CPU));
  ReinitializeTensor(&weights_host_, {B}, at::dtype<const float*>().device(CPU));
  ReinitializeTensor(&x_data_device_, {B}, at::dtype<const float*>().device(CUDA));
  ReinitializeTensor(&weights_device_, {B}, at::dtype<const float*>().device(CUDA));

  const float** x_data_host = x_data_host_.mutable_data<const float*>();
  const float** weights_host = weights_host_.mutable_data<const float*>();
  const float** x_data_device = x_data_device_.mutable_data<const float*>();
  const float** weights_device = weights_device_.mutable_data<const float*>();

  for (int inp = 3; inp < InputSize(); inp += 2) {
    int idx = (inp - 3) / 2;
    x_data_host[idx] = static_cast<const float*>(Input(inp).raw_data());
    weights_host[idx] = static_cast<const float*>(Input(inp + 1).raw_data());
  }
  context_.Copy<const float*, CPUContext, CUDAContext>(
      B, x_data_host, x_data_device);
  context_.Copy<const float*, CPUContext, CUDAContext>(
      B, weights_host, weights_device);

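  // One block handles one indexed slice (up to CAFFE_MAXIMUM_NUM_BLOCKS
  // blocks, striding over the K indices); threads within a block stride over
  // the block_size elements of each slice.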
  AxpySliceKernel<<<
      std::min<int64_t>(K, CAFFE_MAXIMUM_NUM_BLOCKS),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(
      weight0.template data<float>(),
      K,
      B,
      block_size,
      weights_device,
      x_data_device,
      indices.template data<Index>(),
      data,
      M);

  return true;
}

REGISTER_CUDA_OPERATOR(
    ScatterWeightedSum,
    ScatterWeightedSumOp<float, CUDAContext>);

namespace {

template <typename Index, typename T>
__global__ void scatter_assign_kernel(
    T* data,
    const Index* idxs,
    const T* slicesData,
    int64_t N,
    int64_t K,
    int64_t block_size) {
  for (int64_t i = blockIdx.x; i < K; i += gridDim.x) {
    Index idx = idxs[i];
    CUDA_KERNEL_ASSERT(0 <= idx && idx < N);
    const T* src = slicesData + block_size * i;
    T* dest = data + block_size * idx;
    for (int64_t j = threadIdx.x; j < block_size; j += blockDim.x) {
      dest[j] = src[j];
    }
  }
}

} // namespace

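// DoScatterAssign launches one block per index (capped at
// CAFFE_MAXIMUM_NUM_BLOCKS); each block copies one slice of slicesData into
// the row of data selected by the corresponding index. If idxs contains
// duplicates, the result for that row depends on which block writes last.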
template <>
template <typename Index, typename T>
void ScatterAssignOp<CUDAContext>::DoScatterAssign(
    T* data,
    const Index* idxs,
    const T* slicesData,
    int64_t N,
    int64_t K,
    int64_t block_size) {
  scatter_assign_kernel<<<
      std::min(K, static_cast<int64_t>(CAFFE_MAXIMUM_NUM_BLOCKS)),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(data, idxs, slicesData, N, K, block_size);
}

REGISTER_CUDA_OPERATOR(ScatterAssign, ScatterAssignOp<CUDAContext>);

REGISTER_CUDA_OPERATOR(Size, SizeOp<CUDAContext>);

template <typename T>
__global__ void RangeKernel(const int n, T* Y, T offset, T step) {
  CUDA_1D_KERNEL_LOOP(index, n) {
    Y[index] = index * step + offset;
  }
}

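// Worked example (illustrative): for start = 3, step = 2 and an output of 4
// elements, RangeKernel fills Y with {3, 5, 7, 9}.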
template <>
template <typename T>
bool RangeOp<CUDAContext>::DoRunOnDevice(
    const T& start,
    const T& step,
    Tensor* output) {
  int N = output->numel();
  RangeKernel<<<
      CAFFE_GET_BLOCKS(N),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(
      N, output->template mutable_data<T>(), start, step);
  return true;
}

REGISTER_CUDA_OPERATOR(Range, RangeOp<CUDAContext>);

} // namespace caffe2