Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/35800

This PR includes the following changes:
* Introduce a new `Expr` type `Buf`: it plays a role similar to `Var`, but also carries dimensions.
* Use the new `Buf` class in `Store` and `Load` instead of `Var` for specifying where to store to or load from. `Buf` contains the dimension info of the buffer we're loading from or storing to, so we can keep N-d indexes without flattening them into a 1-d index ([x, y] vs [x + y*W]).
* Index flattening is now a separate pass executed in `LoopNest::prepareForCodegen` - backends still expect flattened indexes, and this PR preserves that.
* `Tensor` now contains a `Buf` instead of a `Var`, so a Tensor now carries the dimension info (previously it was a property of a `Function`, not a `Tensor`). This brings us closer to a Tensor being a combination of Buffer + Function, where the Buffer specifies the iteration domain and the Function defines the computation.

TODOs:
* Consider merging `Buffer` with `Buf` or `BufHandle`. It seems that we don't need all of them.
* Harden the logic of how we create buffers in the fuser pass. Currently it seems that sometimes we don't set dimensions.
* Use `Buf` in `Allocate` and `Free`.
* Make it clearer that `Function` doesn't "own" the dimension info and that dimensions are a property of a Tensor, not a Function.

Differential Revision: D20789005

Test Plan: Imported from OSS

Reviewed By: zheng-xq

Pulled By: ZolotukhinM

fbshipit-source-id: e04188d1d297f195f1c46669c614557d6bb6cde4
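For illustration only (not part of this diff), here is a minimal sketch of the workflow the summary describes, using the same `Buffer`/`Compute`/`LoopNest` API that the tests below exercise; the tensor name, shape, and dtype are made up:

```cpp
// Hypothetical 2-d elementwise computation. The Load/Store nodes built by
// Compute keep both indexes [y, x]; LoopNest::prepareForCodegen() is the pass
// that flattens them into the single index (x + y*W) backends still expect.
KernelScope kernel_scope;
const int H = 4;
const int W = 8;
Buffer a_buf("a", kFloat, {H, W});
Tensor* t = Compute(
    "t", {{H, "y"}, {W, "x"}}, [&](const VarHandle& y, const VarHandle& x) {
      return a_buf(y, x) + 1.0f;
    });
LoopNest l({t});
l.prepareForCodegen(); // indexes are flattened here, right before codegen
Stmt* s = l.root_stmt();
```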
#ifdef USE_CUDA

#include <sstream>
#include <stdexcept>
#include "test/cpp/tensorexpr/test_base.h"

#include <cmath>

#include "test/cpp/tensorexpr/padded_buffer.h"
#include "torch/csrc/jit/tensorexpr/buffer.h"
#include "torch/csrc/jit/tensorexpr/cuda_codegen.h"
#include "torch/csrc/jit/tensorexpr/loopnest.h"
#include "torch/csrc/jit/tensorexpr/tensor.h"

#include <c10/cuda/CUDACachingAllocator.h>
#include <c10/util/Half.h>

namespace torch {
namespace jit {
using namespace torch::jit::tensorexpr;

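// Element-wise addition of two 3-d buffers, templated over the element type.
// The two inner loop levels are bound to the CUDA block and thread indices.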
template <typename ctype>
void testCudaTestVectorAdd01_impl() {
  KernelScope kernel_scope;
  const int num_iter = 3;
  const int block_count = 16;
  const int block_size = 128;
  Dtype dtype = ToDtype<ctype>();
  Buffer a_buf("a", dtype, {num_iter, block_count, block_size});
  Buffer b_buf("b", dtype, {num_iter, block_count, block_size});
  Tensor* c = Compute(
      "c",
      {
          {num_iter, "n"},
          {block_count, "b_id"},
          {block_size, "t_id"},
      },
      [&](const VarHandle& n, const VarHandle& b_id, const VarHandle& t_id) {
        return a_buf(n, b_id, t_id) + b_buf(n, b_id, t_id);
      });
  LoopNest l({c});
  std::vector<For*> loops = l.getLoopStmtsFor(c);
  l.setGPUBlockIndex(loops[1], 0);
  l.setGPUThreadIndex(loops[2], 0);
  l.prepareForCodegen();
  Stmt* stmt = l.root_stmt();
  CudaCodeGen cuda_cg(stmt, c, a_buf, b_buf);
  const int N = block_count * block_size * num_iter;
  PaddedBuffer<ctype> a_v(N);
  PaddedBuffer<ctype> b_v(N);
  PaddedBuffer<ctype> c_v(N);
  PaddedBuffer<ctype> c_ref(N);

  for (int i = 0; i < N; i++) {
    a_v(i) = ctype(i);
    b_v(i) = ctype(i * 3 + 7);
    c_ref(i) = a_v(i) + b_v(i);
  }

  // TODO: move gpu support into PaddedBuffer
  ctype* a_dev = nullptr;
  cudaMalloc(&a_dev, N * sizeof(ctype));
  ctype* b_dev = nullptr;
  cudaMalloc(&b_dev, N * sizeof(ctype));
  ctype* c_dev = nullptr;
  cudaMalloc(&c_dev, N * sizeof(ctype));
  cudaMemcpy(a_dev, a_v.data(), N * sizeof(ctype), cudaMemcpyHostToDevice);
  cudaMemcpy(b_dev, b_v.data(), N * sizeof(ctype), cudaMemcpyHostToDevice);
  cudaMemcpy(c_dev, c_v.data(), N * sizeof(ctype), cudaMemcpyHostToDevice);
  cudaDeviceSynchronize();

  cuda_cg(c_dev, a_dev, b_dev);

  cudaDeviceSynchronize();
  cudaMemcpy(c_v.data(), c_dev, N * sizeof(ctype), cudaMemcpyDeviceToHost);
  cudaDeviceSynchronize();

  ExpectAllNear(c_v, c_ref, 1e-5);

  cudaFree(a_dev);
  cudaFree(b_dev);
  cudaFree(c_dev);
}

void testCudaTestVectorAdd01() {
  // floating types.
  testCudaTestVectorAdd01_impl<float>();
  testCudaTestVectorAdd01_impl<at::Half>();
  testCudaTestVectorAdd01_impl<double>();

  // integer types.
  testCudaTestVectorAdd01_impl<int8_t>();
  testCudaTestVectorAdd01_impl<uint8_t>();
  testCudaTestVectorAdd01_impl<int16_t>();
  testCudaTestVectorAdd01_impl<int32_t>();
  testCudaTestVectorAdd01_impl<int64_t>();
}

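// 1-d vector addition where the single loop is split by block_size via
// splitWithMask, so N does not have to be a multiple of the block size.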
static void testCudaTestVectorAdd02_impl(int N, int block_size) {
  KernelScope kernel_scope;
  Buffer a_buf("a", kFloat, {N});
  Buffer b_buf("b", kFloat, {N});
  Tensor* c = Compute(
      "c",
      {
          {N, "N"},
      },
      [&](const VarHandle& n) { return a_buf(n) + b_buf(n); });
  LoopNest l({c});
  For* n_outer;
  For* n_inner;
  std::vector<For*> loops = l.getLoopStmtsFor(c);
  l.splitWithMask(loops[0], block_size, &n_outer, &n_inner);
  l.setGPUBlockIndex(n_outer, 0);
  l.setGPUThreadIndex(n_inner, 0);
  l.prepareForCodegen();
  Stmt* stmt = l.root_stmt();
  CudaCodeGen cuda_cg(stmt, c, a_buf, b_buf);
  PaddedBuffer<float> a_v(N);
  PaddedBuffer<float> b_v(N);
  PaddedBuffer<float> c_v(N);
  PaddedBuffer<float> c_ref(N);

  for (int i = 0; i < N; i++) {
    a_v(i) = i;
    b_v(i) = i * 3 + 7;
    c_ref(i) = a_v(i) + b_v(i);
  }

  // TODO: move gpu support into PaddedBuffer
  float* a_dev = nullptr;
  cudaMalloc(&a_dev, N * sizeof(float));
  float* b_dev = nullptr;
  cudaMalloc(&b_dev, N * sizeof(float));
  float* c_dev = nullptr;
  cudaMalloc(&c_dev, N * sizeof(float));
  cudaMemcpy(a_dev, a_v.data(), N * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(b_dev, b_v.data(), N * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(c_dev, c_v.data(), N * sizeof(float), cudaMemcpyHostToDevice);
  cudaDeviceSynchronize();

  cuda_cg(c_dev, a_dev, b_dev);

  cudaDeviceSynchronize();
  cudaMemcpy(c_v.data(), c_dev, N * sizeof(float), cudaMemcpyDeviceToHost);
  cudaDeviceSynchronize();

  ExpectAllNear(c_v, c_ref, 1e-5);

  cudaFree(a_dev);
  cudaFree(b_dev);
  cudaFree(c_dev);
}

void testCudaTestVectorAdd02() {
  testCudaTestVectorAdd02_impl(1024, 128);
  testCudaTestVectorAdd02_impl(1030, 128);
}

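// 2-d addition where the buffer extents are kernel arguments (m, n) rather
// than compile-time constants, exercised with several concrete sizes.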
void testCudaDynamicShape2D() {
  KernelScope kernel_scope;
  auto testWithSize = [](int32_t M, int32_t N) {
    VarHandle m("m", kInt);
    VarHandle n("n", kInt);
    Buffer a(BufHandle("a", {m, n}), kFloat);
    Buffer b(BufHandle("b", {m, n}), kFloat);
    Tensor* c = Compute(
        "c", {{m, "m"}, {n, "n"}}, [&](const VarHandle& i, const VarHandle& j) {
          return a(i, j) + b(i, j);
        });
    LoopNest l({c});
    l.prepareForCodegen();
    Stmt* s = l.root_stmt();
    CudaCodeGen cg(s, {a, b, c, m, n});

    std::vector<float> aData(M * N, 1.0f);
    std::vector<float> bData(M * N, 2.0f);
    std::vector<float> cData(M * N, 0.0f);
    float* aDev = nullptr;
    float* bDev = nullptr;
    float* cDev = nullptr;
    cudaMalloc(&aDev, aData.size() * sizeof(aData[0]));
    cudaMalloc(&bDev, bData.size() * sizeof(bData[0]));
    cudaMalloc(&cDev, cData.size() * sizeof(cData[0]));
    cudaMemcpy(
        aDev,
        aData.data(),
        aData.size() * sizeof(aData[0]),
        cudaMemcpyHostToDevice);
    cudaMemcpy(
        bDev,
        bData.data(),
        bData.size() * sizeof(bData[0]),
        cudaMemcpyHostToDevice);
    cudaMemcpy(
        cDev,
        cData.data(),
        cData.size() * sizeof(cData[0]),
        cudaMemcpyHostToDevice);
    cudaDeviceSynchronize();

    cg.call({aDev, bDev, cDev, M, N});
    cudaDeviceSynchronize();

    cudaMemcpy(
        cData.data(),
        cDev,
        cData.size() * sizeof(cData[0]),
        cudaMemcpyDeviceToHost);
    cudaDeviceSynchronize();

    ExpectAllNear(cData, std::vector<float>(M * N, 3.0f), 1e-7);

    cudaFree(aDev);
    cudaFree(bDev);
    cudaFree(cDev);
  };
  testWithSize(32, 32);
  testWithSize(1, 16);
  testWithSize(27, 13);
}

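// Fills a buffer with kRand intrinsic values and checks that the first three
// sample moments are close to those of a uniform [0, 1) distribution.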
void testCudaTestRand01() {
  KernelScope kernel_scope;
  const int num_iter = 3;
  const int block_count = 16;
  const int block_size = 128;
  Tensor* c = Compute(
      "c",
      {
          {num_iter, "n"},
          {block_count, "b_id"},
          {block_size, "t_id"},
      },
      [&](const VarHandle& n, const VarHandle& b_id, const VarHandle& t_id) {
        return Intrinsics::make(IntrinsicsOp::kRand, kFloat);
      });
  LoopNest l({c});
  std::vector<For*> loops = l.getLoopStmtsFor(c);
  l.setGPUBlockIndex(loops[1], 0);
  l.setGPUThreadIndex(loops[2], 0);
  l.prepareForCodegen();
  Stmt* stmt = l.root_stmt();
  CudaCodeGen cuda_cg(stmt, c);
  const int N = block_count * block_size * num_iter;
  PaddedBuffer<float> c_v(N);

  // TODO: move gpu support into PaddedBuffer
  float* c_dev = nullptr;
  cudaMalloc(&c_dev, N * sizeof(float));
  cudaDeviceSynchronize();

  cuda_cg(c_dev);

  cudaDeviceSynchronize();
  cudaMemcpy(c_v.data(), c_dev, N * sizeof(float), cudaMemcpyDeviceToHost);
  cudaDeviceSynchronize();

  float sum1 = 0;
  float sum2 = 0;
  float sum3 = 0;
  for (int i = 0; i < N; i++) {
    float v = c_v.data()[i];
    sum1 += v;
    sum2 += v * v;
    sum3 += v * v * v;
    ASSERT_TRUE(v >= 0 && v < 1, "invalid value: ", i, ", ", v);
  }
  sum1 /= N;
  sum2 /= N;
  sum3 /= N;
  float sum1_mean = 1.f / 2;
  float sum2_mean = 1.f / 3;
  float sum3_mean = 1.f / 4;

  ASSERT_NEAR(sum1, sum1_mean, 2e-2);
  ASSERT_NEAR(sum2, sum2_mean, 2e-2);
  ASSERT_NEAR(sum3, sum3_mean, 2e-2);
  cudaFree(c_dev);
}

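// Combines a dynamic extent (n passed as a kernel argument) with a
// splitWithMask-based block/thread binding.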
void testCudaDynamicShapeSplit() {
  KernelScope ks;
  constexpr int N = 4096;
  VarHandle n("n", kInt);
  Buffer a(BufHandle("a", {n}), kFloat);
  Tensor* b =
      Compute("b", {{n, "n"}}, [&](const VarHandle& i) { return a(i) * 2.0f; });
  LoopNest l({b});
  For* outer;
  For* inner;
  std::vector<For*> loops = l.getLoopStmtsFor(b);
  l.splitWithMask(loops[0], 1024, &outer, &inner);
  l.setGPUBlockIndex(outer, 0);
  l.setGPUThreadIndex(inner, 0);
  Stmt* s = l.root_stmt();
  CudaCodeGen cg(s, {a, b, n});

  std::vector<float> aData(N, 1.0f);
  std::vector<float> bData(N, 1.0f);
  float* aDev = nullptr;
  float* bDev = nullptr;
  cudaMalloc(&aDev, aData.size() * sizeof(aData[0]));
  cudaMalloc(&bDev, bData.size() * sizeof(bData[0]));
  cudaMemcpy(
      aDev,
      aData.data(),
      aData.size() * sizeof(aData[0]),
      cudaMemcpyHostToDevice);
  cudaMemcpy(
      bDev,
      bData.data(),
      bData.size() * sizeof(bData[0]),
      cudaMemcpyHostToDevice);
  cudaDeviceSynchronize();

  cg.call({aDev, bDev, N});
  cudaDeviceSynchronize();

  cudaMemcpy(
      bData.data(),
      bDev,
      bData.size() * sizeof(bData[0]),
      cudaMemcpyDeviceToHost);
  cudaDeviceSynchronize();

  ExpectAllNear(bData, std::vector<float>(N, 2.0f), 1e-7);

  cudaFree(aDev);
  cudaFree(bDev);
}

} // namespace jit
} // namespace torch

#endif