pytorch/torch/csrc/jit/fusion_compiler.cpp
Zachary DeVito e43ff32192
Add a JIT interpreter (#3634)
* Add a JIT interpreter

The separate interpreter is used to run graphs with lower overhead than
converting them to autograd graphs. Some notes:

* does not support Handles/PythonOp/CppOp, these will be in a future commit
* jit_closure.cpp still exists and we fall back to it for now when the
  interpreter cannot handle something because of PythonOp/CppOp
* In order to support retain_graph=True, the interpreter can be cloned,
  creating a copy that can be run with different arguments. This is
  assumed to be the non-standard case so cloning is not particularly optimized.
  No tensor _data_ is copied, but the at::Tensor list in the interpreter is.
  If we hit problems, there is a lot we could do (such as register allocation)
  to minimize the stuff that needs to be copied.
* Uses a pImpl pattern to keep implementation details out of its header file.
* Modifies the way getTensorOp works so that it reads/writes to already-existing
  vectors; this avoids reallocating these buffers each time.
* Timings are here: https://gist.github.com/zdevito/5a20ac29fb1b9e449e693b67dc478127
  This reduces overhead to about the same as running it in Python;
  running the same thing directly in ATen is only about 10us faster.

* Code Mod

Interpreter -> InterpreterState
Function -> Code

Add other requested comments.

* RegList -> ListHandle<T>

Change the RegList functions to be safer by identifying the type of
each argument list, and checking that list insert does not try
to add to two different lists at once.

* Use exactly equal for interp tests
2017-11-13 22:09:53 -08:00


#include "torch/csrc/jit/fusion_compiler.h"
#include "torch/csrc/jit/ir.h"
#include "torch/csrc/jit/code_template.h"
#include "torch/csrc/jit/resource_guard.h"
#include "torch/csrc/utils/disallow_copy.h"
#include "ATen/ATen.h"
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <string>
#include <algorithm>
#include <unordered_map>
#include <vector>
#include <sstream>
#include <iostream>
namespace torch { namespace jit {
std::unordered_map<NodeKind, std::string> simple_map_ops = {
  // unary
  {kabs, "absf(${0})"},
  {ksigmoid, "1.f / (1.f + expf(-${0}))"},
  {klog, "logf(${0})"},
  {klog1p, "log1pf(${0})"},
  {klgamma, "lgammaf(${0})"},
  {kexp, "expf(${0})"},
  {kcos, "cosf(${0})"},
  {kacos, "acosf(${0})"},
  {kcosh, "coshf(${0})"},
  {ksin, "sinf(${0})"},
  {kasin, "asinf(${0})"},
  {ksinh, "sinhf(${0})"},
  {ktan, "tanf(${0})"},
  {katan, "atanf(${0})"},
  {ktanh, "tanhf(${0})"},
  {ksqrt, "sqrtf(${0})"},
  {krsqrt, "rsqrtf(${0})"},
  {kceil, "ceilf(${0})"},
  {kfloor, "floorf(${0})"},
  {kround, "roundf(${0})"},
  {ktrunc, "truncf(${0})"},
  {kfrac, "fracf(${0})"},
  {kreciprocal, "reciprocalf(${0})"},
  {kneg, "-${0}"},
  // simple binary
  {katan2, "atan2(${0}, ${1})"},
  {kmin, "fminf(${0}, ${1})"},
  {kmax, "fmaxf(${0}, ${1})"},
  // binary with other
  // TODO: some of these ops will not get generated because
  // we only work on float inputs/outputs, but they are here to record
  // that they are valid mappable ops once we handle more types
  {k__and__, "${0} && ${1}"},
  {k__lshift__, "${0} << ${1}"},
  {k__or__, "${0} || ${1}"},
  {k__rshift__, "${0} >> ${1}"},
  {k__xor__, "${0} ^ ${1}"},
  {kdiv, "${0} / ${1}"},
  {keq, "${0} == ${1}"},
  {kfmod, "fmodf(${0}, ${1})"},
  {kge, "${0} >= ${1}"},
  {kgt, "${0} > ${1}"},
  {kle, "${0} <= ${1}"},
  {klt, "${0} < ${1}"},
  {kmul, "${0} * ${1}"},
  {kne, "${0} != ${1}"},
  {kremainder, "remainderf(${0}, ${1})"},
  {kpow, "powf(${0}, ${1})"},
  // alpha
  {kadd, "${0} + ${alpha}*${1}"},
  {ksub, "${0} - ${alpha}*${1}"},
  // special
  {klerp, "${0} + ${weight}*(${1} - ${0})"},
  {kclamp, "min(max(${0},${min}),${max})"},
};
std::vector<bool> TensorDesc::findContiguous(
    const at::IntList& sizes,
    const at::IntList& strides) {
  JIT_ASSERT(sizes.size() == strides.size());
  std::vector<bool> cont(sizes.size());
  for(size_t i = 0; i < sizes.size(); ++i) {
    int64_t expected_stride = (i + 1 < sizes.size()) ? sizes[i+1]*strides[i+1] : 1;
    cont[i] = strides[i] == expected_stride;
  }
  return cont;
}
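// Worked example (illustrative values, not taken from any particular graph):
// for a contiguous tensor with sizes [4, 3, 2] and strides [6, 2, 1], every
// expected_stride matches (6 == 3*2, 2 == 2*1, 1 == 1), so this returns
// [true, true, true]. For a transposed view with sizes [3, 4] and strides
// [1, 3], neither stride matches, so it returns [false, false].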
namespace {

static int ceilDiv(int a, int b) {
  return (a + b - 1) / b;
}

std::ostream& operator<<(std::ostream & out, const TensorDesc & d) {
  out << d.scalar_type << "[";
  for(auto b : d.contiguity)
    out << b << ";";
  out << "]";
  return out;
}
// We're using three CUDA APIs, so define a few helpers for error handling.
// Note: no trailing semicolons in the macros, so they behave like ordinary
// statements at the call site.
static void nvrtcCheck(nvrtcResult result, const char * file, int line) {
  if(result != NVRTC_SUCCESS) {
    std::stringstream ss;
    ss << file << ":" << line << ": " << nvrtcGetErrorString(result);
    throw std::runtime_error(ss.str());
  }
}
#define JIT_NVRTC_CHECK(result) nvrtcCheck(result, __FILE__, __LINE__)

static void cuCheck(CUresult result, const char * file, int line) {
  if(result != CUDA_SUCCESS) {
    const char * str;
    cuGetErrorString(result, &str);
    std::stringstream ss;
    ss << file << ":" << line << ": " << str;
    throw std::runtime_error(ss.str());
  }
}
#define JIT_CU_CHECK(result) cuCheck(result, __FILE__, __LINE__)

static void cudaCheck(cudaError_t result, const char * file, int line) {
  if(result != cudaSuccess) {
    std::stringstream ss;
    ss << file << ":" << line << ": " << cudaGetErrorString(result);
    throw std::runtime_error(ss.str());
  }
}
#define JIT_CUDA_CHECK(result) cudaCheck(result, __FILE__, __LINE__)
////////////////////////////////////////////////////////////////////////////////
// Code generation
namespace codegen {
auto compilation_unit_template = CodeTemplate(R"(
typedef ${IndexType} IndexType;
template<typename T, size_t N>
struct TensorInfo {
  T * data;
  IndexType sizes[N];
  IndexType strides[N];
};

extern "C" __global__
void ${kernelName}(IndexType totalElements, ${formals}) {
  for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x;
       linearIndex < totalElements;
       linearIndex += gridDim.x * blockDim.x) {
    // Convert `linearIndex` into an offset of tensor:
    ${tensorOffsets}
    // calculate the results
    ${kernelBody}
  }
}
)");

// curDimIndex = linearId % sizes[i]; // % sizes[i] is not needed for d == 0, because we already guard for numel outside the index calculation
// offset += curDimIndex*strides[i]; // *strides[i] is optional if last_is_cont because strides.back() == 1
// linearId /= sizes[i];
auto dim_calc = CodeTemplate(R"(
//printf("tensor ${tensor} sizes[${d}] = %d, strides[${d}] = %d\n", ${tensor}.sizes[${d}],${tensor}.strides[${d}]);
size_t ${tensor}_dimIndex${d} = ${tensor}_linearIndex ${mod_sizes};
${tensor}_offset += ${tensor}_dimIndex${d} ${times_stride};
)");
void emitIndexingFor(std::ostream & out, const std::string & tensor, int ndim, bool last_is_cont) {
  TemplateEnv env;
  env.s("tensor", tensor);
  out << format("IndexType ${tensor}_offset = 0;\n", env);
  out << format("IndexType ${tensor}_linearIndex = linearIndex;\n", env);
  for(int d = ndim - 1; d >= 0; --d) {
    env.d("d", d);
    env.s("mod_sizes", d > 0 ? format("% ${tensor}.sizes[${d}]", env) : "");
    env.s("times_stride", (d < ndim - 1 || !last_is_cont) ?
        format("* ${tensor}.strides[${d}]", env) : "");
    out << dim_calc.format(env);
    if(d > 0) {
      out << format("${tensor}_linearIndex /= ${tensor}.sizes[${d}];\n", env);
    }
  }
}
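// For illustration, emitIndexingFor(out, "t0", 2, /*last_is_cont=*/true)
// emits (modulo the debug printf comment from dim_calc) roughly:
//   IndexType t0_offset = 0;
//   IndexType t0_linearIndex = linearIndex;
//   size_t t0_dimIndex1 = t0_linearIndex % t0.sizes[1];
//   t0_offset += t0_dimIndex1 ;
//   t0_linearIndex /= t0.sizes[1];
//   size_t t0_dimIndex0 = t0_linearIndex ;
//   t0_offset += t0_dimIndex0 * t0.strides[0];
// The innermost stride multiply is elided when the last dimension is
// contiguous, and the outermost modulus is elided because linearIndex is
// already bounded by totalElements.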
std::string nodeName(Node * n) {
  return "n" + std::to_string(n->unique());
}

std::string scalarValue(const at::Tensor & t) {
  auto s = at::Scalar(t);
  return (s.isIntegral()) ?
      std::to_string(s.toLong()) :
      std::to_string(s.toDouble());
}

const char * scalarTypeName(at::ScalarType type) {
  switch(type) {
    #define DEFINE_CASE(ctype,name,_) \
      case at::ScalarType::name: return #ctype;
    AT_FORALL_SCALAR_TYPES(DEFINE_CASE)
    #undef DEFINE_CASE
    default:
      throw std::runtime_error("unknown scalar type");
  }
}
std::string encodeRHS(Node * n) {
  TemplateEnv env;
  size_t i = 0;
  for(auto in : n->inputs()) {
    env.s(std::to_string(i++), nodeName(in));
  }
  // ops like div have a / b or a / 2 with the constant stored in the
  // attribute 'other', so we add it as an input if it is present.
  // 'pow' is the same but uses 'exponent' as the attribute, so we handle
  // that here as well
  if(n->hasAttribute(kother)) {
    env.s(std::to_string(i), scalarValue(n->t(kother)));
  } else if(n->hasAttribute(kexponent)) {
    env.s(std::to_string(i), scalarValue(n->t(kexponent)));
  }
  // we also add any other scalar tensors to the env for special ops
  for(auto a : n->attributeNames()) {
    if(n->kindOf(a) == AttributeKind::t) {
      auto v = n->t(a);
      if(v.dim() == 0) {
        env.s(symbolToString(a), scalarValue(v));
      }
    }
  }
  const auto & str = simple_map_ops.at(n->kind());
  return format(str, env);
}
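// For example (hypothetical node): an add node with inputs n1, n2 and a
// scalar attribute alpha = 2 yields env = {"0": "n1", "1": "n2",
// "alpha": "2"}, so the template "${0} + ${alpha}*${1}" formats to
// "n1 + 2*n2".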
std::vector<ConcatDesc> emitCompilationUnit(std::ostream & out,
                                            const std::string & name,
                                            AnnotatedGraph & agraph) {
  Graph& subgraph = *agraph.graph;
  TemplateEnv env;
  env.s("kernelName", name);
  // TODO: handle cases where we need to generate > 2^32 element tensors
  env.s("IndexType", "unsigned int"); // avoiding slow header includes to get uint32_t

  std::stringstream body;
  std::stringstream tensorOffsets;
  std::vector<std::string> formals;
  auto emitFormal = [&](Node * n, const TensorDesc & desc) {
    std::string tensor = "t" + std::to_string(formals.size()); // can't be unique() because Param may be an output
    size_t nDim = desc.nDim();
    emitIndexingFor(tensorOffsets, tensor, nDim, desc.lastIsContiguous());
    env.s("tensor", tensor);
    env.d("nDim", nDim);
    env.s("scalar_type", scalarTypeName(desc.scalar_type));
    formals.push_back(format("TensorInfo<${scalar_type},${nDim}> ${tensor}", env));
  };
  {
    size_t i = 0;
    for(auto p : subgraph.inputs())
      emitFormal(p, agraph.input_desc[i++]);
  }
  std::vector<ConcatDesc> concat_desc;
  std::vector<Node*> flat_output_nodes;
  {
    size_t i = 0;
    for(auto o : subgraph.outputs()) {
      auto & desc = agraph.output_desc[i++];
      if(o->kind() != kcat) {
        emitFormal(o, desc);
        concat_desc.emplace_back();
        flat_output_nodes.push_back(o);
      } else {
        size_t nInputs = o->inputs().size();
        concat_desc.emplace_back(desc, nInputs, o->i(kdim));
        for(auto c : o->inputs()) {
          emitFormal(c, *concat_desc.back().subtensorDesc);
          flat_output_nodes.push_back(c);
        }
      }
    }
  }
  size_t formal_count = 0;
  for(auto p : subgraph.inputs()) {
    env.s("node", nodeName(p));
    env.d("formal", formal_count++);
    env.s("access", format("t${formal}.data[t${formal}_offset]", env));
    // TODO: actual type propagation rather than relying on auto..
    body << format("auto ${node} = ${access};\n", env);
  }
  for(auto n : subgraph.nodes()) {
    if(n->kind() == kcat)
      continue; // concat is implemented by narrowing the output tensors before the kernel runs
    env.s("node", nodeName(n));
    env.s("rhs", encodeRHS(n));
    body << format("auto ${node} = ${rhs};\n", env);
  }
  for(auto o : flat_output_nodes) {
    env.d("formal", formal_count++);
    env.s("access", format("t${formal}.data[t${formal}_offset]", env));
    env.s("node", nodeName(o));
    body << format("${access} = ${node};\n", env);
  }
  env.s("tensorOffsets", tensorOffsets.str());
  env.s("kernelBody", body.str());
  env.v("formals", formals);
  out << compilation_unit_template.format(env);
  return concat_desc;
}
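// To make the pieces concrete, a sketch of the emitted source for a
// hypothetical one-input/one-output sigmoid graph (details such as the
// compressed dimension count depend on the TensorDescs):
//
//   typedef unsigned int IndexType;
//   template<typename T, size_t N> struct TensorInfo { ... };
//   extern "C" __global__
//   void kernel_0(IndexType totalElements, TensorInfo<float,1> t0,
//                 TensorInfo<float,1> t1) {
//     for (IndexType linearIndex = ...; ...; ...) {
//       ... // ${tensorOffsets}: compute t0_offset, t1_offset
//       auto n0 = t0.data[t0_offset];
//       auto n1 = 1.f / (1.f + expf(-n0));
//       t1.data[t1_offset] = n1;
//     }
//   }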
////////////////////////////////////////////////////////////////////////////////
} // codegen namespace
} // anonymous namespace
// Host-side view of TensorInfo (the one visible to the kernel is defined above).
// Note the zero-length array member: the sizes and strides for all dimensions
// are allocated dynamically, directly after the struct.
struct TensorInfo {
  void * data;
  uint32_t sizes_strides[0];

  uint32_t* sizes(size_t nDim) { return &sizes_strides[0]; }
  uint32_t* strides(size_t nDim) { return &sizes_strides[nDim]; }
};
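// The layout for an N-dim tensor inside the kernel-argument buffer is thus:
//   [ void* data | uint32_t sizes[0..N) | uint32_t strides[0..N) ]
// launch_with_tensors below carves these records out of one flat byte buffer,
// spacing them maxPossibleTensorInfoSize bytes apart.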
CompiledFusionFunction::CompiledFusionFunction(const std::string & name, AnnotatedGraph & agraph)
  : name(name)
  , input_desc(agraph.input_desc)
  , output_desc(agraph.output_desc) {
  JIT_CUDA_CHECK(cudaGetDevice(&device));
  JIT_CUDA_CHECK(cudaGetDeviceProperties(&prop, device));

  std::stringstream cu;
  concat_desc = codegen::emitCompilationUnit(cu, name, agraph);
  compilation_unit = cu.str();

  nvrtcProgram program;
  JIT_NVRTC_CHECK(nvrtcCreateProgram(&program, compilation_unit.c_str(), NULL, 0, nullptr, nullptr));
  std::string compute = "--gpu-architecture=compute_" + std::to_string(prop.major) + std::to_string(prop.minor);
  std::vector<const char *> args = {"--std=c++11", compute.c_str()};
  nvrtcResult result = nvrtcCompileProgram(program, args.size(), args.data());
  if (result == NVRTC_ERROR_COMPILATION) {
    size_t logsize;
    nvrtcGetProgramLogSize(program, &logsize);
    std::vector<char> log(logsize);
    nvrtcGetProgramLog(program, log.data());
    cu << log.data();
    throw std::runtime_error(cu.str());
  }
  ResourceGuard holdProgram([&] {
    JIT_NVRTC_CHECK(nvrtcDestroyProgram(&program));
  });
  JIT_NVRTC_CHECK(result);

  size_t ptx_size;
  JIT_NVRTC_CHECK(nvrtcGetPTXSize(program, &ptx_size));
  ptx.resize(ptx_size);
  JIT_NVRTC_CHECK(nvrtcGetPTX(program, ptx.data()));

  JIT_CU_CHECK(cuModuleLoadData(&module, ptx.data()));
  JIT_CU_CHECK(cuModuleGetFunction(&function, module, name.c_str()));

  JIT_CU_CHECK(cuOccupancyMaxActiveBlocksPerMultiprocessor(
      &maxBlocks, function, 128, 0));
  maxBlocks *= prop.multiProcessorCount;
}

CompiledFusionFunction::~CompiledFusionFunction() {
  JIT_CU_CHECK(cuModuleUnload(module));
}
namespace {
// Tries to compress sizes and strides according to cont. Emits the result to
// c_sizes and c_strides, and throws an error on failure (if it can't compress).
void compressContiguous(
    at::IntList sizes,
    at::IntList strides,
    const std::vector<bool> & cont,
    uint32_t * c_sizes,
    uint32_t * c_strides) {
  size_t compressed_dims = 0;
  size_t cur = 0;
  size_t ndim = sizes.size();
  while(cur < ndim) {
    size_t total_size = sizes[cur];
    cur++;
    while(cont[cur-1] && cur < ndim) {
      JIT_ASSERT(strides[cur-1] == sizes[cur]*strides[cur]);
      total_size *= sizes[cur];
      cur++;
    }
    // cur starts pointing at the beginning of the run to compress;
    // it ends one _after_ the terminating false or the end of the list.
    // total_size is the size of all dimensions [begin, end)
    // examples:
    // f = not cont.
    // t = cont.
    // x = don't care, including past end of list
    // s = start of cur
    // e = end of cur
    //
    // f x x x
    // s e
    //
    // t f x x
    // s   e
    //
    // t t f x
    // s     e
    c_sizes[compressed_dims] = total_size;
    c_strides[compressed_dims] = strides[cur-1];
    compressed_dims++;
  }
  JIT_ASSERT(!cont.back() || strides.back() == 1);
}
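// Worked example (illustrative values): sizes [4, 3, 2], strides [6, 2, 1],
// cont [true, true, true] compresses to one dimension: c_sizes = [24],
// c_strides = [1]. With strides [12, 2, 1] (cont [false, true, true]), the
// first dimension cannot be merged: c_sizes = [4, 6], c_strides = [12, 1].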
} // anonymous namespace
void CompiledFusionFunction::launch_with_tensors(at::ArrayRef<at::Tensor> inputs, at::ArrayRef<at::Tensor> outputs) {
  AutoGPU gpu_guard(inputs);
  JIT_ASSERT(inputs.size() == input_desc.size());
  JIT_ASSERT(outputs.size() == output_desc.size());
  size_t flat_outputs_size = 0;
  for(auto & c : concat_desc)
    flat_outputs_size += c.nSubtensors;
  // XXX: this code assumes that inputs are 32-bit addressable
  // XXX: this code assumes that all inputs are of the same size
  JIT_ASSERT(inputs[0].numel() <= std::numeric_limits<uint32_t>::max());
  uint32_t numel = inputs[0].numel();
  at::IntList map_size = inputs[0].sizes();
  // Compute the storage needed to store TensorInfo structs for inputs and outputs.
  size_t uncompressedDim = input_desc.at(0).contiguity.size();
  size_t maxPossibleTensorInfoSize = sizeof(TensorInfo) + 2 * sizeof(uint32_t) * uncompressedDim;
  size_t maxPossibleBufferSize = maxPossibleTensorInfoSize * (inputs.size() + flat_outputs_size);
  std::vector<char> buffer(maxPossibleBufferSize);
  char * buffer_next = buffer.data();
  // A vector of arguments to the kernel: (numel, *input_descs, *output_descs)
  std::vector<void*> arguments;
  arguments.reserve(1 + inputs.size() + flat_outputs_size);
  // Asserts that t's dims can be compressed in the same way as in desc
  // (that's what the kernel assumes), and appends it to the arguments vector.
  auto addTensorInfo = [&](TensorDesc & desc, const at::Tensor & t) {
    size_t nDim = desc.nDim(); // NOTE: this is the compressed dim
    JIT_ASSERT(nDim <= uncompressedDim); // we'd overflow the space otherwise
    auto ti = reinterpret_cast<TensorInfo*>(buffer_next);
    ti->data = t.data_ptr();
    compressContiguous(t.sizes(), t.strides(), desc.contiguity, ti->sizes(nDim), ti->strides(nDim));
    buffer_next += maxPossibleTensorInfoSize;
    arguments.push_back(ti);
  };
  arguments.push_back(&numel);
  for (std::size_t i = 0; i < input_desc.size(); ++i)
    addTensorInfo(input_desc[i], inputs[i]);
  for (std::size_t i = 0; i < output_desc.size(); ++i) {
    auto & c = concat_desc[i];
    at::Tensor o = outputs[i];
    if(c.nSubtensors == 1) {
      o.resize_(map_size);
      addTensorInfo(output_desc[i], outputs[i]);
    } else {
      size_t small_size = map_size[c.dim];
      std::vector<int64_t> concat_size(map_size.begin(), map_size.end());
      concat_size[c.dim] = small_size * c.nSubtensors;
      o.resize_(concat_size);
      size_t offset = 0;
      for(size_t j = 0; j < c.nSubtensors; ++j) {
        // because the concatenated output stays live, the underlying data
        // in this view remains live through the end of this function,
        // so there is no need to hold onto this tensor
        auto view = o.narrow(c.dim, offset, small_size);
        addTensorInfo(*c.subtensorDesc, view);
        offset += small_size;
      }
    }
  }
  launch(numel, arguments.data());
}
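// The resulting `arguments` vector looks like (e.g. for one input and one
// non-concat output):
//   arguments[0] -> &numel
//   arguments[1] -> TensorInfo for input t0 (inside `buffer`)
//   arguments[2] -> TensorInfo for output t1 (inside `buffer`)
// which matches the generated kernel signature
//   ${kernelName}(IndexType totalElements, TensorInfo<...> t0, TensorInfo<...> t1)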
void CompiledFusionFunction::launch(at::ArrayRef<at::Tensor> inputs, std::vector<at::Tensor> & outputs) {
  AutoGPU guard(inputs.back());
  outputs.clear();
  outputs.reserve(outputDescriptors().size());
  for(auto & od : outputDescriptors()) {
    outputs.push_back(at::CUDA(od.scalar_type).tensor());
  }
  launch_with_tensors(inputs, outputs);
}

void CompiledFusionFunction::launch(uint32_t numel, void ** arguments) {
  int numBlocks = std::min(maxBlocks, ceilDiv(numel, blockSize));
  //std::cout << "maxBlocks = " << maxBlocks << " needed blocks: " << ceilDiv(numel,blockSize)
  //          << " numblocks = " << numBlocks;

  // it is possible that this is the first cuda call on this thread,
  // so make sure we initialize the Driver API's context;
  // cudaFree(0) accomplishes this.
  cudaFree(0);
  JIT_CU_CHECK(cuLaunchKernel(
      function,
      numBlocks, 1, 1,
      blockSize, 1, 1,
      0, nullptr,
      arguments,
      nullptr));
}
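// Example numbers (illustrative): with numel = 1,000,000 and blockSize = 128,
// ceilDiv asks for 7813 blocks; numBlocks is capped at maxBlocks (the
// occupancy estimate times the SM count), and the grid-stride loop in the
// generated kernel covers whatever elements the capped grid does not reach
// on its first pass.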
std::shared_ptr<CompiledFusionFunction> FusionCompiler::getOrCompile(AnnotatedGraph & agraph) {
  std::stringstream key;
  key << *agraph.graph << "\n";
  int device;
  JIT_CUDA_CHECK(cudaGetDevice(&device));
  key << "Device " << device << "\n";
  for(auto & i : agraph.input_desc)
    key << i << "\n";
  for(auto & i : agraph.output_desc)
    key << i << "\n";
  std::string key_ = key.str();
  auto it = cache.find(key_);
  if (it == cache.end()) {
    std::string name = "kernel_" + std::to_string(cache.size());
    auto func = std::make_shared<CompiledFusionFunction>(name, agraph);
    it = cache.emplace(key_, std::move(func)).first;
  }
  return it->second;
}
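// Note that the key combines the printed graph, the device index, and every
// input/output descriptor, so (for example) running the same graph with a
// different contiguity pattern compiles and caches a distinct kernel.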
std::shared_ptr<CompiledFusionFunction> FusionCompiler::getOrCompile(Graph & graph) {
  AnnotatedGraph agraph { &graph };
  for(auto & input : graph.inputs()) {
    agraph.input_desc.emplace_back(input->type()->expect<TensorType>());
  }
  for(auto & output : graph.outputs()) {
    agraph.output_desc.emplace_back(output->type()->expect<TensorType>());
  }
  return getOrCompile(agraph);
}
void FusionCompiler::debugLaunchGraph(Graph & graph, at::ArrayRef<at::Tensor> inputs, at::ArrayRef<at::Tensor> outputs) {
  AnnotatedGraph agraph { &graph };
  for(auto & i : inputs) {
    agraph.input_desc.emplace_back(i);
  }
  for(auto & i : outputs) {
    agraph.output_desc.emplace_back(i);
  }
  auto func = getOrCompile(agraph);
  func->launch_with_tensors(inputs, outputs);
}
// TODO: thread safety
FusionCompiler & sharedFusionCompiler() {
  static FusionCompiler compiler;
  return compiler;
}
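// Minimal usage sketch (assuming a fusable `graph` whose inputs and outputs
// already carry TensorType information, and CUDA `inputs` on the current
// device):
//   auto fn = sharedFusionCompiler().getOrCompile(graph);
//   std::vector<at::Tensor> outputs;
//   fn->launch(inputs, outputs); // allocates outputs, then runs the kernel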
}}