1621 lines
55 KiB
C++
#ifndef _WIN32
|
|
#include "torch/csrc/jit/fusion_compiler.h"
|
|
|
|
#include "torch/csrc/jit/ir.h"
|
|
#include "torch/csrc/jit/code_template.h"
|
|
#include "torch/csrc/jit/resource_guard.h"
|
|
#include "torch/csrc/jit/constants.h"
|
|
#include "torch/csrc/jit/passes/shape_analysis.h"
|
|
#include "torch/csrc/jit/custom_operator.h"
|
|
|
|
#include "torch/csrc/utils/disallow_copy.h"
|
|
#include "torch/csrc/variable_tensor_functions.h"
|
|
#include "torch/csrc/utils/hash.h"
|
|
#include <torch/csrc/jit/assertions.h>
|
|
|
|
#include "ATen/ATen.h"
|
|
#include "ATen/ExpandUtils.h"
|
|
#include "ATen/WrapDimUtils.h"
|
|
|
|
#ifdef USE_CUDA
|
|
#include "ATen/cuda/CUDAContext.h"
|
|
#include "THC/THC.h"
|
|
#include <THC/THCGenerator.hpp>
|
|
#include "torch/csrc/cuda/cuda_check.h"
|
|
#include <nvrtc.h>
|
|
#include <cuda.h>
|
|
#include <cuda_runtime.h>
|
|
#endif
|
|
|
|
#include <string>
|
|
#include <algorithm>
|
|
#include <unordered_map>
|
|
#include <unordered_set>
|
|
#include <vector>
|
|
#include <sstream>
|
|
#include <iostream>
|
|
#include <dlfcn.h>
|
|
#include <unistd.h>
|
|
|
|
#ifdef USE_CUDA
|
|
THCGenerator* THCRandom_getGenerator(THCState* state);
|
|
#endif
|
|
|
|
namespace torch { namespace jit {
|
|
|
|
std::vector<bool> TensorDesc::findContiguous(
|
|
const at::IntList& sizes,
|
|
const at::IntList& strides) {
|
|
JIT_ASSERT(sizes.size() == strides.size());
|
|
std::vector<bool> cont(sizes.size());
|
|
for(size_t i = 0; i < sizes.size(); ++i) {
|
|
int64_t expected_stride = (i + 1 < sizes.size()) ? sizes[i+1]*strides[i+1] : 1;
|
|
cont[i] = strides[i] == expected_stride;
|
|
}
|
|
return cont;
|
|
}
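// A worked example (illustrative only, not exercised anywhere in this file):
// for sizes = [2, 3, 4] with strides = [12, 4, 1] every stride matches the
// expected value, so findContiguous returns {true, true, true}; for a
// transposed layout with sizes = [3, 2] and strides = [1, 3] neither stride
// matches, so it returns {false, false}.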
|
|
|
|
// Descriptor for chunk-ing an input tensor into subtensors
|
|
// OR concat-ing an output tensor from subtensors
|
|
struct PartitionDesc {
|
|
size_t nSubtensors; // == 1 for tensors that should not be operated on via chunk/cat
|
|
size_t dim; // dimension along which the chunk/concat occurs
|
|
std::unique_ptr<TensorDesc> subtensorDesc; // descriptor for the subtensor, if it exists
|
|
PartitionDesc()
|
|
: nSubtensors(1), dim(0) {}
|
|
|
|
PartitionDesc(const TensorDesc & desc, size_t nSubtensors, size_t dim)
|
|
: nSubtensors(nSubtensors), dim(dim) {
|
|
JIT_ASSERT(nSubtensors > 1);
|
|
std::vector<bool> cont = desc.contiguity;
|
|
if(dim > 0) {
|
|
// when we narrow the concatenated output/chunked input
|
|
// we make the size[dim] smaller while keeping the stride[dim] the same,
|
|
// meaning: stride[dim - 1] != stride[dim]*size[dim]
|
|
// so dim - 1 is no longer contiguous
|
|
cont[dim - 1] = false;
|
|
}
|
|
subtensorDesc.reset(new TensorDesc(desc.scalar_type, cont));
|
|
}
|
|
|
|
bool isNoop() const {
|
|
return nSubtensors == 1;
|
|
}
|
|
};
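// Illustrative example (an assumed typical use, not exercised here): chunking a
// contiguous float tensor of size [2, 6] into 3 pieces along dim 1 yields
// PartitionDesc{nSubtensors = 3, dim = 1} whose subtensorDesc has contiguity
// {false, true}: dim 0 loses contiguity because each subtensor keeps the
// stride of the full tensor.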
|
|
|
|
struct FusedKernel {
|
|
TH_DISALLOW_COPY_AND_ASSIGN(FusedKernel);
|
|
|
|
FusedKernel(const std::string & name, AnnotatedGraph & agraph);
|
|
virtual ~FusedKernel() = default;
|
|
|
|
// expects outputs to be pre-allocated
|
|
void launch_with_tensors(at::ArrayRef<at::Tensor> inputs, at::ArrayRef<at::Tensor> outputs);
|
|
|
|
// creates new tensors for outputs
|
|
void launch(at::ArrayRef<at::Tensor> inputs, std::vector<at::Tensor> & outputs);
|
|
const std::vector<TensorDesc> & outputDescriptors() const {
|
|
return output_desc;
|
|
}
|
|
protected:
|
|
virtual at::Backend backend() const = 0;
|
|
|
|
// arguments is a list of pointers to the arguments for the compiled CUDA/CPU
|
|
// code.
|
|
// The format of arguments is suitable for directly passing to a call to
|
|
// cuLaunchKernel as the kernel arguments.
|
|
// Currently the first argument is a pointer to numel (for passing to
|
|
// CUDA code), and the remainder are pointers to the TensorInfo<T> structs
|
|
// that compiled code uses to load Tensor data.
|
|
// launch_with_tensors handles packing at::Tensors into this arguments array.
|
|
// CPU code uses the same convention so that launch_with_tensors can be shared.
|
|
virtual void launch_raw(uint32_t numel, void ** arguments) = 0;
|
|
|
|
virtual uint64_t get_rand_offset(uint32_t numel) = 0;
|
|
bool has_random;
|
|
std::string name;
|
|
// We keep these around for debugging
|
|
std::string compilation_unit;
|
|
std::vector<TensorDesc> input_desc;
|
|
std::vector<TensorDesc> output_desc;
|
|
|
|
// same size as output_desc, describes whether
|
|
// an output is actually a concatenation of
|
|
// many subtensors that the fusion group produces
|
|
std::vector<PartitionDesc> concat_desc;
|
|
|
|
// same size as input_desc, describes whether an
|
|
// input should be broken into subtensors (chunks)
|
|
// to be consumed by the fusion group
|
|
std::vector<PartitionDesc> chunk_desc;
|
|
};
|
|
|
|
|
|
namespace {
|
|
|
|
#ifdef USE_CUDA
|
|
|
|
static int ceilDiv(int a, int b) {
|
|
return (a + b - 1) / b;
|
|
}
|
|
|
|
#endif
|
|
|
|
Node* usedInFusedChunk(Value * input) {
|
|
auto uses = input->uses();
|
|
if (uses.size() == 1) {
|
|
Node *user = uses[0].user;
|
|
if (user->kind() == prim::FusedChunk) {
|
|
return user;
|
|
}
|
|
}
|
|
return nullptr;
|
|
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
// Code generation
|
|
|
|
namespace codegen {
|
|
|
|
/* Because type_as does not check the type of its input, a fusion group can have a non-fp32 tensor as input.
Correct code for this case is generated; however, nvrtc does not know how to handle the int*_t integer types,
so these typedefs help it handle those cases. */
|
|
|
|
auto type_declarations_template = CodeTemplate(R"(
|
|
#if defined(__CUDACC_RTC__)
|
|
typedef unsigned char uint8_t;
|
|
typedef signed char int8_t;
|
|
typedef short int int16_t;
|
|
typedef long long int int64_t;
|
|
${HalfHeader}
|
|
${RandHeader}
|
|
#endif
|
|
typedef ${IndexType} IndexType;
|
|
template<typename T, size_t N>
|
|
struct TensorInfo {
|
|
T * data;
|
|
IndexType sizes[N];
|
|
IndexType strides[N];
|
|
};
|
|
)");
|
|
|
|
// We rewrite the code for philox RNG from curand as nvrtc couldn't resolve the
|
|
// curand header correctly.
|
|
constexpr auto rand_support_literal = R"(
|
|
|
|
class Philox {
|
|
public:
|
|
__device__ inline Philox(unsigned long long seed,
|
|
unsigned long long subsequence,
|
|
unsigned long long offset) {
|
|
key.x = (unsigned int)seed;
|
|
key.y = (unsigned int)(seed >> 32);
|
|
counter = make_uint4(0, 0, 0, 0);
|
|
counter.z = (unsigned int)(subsequence);
|
|
counter.w = (unsigned int)(subsequence >> 32);
|
|
STATE = 0;
|
|
incr_n(offset / 4);
|
|
}
|
|
|
|
__device__ inline unsigned long operator()() {
|
|
if(STATE == 0) {
|
|
uint4 counter_ = counter;
|
|
uint2 key_ = key;
|
|
for(int i = 0; i < 9; i++) {
|
|
counter_ = single_round(counter_, key_);
|
|
key_.x += (kPhilox10A); key_.y += (kPhilox10B);
|
|
}
|
|
output = single_round(counter_, key_);
|
|
incr();
|
|
}
|
|
unsigned long ret;
|
|
switch(STATE) {
|
|
case 0: ret = output.x; break;
|
|
case 1: ret = output.y; break;
|
|
case 2: ret = output.z; break;
|
|
case 3: ret = output.w; break;
|
|
}
|
|
STATE = (STATE + 1) % 4;
|
|
return ret;
|
|
}
|
|
|
|
private:
|
|
uint4 counter;
|
|
uint4 output;
|
|
uint2 key;
|
|
unsigned int STATE;
|
|
__device__ inline void incr_n(unsigned long long n) {
|
|
unsigned int nlo = (unsigned int)(n);
|
|
unsigned int nhi = (unsigned int)(n >> 32);
|
|
counter.x += nlo;
|
|
if (counter.x < nlo)
|
|
nhi++;
|
|
counter.y += nhi;
|
|
if (nhi <= counter.y)
|
|
return;
|
|
if (++counter.z)
|
|
return;
|
|
++counter.w;
|
|
}
|
|
__device__ inline void incr() {
|
|
if (++counter.x)
|
|
return;
|
|
if (++counter.y)
|
|
return;
|
|
if (++counter.z)
|
|
return;
|
|
++counter.w;
|
|
}
|
|
__device__ unsigned int mulhilo32(unsigned int a, unsigned int b,
|
|
unsigned int *result_high) {
|
|
*result_high = __umulhi(a, b);
|
|
return a*b;
|
|
}
|
|
|
|
__device__ inline uint4 single_round(uint4 ctr, uint2 key) {
|
|
unsigned int hi0;
|
|
unsigned int hi1;
|
|
unsigned int lo0 = mulhilo32(kPhiloxSA, ctr.x, &hi0);
|
|
unsigned int lo1 = mulhilo32(kPhiloxSB, ctr.z, &hi1);
|
|
|
|
uint4 ret = {hi1 ^ ctr.y ^ key.x, lo1, hi0 ^ ctr.w ^ key.y, lo0};
|
|
return ret;
|
|
}
|
|
|
|
static const unsigned long kPhilox10A = 0x9E3779B9;
|
|
static const unsigned long kPhilox10B = 0xBB67AE85;
|
|
static const unsigned long kPhiloxSA = 0xD2511F53;
|
|
static const unsigned long kPhiloxSB = 0xCD9E8D57;
|
|
};
|
|
|
|
// Inverse of 2^32.
|
|
#define M_RAN_INVM32 2.3283064e-10f
|
|
__device__ __inline__ float uniform(unsigned int x) {
|
|
return x * M_RAN_INVM32;
|
|
}
|
|
)";
|
|
|
|
constexpr auto rand_param = ",unsigned long long seed, unsigned long long offset";
|
|
constexpr auto rand_init = R"(
|
|
int idx = blockIdx.x*blockDim.x + threadIdx.x;
|
|
Philox rnd(seed, idx, offset);
|
|
)";
|
|
auto cuda_compilation_unit_template = CodeTemplate(R"(
|
|
${type_declarations}
|
|
|
|
extern "C" __global__
|
|
void ${kernelName}(IndexType totalElements, ${formals} ${RandParam}) {
|
|
${RandInit}
|
|
for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x;
|
|
linearIndex < totalElements;
|
|
linearIndex += gridDim.x * blockDim.x) {
|
|
// Convert `linearIndex` into an offset of tensor:
|
|
${tensorOffsets}
|
|
// calculate the results
|
|
${kernelBody}
|
|
}
|
|
}
|
|
)");
|
|
|
|
auto cpu_compilation_unit_template = CodeTemplate(R"(
|
|
#include <cstddef>
|
|
#include <cstdint>
|
|
#include <math.h>
|
|
${type_declarations}
|
|
|
|
#define OMP_THRESHOLD 100000
|
|
static void ${kernelName}_kernel(IndexType totalElements, ${formals}) {
|
|
#pragma omp parallel for if(totalElements > OMP_THRESHOLD)
|
|
for (IndexType linearIndex = 0;
|
|
linearIndex < totalElements;
|
|
linearIndex += 1) {
|
|
// Convert `linearIndex` into an offset of tensor:
|
|
${tensorOffsets}
|
|
// calculate the results
|
|
${kernelBody}
|
|
}
|
|
}
|
|
|
|
extern "C"
|
|
void ${kernelName}(IndexType totalElements, void ** args) {
|
|
${kernelName}_kernel(totalElements ${,argument_loads});
|
|
}
|
|
)");
|
|
|
|
// This snippet enables half support in the jit. Following the pattern for
|
|
// reductions, fp16 input data is immediately upconverted to float
|
|
// with __half2float(). All mathematical operations are done on float
|
|
// values, and if needed the intermediate float representation is
|
|
// converted to half with __float2half() when writing to a half tensor.
|
|
constexpr auto half_support_literal = R"(
|
|
#define __HALF_TO_US(var) *(reinterpret_cast<unsigned short *>(&(var)))
|
|
#define __HALF_TO_CUS(var) *(reinterpret_cast<const unsigned short *>(&(var)))
|
|
#if defined(__cplusplus)
|
|
struct __align__(2) __half {
|
|
__host__ __device__ __half() { }
|
|
|
|
protected:
|
|
unsigned short __x;
|
|
};
|
|
|
|
/* All intrinsic functions are only available to nvcc compilers */
|
|
#if defined(__CUDACC__)
|
|
/* Definitions of intrinsics */
|
|
__device__ __half __float2half(const float f) {
|
|
__half val;
|
|
asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(f));
|
|
return val;
|
|
}
|
|
|
|
__device__ float __half2float(const __half h) {
|
|
float val;
|
|
asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(__HALF_TO_CUS(h)));
|
|
return val;
|
|
}
|
|
#endif /* defined(__CUDACC__) */
|
|
#endif /* defined(__cplusplus) */
|
|
#undef __HALF_TO_US
|
|
#undef __HALF_TO_CUS
|
|
|
|
typedef __half half;
|
|
)";
|
|
|
|
// curDimIndex = linearId % sizes[i]; // % sizes[i] is not needed for d == 0, because we already guard for numel outside the index calculation
|
|
// offset += curDimIndex*strides[i]; // *strides[i] is optional if last_is_cont because strides.back() == 1
|
|
// linearId /= sizes[i];
|
|
auto dim_calc = CodeTemplate(R"(
|
|
//printf("tensor ${tensor} sizes[${d}] = %d, strides[${d}] = %d\n", ${tensor}.sizes[${d}],${tensor}.strides[${d}]);
|
|
size_t ${tensor}_dimIndex${d} = ${tensor}_linearIndex ${mod_sizes};
|
|
${tensor}_offset += ${tensor}_dimIndex${d} ${times_stride};
|
|
)");
|
|
|
|
static void emitIndexingFor(std::ostream & out, const std::string & tensor, int ndim, bool last_is_cont) {
|
|
TemplateEnv env;
|
|
env.s("tensor",tensor);
|
|
out << format("IndexType ${tensor}_offset = 0;\n",env);
|
|
out << format("IndexType ${tensor}_linearIndex = linearIndex;\n",env);
|
|
for(int d = ndim - 1; d >= 0; --d) {
|
|
env.d("d",d);
|
|
env.s("mod_sizes", d > 0 ? format("% ${tensor}.sizes[${d}]",env) : "");
|
|
env.s("times_stride",(d < ndim - 1 || !last_is_cont) ?
|
|
format("* ${tensor}.strides[${d}]",env) : "");
|
|
out << dim_calc.format(env);
|
|
if(d > 0) {
|
|
out << format("${tensor}_linearIndex /= ${tensor}.sizes[${d}];\n",env);
|
|
}
|
|
}
|
|
}
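// For illustration, for a hypothetical formal "t0" with ndim == 2 and a
// contiguous last dimension, the emitted indexing code looks roughly like this
// (the commented-out printf from dim_calc is elided):
//
//   IndexType t0_offset = 0;
//   IndexType t0_linearIndex = linearIndex;
//   size_t t0_dimIndex1 = t0_linearIndex % t0.sizes[1];
//   t0_offset += t0_dimIndex1 ;
//   t0_linearIndex /= t0.sizes[1];
//   size_t t0_dimIndex0 = t0_linearIndex ;
//   t0_offset += t0_dimIndex0 * t0.strides[0];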
|
|
|
|
static std::string valueName(Value * n) {
|
|
return "n" + std::to_string(n->unique());
|
|
}
|
|
|
|
static std::string scalarValue(int64_t v) {
|
|
return std::to_string(v);
|
|
}
|
|
|
|
static std::string scalarValue(double v) {
|
|
std::ostringstream out;
|
|
out << std::scientific << v << "f";
|
|
return out.str();
|
|
}
|
|
|
|
static const char * scalarTypeName(at::ScalarType type) {
|
|
if (type == at::ScalarType::Half) {
|
|
return "half";
|
|
}
|
|
|
|
switch(type) {
|
|
#define DEFINE_CASE(ctype,name,_) \
|
|
case at::ScalarType::name: return #ctype;
|
|
AT_FORALL_SCALAR_TYPES_EXCEPT_HALF(DEFINE_CASE)
|
|
#undef DEFINE_CASE
|
|
default:
|
|
throw std::runtime_error("unknown scalar type");
|
|
}
|
|
}
|
|
|
|
std::string encodeRHS(Node * n) {
|
|
static std::unordered_map<NodeKind, std::string> simple_map_ops = {
|
|
// unary
|
|
{aten::abs, "absf(${0})"},
|
|
{aten::sigmoid, "1.f / (1.f + expf(-${0}))"},
|
|
{aten::relu, "${0} < 0 ? 0.f : ${0} "},
|
|
{aten::log, "logf(${0})"},
|
|
{aten::log10, "log10f(${0})"},
|
|
{aten::log1p, "log1pf(${0})"},
|
|
{aten::log2, "log2f(${0})"},
|
|
{aten::lgamma, "lgammaf(${0})"},
|
|
{aten::exp, "expf(${0})"},
|
|
{aten::expm1, "expm1f(${0})"},
|
|
{aten::cos, "cosf(${0})"},
|
|
{aten::acos, "acosf(${0})"},
|
|
{aten::cosh, "coshf(${0})"},
|
|
{aten::sin, "sinf(${0})"},
|
|
{aten::asin, "asinf(${0})"},
|
|
{aten::sinh, "sinhf(${0})"},
|
|
{aten::tan, "tanf(${0})"},
|
|
{aten::atan, "atanf(${0})"},
|
|
{aten::tanh, "tanhf(${0})"},
|
|
{aten::sqrt, "sqrtf(${0})"},
|
|
{aten::rsqrt, "rsqrtf(${0})"},
|
|
{aten::ceil, "ceilf(${0})"},
|
|
{aten::floor, "floorf(${0})"},
|
|
{aten::round, "roundf(${0})"},
|
|
{aten::trunc, "truncf(${0})"},
|
|
{aten::frac, "fracf(${0})"},
|
|
{aten::reciprocal, "reciprocalf(${0})"},
|
|
{aten::neg, "-${0}"},
|
|
//simple binary
|
|
{aten::atan2, "atan2(${0}, ${1})"},
|
|
{aten::min, "fminf(${0}, ${1})"},
|
|
{aten::max, "fmaxf(${0}, ${1})"},
|
|
|
|
//binary with other
|
|
// TODO: some of these ops will not get generated because
|
|
// we only work on float inputs/outputs, but they are here to record
|
|
// that they are valid mappable ops once we handle more types
|
|
{aten::__and__, "${0} && ${1}"},
|
|
{aten::__lshift__, "${0} << ${1}"},
|
|
{aten::__or__, "${0} || ${1}"},
|
|
{aten::__rshift__, "${0} >> ${1}"},
|
|
{aten::__xor__, "${0} ^ ${1}"},
|
|
{aten::div, "${0} / ${1}"},
|
|
{aten::eq, "${0} == ${1}"},
|
|
{aten::fmod, "fmodf(${0}, ${1})"},
|
|
{aten::ge, "(${0} >= ${1})"},
|
|
{aten::gt, "${0} > ${1}"},
|
|
{aten::le, "(${0} <= ${1})"},
|
|
{aten::lt, "${0} < ${1}"},
|
|
{aten::type_as, "(${0})"}, //everything is implicitly convertible to float
|
|
{aten::mul, "${0} * ${1}"},
|
|
{aten::ne, "${0} != ${1}"},
|
|
{aten::remainder, "remainderf(${0}, ${1})"},
|
|
{aten::pow, "powf(${0}, ${1})"},
|
|
|
|
//alpha
|
|
{aten::add, "${0} + ${2}*${1}"},
|
|
{aten::sub, "(${0} - ${2}*${1})"},
|
|
{aten::rand_like, "uniform(rnd())"},
|
|
|
|
// simple derivatives
|
|
{aten::_sigmoid_backward, "${0} * ${1} * (1.f - ${1})"},
|
|
{aten::_tanh_backward, "${0} * (1.f - ${1} * ${1})"},
|
|
};
|
|
|
|
if (n->kind() == prim::Constant) {
|
|
auto val = toIValue(n->output()).value();
|
|
if (val.isDouble()) {
|
|
return scalarValue(val.toDouble());
|
|
} else {
|
|
JIT_ASSERT(val.isInt());
|
|
return scalarValue(val.toInt());
|
|
}
|
|
}
|
|
|
|
TemplateEnv env;
|
|
size_t i = 0;
|
|
for(auto in : n->inputs()) {
|
|
env.s(std::to_string(i++), valueName(in));
|
|
}
|
|
|
|
const auto & str = simple_map_ops.at(n->kind());
|
|
return format(str, env);
|
|
}
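// For illustration (hypothetical value names): if node n is aten::mul and its
// inputs have unique ids 1 and 2, encodeRHS returns "n1 * n2", and the node
// loop below emits a line like "auto n3 = n1 * n2;".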
|
|
|
|
// Returns: (input chunk metadata, output concat metadata, is_random)
|
|
std::tuple<std::vector<PartitionDesc>,std::vector<PartitionDesc>,bool> emitCompilationUnit(
|
|
std::ostream& out,
|
|
const std::string& name,
|
|
AnnotatedGraph& agraph,
|
|
bool use_cuda) {
|
|
bool has_random = false;
|
|
Graph& subgraph = *agraph.graph;
|
|
TemplateEnv env;
|
|
env.s("kernelName",name);
|
|
// TODO: handle cases where we need to generate > 2^32 element tensors
|
|
env.s("IndexType","unsigned int"); //avoiding slow header includes to get uint32_t
|
|
|
|
std::stringstream body;
|
|
std::stringstream tensorOffsets;
|
|
std::vector<std::string> formals;
|
|
std::vector<std::string> argument_loads;
|
|
auto emitFormal = [&](Value * n, const TensorDesc & desc) {
|
|
std::string tensor = "t" + std::to_string(formals.size()); //can't be unique() because Param may be an output
|
|
size_t nDim = desc.nDim();
|
|
emitIndexingFor(tensorOffsets, tensor, nDim, desc.lastIsContiguous());
|
|
env.s("tensor",tensor);
|
|
env.d("formal_index", formals.size() + 1); // + 1 because the first argument is the linearIndex
|
|
env.d("nDim",nDim);
|
|
env.s("scalar_type",scalarTypeName(desc.scalar_type));
|
|
formals.push_back(format("TensorInfo<${scalar_type},${nDim}> ${tensor}",env));
|
|
argument_loads.push_back(format("*static_cast<TensorInfo<${scalar_type},${nDim}>*>(args[${formal_index}])",env));
|
|
};
|
|
|
|
std::vector<PartitionDesc> chunk_desc;
|
|
std::vector<std::pair<Value*,TensorDesc&>> flat_inputs;
|
|
{
|
|
size_t input_index = 0;
|
|
for(auto p : subgraph.inputs()) {
|
|
if (Node * chunk = usedInFusedChunk(p)) {
|
|
int64_t dim = chunk->i(attr::dim);
|
|
int64_t chunks = chunk->i(attr::chunks);
|
|
chunk_desc.emplace_back(agraph.input_desc[input_index++], chunks, dim);
|
|
for (auto * o : chunk->outputs()) {
|
|
flat_inputs.emplace_back(o, *chunk_desc.back().subtensorDesc);
|
|
}
|
|
} else {
|
|
chunk_desc.emplace_back();
|
|
flat_inputs.emplace_back(p, agraph.input_desc[input_index++]);
|
|
}
|
|
}
|
|
for (auto & input : flat_inputs) {
|
|
emitFormal(input.first, input.second);
|
|
}
|
|
}
|
|
|
|
std::vector<PartitionDesc> concat_desc;
|
|
std::vector<std::pair<Value*,TensorDesc>> flat_output_nodes;
|
|
{
|
|
size_t i = 0;
|
|
for(auto o : subgraph.outputs()) {
|
|
auto & desc = agraph.output_desc[i++];
|
|
if(o->node()->kind() != prim::FusedConcat) {
|
|
emitFormal(o, desc);
|
|
concat_desc.emplace_back();
|
|
flat_output_nodes.emplace_back(o, desc);
|
|
} else {
|
|
auto cat = o->node();
|
|
concat_desc.emplace_back(desc, cat->inputs().size(), cat->i(attr::dim));
|
|
for(auto c : cat->inputs()) {
|
|
emitFormal(c, *concat_desc.back().subtensorDesc);
|
|
flat_output_nodes.emplace_back(c, desc);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
bool has_half_tensor = false;
|
|
size_t formal_count = 0;
|
|
for(auto input : flat_inputs) {
|
|
auto p = input.first;
|
|
env.s("node",valueName(p));
|
|
env.d("formal",formal_count++);
|
|
|
|
// Acquires and converts (if needed) inputs
|
|
bool is_half = input.second.scalar_type == at::ScalarType::Half;
|
|
if (is_half) {
|
|
AT_ASSERT(use_cuda);
|
|
env.s(
|
|
"access"
|
|
, format("__half2float(t${formal}.data[t${formal}_offset])", env));
|
|
has_half_tensor = true;
|
|
} else {
|
|
env.s("access", format("t${formal}.data[t${formal}_offset]", env));
|
|
}
|
|
|
|
//TODO: actual type propagation rather than relying on auto..
|
|
body << format("auto ${node} = ${access};\n",env);
|
|
}
|
|
|
|
for(auto n : subgraph.nodes()) {
|
|
// FusedConcat nodes work by narrowing the output Tensors before the kernel runs
|
|
if (n->kind() == prim::FusedConcat)
|
|
continue;
|
|
if (n->kind() == prim::FusedChunk)
|
|
continue;
|
|
if(n->kind() == aten::rand_like) {
|
|
has_random = true;
|
|
if(!use_cuda)
|
|
throw std::runtime_error("Fusion doesn't support rand on CPU");
|
|
}
|
|
env.s("node",valueName(n->output()));
|
|
env.s("rhs", encodeRHS(n));
|
|
body << format("auto ${node} = ${rhs};\n",env);
|
|
}
|
|
|
|
for(auto output : flat_output_nodes) {
|
|
auto o = output.first;
|
|
env.d("formal",formal_count++);
|
|
env.s("access",format("t${formal}.data[t${formal}_offset]",env));
|
|
env.s("node",valueName(o));
|
|
|
|
// Acquires and converts (if needed) outputs
|
|
bool is_half = output.second.scalar_type == at::ScalarType::Half;
|
|
if (is_half) {
|
|
AT_ASSERT(use_cuda);
|
|
body << format("${access} = __float2half(${node});\n",env);
|
|
has_half_tensor = true;
|
|
} else {
|
|
body << format("${access} = ${node};\n",env);
|
|
}
|
|
}
|
|
|
|
// Includes half support if any half tensors are involved
|
|
if (has_half_tensor) {
|
|
env.s("HalfHeader", half_support_literal);
|
|
} else {
|
|
env.s("HalfHeader", "");
|
|
}
|
|
|
|
if (has_random) {
|
|
env.s("RandHeader", rand_support_literal);
|
|
env.s("RandParam", rand_param);
|
|
env.s("RandInit", rand_init);
|
|
} else {
|
|
env.s("RandHeader", "");
|
|
env.s("RandParam", "");
|
|
env.s("RandInit", "");
|
|
}
|
|
|
|
env.s("tensorOffsets",tensorOffsets.str());
|
|
env.s("kernelBody",body.str());
|
|
env.v("formals",formals);
|
|
env.v("argument_loads",argument_loads);
|
|
env.s("type_declarations", type_declarations_template.format(env));
|
|
if(use_cuda) {
|
|
out << cuda_compilation_unit_template.format(env);
|
|
} else {
|
|
out << cpu_compilation_unit_template.format(env);
|
|
}
|
|
|
|
return std::make_tuple(std::move(chunk_desc), std::move(concat_desc), has_random);
|
|
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
} // codegen namespace
|
|
} // anonymous namespace
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
// FusedKernel
|
|
|
|
// Host-side view of TensorInfo (the version visible to the kernel is defined above).
// Note the zero-length sizes_strides array - we need to dynamically allocate the dims.
|
|
struct TensorInfo {
|
|
void * data;
|
|
#pragma GCC diagnostic ignored "-Wpedantic"
|
|
uint32_t sizes_strides[0];
|
|
#pragma GCC diagnostic pop
|
|
|
|
uint32_t* sizes(size_t nDim) { return &sizes_strides[0]; }
|
|
uint32_t* strides(size_t nDim) { return &sizes_strides[nDim]; }
|
|
};
|
|
|
|
FusedKernel::FusedKernel(const std::string & name, AnnotatedGraph & agraph)
|
|
: name(name)
|
|
, input_desc(agraph.input_desc)
|
|
, output_desc(agraph.output_desc) {}
|
|
|
|
namespace {
|
|
|
|
// Tries to compress sizes and strides according to cont. Emits the result to
// c_sizes and c_strides, and throws an error on failure (if it can't compress).
|
|
void compressContiguous(
|
|
at::IntList sizes,
|
|
at::IntList strides,
|
|
const std::vector<bool> & cont,
|
|
uint32_t * c_sizes,
|
|
uint32_t * c_strides) {
|
|
size_t compressed_dims = 0;
|
|
size_t cur = 0;
|
|
size_t ndim = sizes.size();
|
|
while(cur < ndim) {
|
|
size_t total_size = sizes[cur];
|
|
cur++;
|
|
while(cont[cur-1] && cur < ndim) {
|
|
JIT_ASSERT(strides[cur-1] == sizes[cur]*strides[cur]);
|
|
total_size *= sizes[cur];
|
|
cur++;
|
|
}
|
|
// cur starts pointing at the beginning of run to compress
|
|
// cur ends one _after_ the terminating false or end of list.
|
|
// total_size is the size of all dimensions [begin,end)
|
|
// examples:
|
|
// f = not cont.
|
|
// t = cont.
|
|
// x = don't care, including past end of list
|
|
// s = start of cur
|
|
// e = end of cur
|
|
|
|
|
|
// f x x x
|
|
// s e
|
|
|
|
// t f x x
|
|
// s e
|
|
|
|
// t t f x
|
|
// s e
|
|
|
|
c_sizes[compressed_dims] = total_size;
|
|
c_strides[compressed_dims] = strides[cur-1];
|
|
compressed_dims++;
|
|
}
|
|
JIT_ASSERT(!cont.back() || strides.back() == 1);
|
|
}
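// Worked examples (illustrative only): with sizes = [2, 3, 4],
// strides = [12, 4, 1] and cont = {true, true, true} the whole tensor collapses
// to c_sizes = [24], c_strides = [1]; with strides = [24, 4, 1] (so
// cont = {false, true, true}) only the last two dims collapse, giving
// c_sizes = [2, 12], c_strides = [24, 1].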
|
|
|
|
} // anonymous namespace
|
|
|
|
// XXX: Assumes that after at::chunk, all inputs are the same size
|
|
static std::vector<int64_t> computeMapSize(
|
|
const at::Tensor& tensor,
|
|
const PartitionDesc& chunkDesc) {
|
|
std::vector<int64_t> sizes(tensor.sizes().begin(), tensor.sizes().end());
|
|
// Should have been checked in graph fuser
|
|
JIT_ASSERT(sizes[chunkDesc.dim] % chunkDesc.nSubtensors == 0);
|
|
sizes[chunkDesc.dim] /= chunkDesc.nSubtensors;
|
|
return sizes;
|
|
}
|
|
|
|
// XXX: this code assumes that inputs are 32-bit addressable
|
|
static uint32_t computeNumel(at::ArrayRef<int64_t> sizes) {
|
|
uint32_t result = 1;
|
|
if (sizes.size() == 0) {
|
|
return 1; // scalar tensor
|
|
}
|
|
for (int64_t size : sizes) {
|
|
result *= size;
|
|
}
|
|
return result;
|
|
}
|
|
|
|
void FusedKernel::launch_with_tensors(at::ArrayRef<at::Tensor> inputs, at::ArrayRef<at::Tensor> outputs) {
|
|
at::DeviceGuard device_guard(inputs);
|
|
JIT_ASSERT(inputs.size() == input_desc.size());
|
|
JIT_ASSERT(outputs.size() == output_desc.size());
|
|
size_t flat_inputs_size = 0;
|
|
size_t flat_outputs_size = 0;
|
|
for(auto & c : chunk_desc)
|
|
flat_inputs_size += c.nSubtensors;
|
|
for(auto & c : concat_desc)
|
|
flat_outputs_size += c.nSubtensors;
|
|
// XXX: this code assumes that inputs are 32-bit addressable
|
|
// XXX: this code assumes that all inputs are of the same size
|
|
JIT_ASSERT(inputs[0].numel() <= std::numeric_limits<uint32_t>::max());
|
|
|
|
// Compute map_size, numel from the first input
|
|
at::IntList map_size;
|
|
uint32_t numel;
|
|
std::vector<int64_t> keep_alive_size;
|
|
if (chunk_desc[0].isNoop()) {
|
|
map_size = inputs[0].sizes();
|
|
numel = inputs[0].numel();
|
|
} else {
|
|
keep_alive_size = computeMapSize(inputs[0], chunk_desc[0]);
|
|
map_size = keep_alive_size;
|
|
numel = computeNumel(map_size);
|
|
}
|
|
|
|
// Compute the storage needed to store TensorInfo structs for inputs and outputs.
|
|
size_t uncompressedDim = input_desc.at(0).contiguity.size();
|
|
size_t maxPossibleTensorInfoSize = sizeof(TensorInfo) + 2 * sizeof(uint32_t) * uncompressedDim;
|
|
size_t maxPossibleBufferSize = maxPossibleTensorInfoSize * (flat_inputs_size + flat_outputs_size);
|
|
std::vector<char> buffer(maxPossibleBufferSize);
|
|
char * buffer_next = buffer.data();
|
|
// A vector of arguments to the kernel. It's (numel, *input_descs, *output_descs)
|
|
std::vector<void*> arguments;
|
|
arguments.reserve(3 + flat_inputs_size + flat_outputs_size);
|
|
auto addTensorInfoRaw = [&](TensorDesc & desc, void* data_ptr, at::IntList sizes, at::IntList strides) {
|
|
size_t nDim = desc.nDim(); // NOTE: this is the compressed dim
|
|
JIT_ASSERT(nDim <= uncompressedDim); // We'd overflow the space otherwise
|
|
auto ti = reinterpret_cast<TensorInfo*>(buffer_next);
|
|
ti->data = data_ptr;
|
|
compressContiguous(sizes, strides, desc.contiguity, ti->sizes(nDim), ti->strides(nDim));
|
|
buffer_next += maxPossibleTensorInfoSize;
|
|
arguments.push_back(ti);
|
|
};
|
|
// Asserts that t's dims can be compressed in the same way as in desc
|
|
// (that's what the kernel assumes), and appends it to the arguments vector.
|
|
auto addTensorInfo = [&](TensorDesc & desc, const at::Tensor & t) {
|
|
addTensorInfoRaw(desc, t.data_ptr(), t.sizes(), t.strides());
|
|
};
|
|
arguments.push_back(&numel);
|
|
for (size_t i = 0; i < input_desc.size(); ++i) {
|
|
auto & chunk = chunk_desc[i];
|
|
const at::Tensor& tensor = inputs[i];
|
|
if (chunk.isNoop()) {
|
|
addTensorInfo(input_desc[i], tensor);
|
|
} else {
|
|
size_t chunk_offset = map_size[chunk.dim] * tensor.stride(chunk.dim) * elementSize(tensor.type().scalarType());
|
|
char * data_ptr = reinterpret_cast<char*>(tensor.data_ptr());
|
|
for (size_t chunks = 0; chunks < chunk.nSubtensors; ++chunks) {
|
|
addTensorInfoRaw(*chunk.subtensorDesc, data_ptr, map_size, tensor.strides());
|
|
data_ptr += chunk_offset;
|
|
}
|
|
}
|
|
}
|
|
for (size_t i = 0; i < output_desc.size(); ++i) {
|
|
auto & c = concat_desc[i];
|
|
at::Tensor o = outputs[i];
|
|
if(c.isNoop()) {
|
|
o.resize_(map_size);
|
|
addTensorInfo(output_desc[i], outputs[i]);
|
|
} else {
|
|
size_t small_size = map_size[c.dim];
|
|
std::vector<int64_t> concat_size(map_size.begin(), map_size.end());
|
|
concat_size[c.dim] = small_size * c.nSubtensors;
|
|
o.resize_(concat_size);
|
|
size_t offset = 0;
|
|
for(size_t j = 0; j < c.nSubtensors; ++j) {
|
|
// because the concatenated_output stays live, the underlying data
|
|
// in this view remains live through the end of this function
|
|
// so there is no need to hold onto this tensor
|
|
auto view = o.narrow(c.dim, offset, small_size);
|
|
addTensorInfo(*c.subtensorDesc, view);
|
|
offset += small_size;
|
|
}
|
|
}
|
|
}
|
|
|
|
// If the kernel call contains a random op, we need to pass in random seeds as
|
|
// well.
|
|
#ifdef USE_CUDA
|
|
if(has_random && this->backend() == at::Backend::CUDA) {
|
|
auto gen_ = THCRandom_getGenerator(at::globalContext().getTHCState());
|
|
uint64_t offset =
|
|
gen_->state.philox_seed_offset.fetch_add(this->get_rand_offset(numel));
|
|
arguments.push_back(&gen_->state.initial_seed);
|
|
arguments.push_back(&offset);
|
|
}
|
|
#endif
|
|
|
|
launch_raw(numel, arguments.data());
|
|
}
|
|
|
|
void FusedKernel::launch(at::ArrayRef<at::Tensor> inputs, std::vector<at::Tensor> & outputs) {
|
|
at::DeviceGuard guard(inputs.back());
|
|
JIT_ASSERT(inputs.size() > 0);
|
|
auto & ref_type = inputs[0].type();
|
|
outputs.clear();
|
|
outputs.reserve(outputDescriptors().size());
|
|
for(auto & od : outputDescriptors()) {
|
|
outputs.push_back(ref_type.toScalarType(od.scalar_type).tensor());
|
|
}
|
|
launch_with_tensors(inputs, outputs);
|
|
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
// CUDAFusedKernel
|
|
|
|
#ifdef USE_CUDA
|
|
|
|
void checkCUDAVersion(const cudaDeviceProp & prop) {
|
|
if ((prop.major >= 6 && CUDA_VERSION < 8000) ||
|
|
(prop.major >= 7 && CUDA_VERSION < 9000)) {
|
|
std::stringstream err_string;
|
|
err_string << "In CUDAFusedKernel, PyTorch compiled with insufficient CUDA version: "
|
|
<< CUDA_VERSION << " for the current GPU device " << prop.name
|
|
<< " with device capability " << prop.major << "." << prop.minor;
|
|
throw std::runtime_error(err_string.str());
|
|
}
|
|
}
|
|
|
|
struct CUDAFusedKernel : public FusedKernel {
|
|
CUDAFusedKernel(const std::string & name, AnnotatedGraph & agraph)
|
|
: FusedKernel(name, agraph) {
|
|
at::DeviceGuard device_guard(agraph.device);
|
|
|
|
TORCH_CUDA_CHECK(cudaGetDeviceProperties(&prop, agraph.device));
|
|
checkCUDAVersion(prop);
|
|
|
|
std::stringstream cu;
|
|
std::tie(chunk_desc, concat_desc, has_random) = codegen::emitCompilationUnit(cu, name, agraph, true);
|
|
compilation_unit = cu.str();
|
|
nvrtcProgram program;
|
|
TORCH_NVRTC_CHECK(nvrtcCreateProgram(&program, compilation_unit.c_str(), nullptr, 0, nullptr, nullptr));
|
|
|
|
std::string compute = "--gpu-architecture=compute_" + std::to_string(prop.major) + std::to_string(prop.minor);
|
|
std::vector<const char *> args = {"--std=c++11", compute.c_str(), "-default-device"};
|
|
nvrtcResult result = nvrtcCompileProgram(program, args.size(), args.data());
|
|
if (result == NVRTC_ERROR_COMPILATION) {
|
|
size_t logsize;
|
|
nvrtcGetProgramLogSize(program, &logsize);
|
|
std::vector<char> log(logsize);
|
|
nvrtcGetProgramLog(program, log.data());
|
|
cu << log.data();
|
|
throw std::runtime_error(cu.str());
|
|
}
|
|
ResourceGuard holdProgram([&] {
|
|
TORCH_NVRTC_CHECK(nvrtcDestroyProgram(&program));
|
|
});
|
|
TORCH_NVRTC_CHECK(result);
|
|
|
|
size_t ptx_size;
|
|
TORCH_NVRTC_CHECK(nvrtcGetPTXSize(program, &ptx_size));
|
|
ptx.resize(ptx_size);
|
|
TORCH_NVRTC_CHECK(nvrtcGetPTX(program, ptx.data()));
|
|
|
|
TORCH_CU_CHECK(cuModuleLoadData(&module, ptx.data()));
|
|
TORCH_CU_CHECK(cuModuleGetFunction(&function, module, name.c_str()));
|
|
|
|
TORCH_CU_CHECK(cuOccupancyMaxActiveBlocksPerMultiprocessor(
|
|
&maxBlocks, function, 128, 0));
|
|
maxBlocks *= prop.multiProcessorCount;
|
|
}
|
|
virtual ~CUDAFusedKernel() override {
|
|
TORCH_CU_CHECK(cuModuleUnload(module));
|
|
}
|
|
protected:
|
|
virtual at::Backend backend() const override {
|
|
return at::Backend::CUDA;
|
|
}
|
|
virtual uint64_t get_rand_offset(uint32_t numel) override {
|
|
int numBlocks = std::min(maxBlocks, ceilDiv(numel, blockSize));
|
|
return 4 * (ceil(numel/(4 * blockSize * numBlocks)) + 1);
|
|
}
|
|
virtual void launch_raw(uint32_t numel, void ** arguments) override {
|
|
int numBlocks = std::min(maxBlocks, ceilDiv(numel, blockSize));
|
|
|
|
//std::cout << "maxBlocks = " << maxBlocks << " needed blocks: " << ceilDiv(numel,blockSize)
|
|
// << " numblocks = " << numBlocks;
|
|
|
|
// it is possible that this is the first cuda call on this thread
|
|
// so make sure we initialize the Driver API's context
|
|
// cudaFree(0) accomplishes this.
|
|
CUcontext pctx = 0;
|
|
TORCH_CU_CHECK(cuCtxGetCurrent(&pctx));
|
|
if (!pctx) {
|
|
std::unique_lock<std::mutex> cudaFreeMutexLock(
|
|
*(THCCachingAllocator_getCudaFreeMutex()));
|
|
cudaFree(0);
|
|
}
|
|
CUstream stream = at::cuda::getCurrentCUDAStream();
|
|
TORCH_CU_CHECK(cuLaunchKernel(
|
|
function,
|
|
numBlocks, 1, 1,
|
|
blockSize, 1, 1,
|
|
0, stream,
|
|
arguments,
|
|
nullptr));
|
|
}
|
|
std::vector<char> ptx;
|
|
CUmodule module;
|
|
CUfunction function;
|
|
|
|
// we record prop/device so they are available for launch heuristics;
// querying device properties at launch time is too slow.
|
|
int device;
|
|
cudaDeviceProp prop;
|
|
int blockSize = 128;
|
|
int maxBlocks;
|
|
};
|
|
|
|
#endif
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
// CPUFusedKernel
|
|
|
|
struct TempFile {
|
|
TH_DISALLOW_COPY_AND_ASSIGN(TempFile);
|
|
TempFile(const std::string & t, int suffix) {
|
|
// mkstemps edits its first argument in place,
|
|
// so we make a copy of the string here, including null terminator
|
|
std::vector<char> tt(t.c_str(), t.c_str() + t.size() + 1);
|
|
int fd = mkstemps(tt.data(), suffix);
|
|
JIT_ASSERT(fd != -1);
|
|
file_ = fdopen(fd, "r+");
|
|
|
|
// - 1 because tt.size() includes the null terminator,
|
|
// but std::string does not expect one
|
|
name_ = std::string(tt.begin(), tt.end() - 1);
|
|
}
|
|
const std::string & name() const {
|
|
return name_;
|
|
}
|
|
void sync() {
|
|
fflush(file_);
|
|
}
|
|
void write(const std::string & str) {
|
|
size_t result = fwrite(str.c_str(), 1, str.size(), file_);
|
|
JIT_ASSERT(str.size() == result);
|
|
}
|
|
FILE* file() {
|
|
return file_;
|
|
}
|
|
~TempFile() {
|
|
if(file_ != nullptr) {
|
|
// unlink first to ensure another mkstemps doesn't
|
|
// race between close and unlink
|
|
unlink(name_.c_str());
|
|
fclose(file_);
|
|
}
|
|
}
|
|
private:
|
|
FILE * file_ = nullptr;
|
|
std::string name_;
|
|
};
|
|
|
|
static void* checkDL(void * x) {
|
|
if(!x) {
|
|
AT_ERROR("error in dlopen or dlsym: ", dlerror());
|
|
}
|
|
return x;
|
|
}
|
|
|
|
struct DynamicLibrary {
|
|
TH_DISALLOW_COPY_AND_ASSIGN(DynamicLibrary);
|
|
DynamicLibrary(const char * name) {
|
|
handle = checkDL(dlopen(name, RTLD_LOCAL | RTLD_NOW));
|
|
}
|
|
void * sym(const char * name) {
|
|
JIT_ASSERT(handle);
|
|
return checkDL(dlsym(handle, name));
|
|
}
|
|
~DynamicLibrary() {
|
|
if(!handle) return;
|
|
dlclose(handle);
|
|
}
|
|
private:
|
|
void * handle = nullptr;
|
|
};
|
|
|
|
static const std::string so_template = "/tmp/pytorch_fuserXXXXXX.so";
|
|
static const std::string cpp_template = "/tmp/pytorch_fuserXXXXXX.cpp";
|
|
|
|
// NB: -march=native not supported on PPC64 g++. It's a bit annoying
|
|
// to do a configure-style test to decide whether or not the g++
|
|
// actually supports it or not, so we heuristically use the host
|
|
// compiler to predict if the runtime compiler supports the option we
|
|
// want. This probably won't work if you're cross-compiling.
|
|
// NB: -march=native is disabled because it has caused problems where
|
|
// compiler and assembler do not agree on what native instruction they
|
|
// understand for AVX512. When we need better CPU performance this
|
|
// optimization can be re-enabled by tracking down the platforms where
|
|
// this error occurs and only selectively disabling it.
|
|
static const std::string compile_string =
|
|
"\"${cxx}\" -O3 -g "
|
|
#ifndef __PPC64__
|
|
// "-march=native "
|
|
#endif
|
|
"-std=c++11 -fPIC ${fopenmp} -shared \"${cpp_file}\" -o \"${so_file}\" -lm";
|
|
|
|
static void runCompiler(FusionCompilerConfig & config, const std::string & cpp_file, const std::string & so_file) {
|
|
TemplateEnv env;
|
|
env.s("cxx", config.cxx);
|
|
env.s("fopenmp", config.openmp ? "-fopenmp" : "");
|
|
env.s("cpp_file",cpp_file);
|
|
env.s("so_file",so_file);
|
|
std::string result = format(compile_string,env);
|
|
int r = system(result.c_str());
|
|
if(config.openmp && r != 0) {
|
|
std::cerr << "warning: pytorch jit fuser failed to compile with openmp, trying without it...\n";
|
|
config.openmp = false; // disable for future compiles
|
|
return runCompiler(config, cpp_file, so_file);
|
|
}
|
|
JIT_ASSERTM(r == 0, "Failed to compile a fused CPU kernel");
|
|
}
|
|
|
|
|
|
static const std::string disas_string =
|
|
"objdump -M intel -d \"${so_file}\"";
|
|
static void disas(const std::string & so_file) {
|
|
TemplateEnv env;
|
|
env.s("so_file", so_file);
|
|
std::string cmd = format(disas_string, env);
|
|
int r = system(cmd.c_str());
|
|
JIT_ASSERT(r == 0);
|
|
}
|
|
|
|
struct CPUFusedKernel : public FusedKernel {
|
|
CPUFusedKernel(const std::string & name, AnnotatedGraph & agraph, FusionCompilerConfig & config)
|
|
: FusedKernel(name, agraph) {
|
|
TempFile so_file(so_template, 3);
|
|
TempFile cpp_file(cpp_template, 4);
|
|
|
|
std::stringstream cu;
|
|
std::tie(chunk_desc, concat_desc, has_random) = codegen::emitCompilationUnit(cu, name, agraph, false);
|
|
JIT_ASSERT(!has_random);
|
|
compilation_unit = cu.str();
|
|
cpp_file.write(compilation_unit);
|
|
cpp_file.sync();
|
|
runCompiler(config, cpp_file.name(), so_file.name());
|
|
if(config.debug) {
|
|
disas(so_file.name());
|
|
}
|
|
so_lib.reset(new DynamicLibrary(so_file.name().c_str()));
|
|
#pragma GCC diagnostic ignored "-Wpedantic"
|
|
kernel = reinterpret_cast<void(*)(uint32_t, void**)>(so_lib->sym(name.c_str()));
|
|
#pragma GCC diagnostic pop
|
|
}
|
|
protected:
|
|
virtual at::Backend backend() const override {
|
|
return at::Backend::CPU;
|
|
}
|
|
virtual uint64_t get_rand_offset(uint32_t numel) override {
|
|
return numel;
|
|
}
|
|
virtual void launch_raw(uint32_t numel, void ** arguments) override {
|
|
kernel(numel, arguments);
|
|
}
|
|
std::unique_ptr<DynamicLibrary> so_lib;
|
|
void (*kernel)(uint32_t, void**) = nullptr;
|
|
};
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
// FusedKernelCache
|
|
|
|
// Note [Run-time shape checking code]
|
|
// There are multiple assumptions that our codegen makes, which we can't check
|
|
// in the fusion pass, because we don't have the shape information. Most notably,
|
|
// that all values (post-input-chunk, and pre-output-concat) have the same shape
|
|
// (hereinafter referred to as map size). One way to check this would be to run
|
|
// shape propagation for every size configuration we get as an input, but that
|
|
// requires a full graph traversal, and might incur unnecessary overhead. The code
|
|
// below uses a few nice properties of broadcasting rules and their interactions with
|
|
// pointwise operations, and takes a smarter approach, to quickly verify validity of
|
|
// the kernel.
|
|
//
|
|
// Notation:
|
|
// - a.s when a is a tensor is a shorthand for a.shape.
|
|
// - B is a shorthand for the broadcasting/expanding function. It is used as a
|
|
// vararg function.
|
|
// - E is a shorthand for expand function.
|
|
// - Every pointwise operation can be equivalently rewritten as
|
|
// f(a, b) = f^(E(a, B(a.s, b.s)), E(b, B(a.s, b.s))),
|
|
// where f^ is a non-broadcasting version of f.
|
|
// - A set of inputs that are used to produce a certain graph output is referred to
|
|
// as the output's broadcasting group (see Lemma 2. for explanation why).
|
|
//
|
|
// Lemma 1. Set of lists of integers (shapes) + { _|_ (bottom/error marker) }, with the
|
|
// operation of broadcasting (returning bottom upon shape mismatch) forms a monoid.
|
|
// In simpler terms: broadcasting is associative, i.e. B(a, B(b, c)) == B(B(a, b), c).
|
|
//
|
|
// Proof. Satisfies all monoid laws:
|
|
// - Closed under broadcasting (trivial)
|
|
// - Empty shape is the identity element: B(a, []) == B([], a) == a
|
|
// - Associativity: A simple visual proof is that you can expand 3 tensors
|
|
// at the same time by stacking their sizes (with alignment to the right),
|
|
// just as you'd do in the case of 2 tensors, but with an intermediate
|
|
// (the algorithm ends up being pretty much the same).
|
|
//
|
|
// Lemma 2. Shape of an output of an arbitrary DAG of pointwise ops depends only on the set
|
|
// of inputs used in this DAG and is equal to B([i.shape for i in used_inputs]).
|
|
//
|
|
// Proof. Let G be any DAG of pointwise ops and < be any valid topological
|
|
// ordering on nodes of G. Proof by induction over <.
|
|
// Base case (graph input):
|
|
// Trivial (input is also an output).
|
|
// Step (n = f(q, r)):
|
|
// Let QS (RS) be the set of shapes of inputs that q (r) depends on.
|
|
// Note that the set of inputs that n depends on is exactly QS + RS.
|
|
// shape(n) == shape(f(q, r))
|
|
// (def of f)
|
|
// == shape(f^(E(q, B(q.s, r.s)), E(r, B(q.s, r.s))))
|
|
// (output shape of f^ is equal to either of argument shapes)
|
|
// == shape(E(q, B(q.s, r.s)))
|
|
// (property of expand)
|
|
// == B(q.s, r.s)
|
|
// (induction assumption)
|
|
// == B(B(QS...), B(RS...))
|
|
// (Lemma 1.)
|
|
// == B(QS..., RS...)
|
|
// (repeated shapes don't matter for broadcasting)
|
|
// == B((QS + RS)...)
|
|
//
|
|
// Lemma 3. Expands are distributive over pointwise ops, i.e. E(f(a, b), s) = f(E(a, s), E(b, s))
|
|
// Lemma 4. Expands can be collapsed, i.e. E(E(x, s1), s2) = E(x, B(s1, s2)).
|
|
// Proof. A simple exercise for the reader :)
|
|
//
|
|
// Theorem. If all (pre-concat-)outputs have equal shapes, then we can push the expands to
|
|
// (post-chunk-)inputs, and have all intermediates of the same shape
|
|
// (no broadcasting happening in the body).
|
|
//
|
|
// Proof. Using the above lemmas we can easily show that a graph with a single output
|
|
// can be easily rewritten by taking the shape given by B applied to all input
|
|
// shapes, expanding inputs to it, and using only non-broadcasting operations.
|
|
// Example:
|
|
//
|
|
// let d = f(a, b) in
|
|
// let e = h(b, c) in
|
|
// g(d, e)
|
|
//
|
|
// (By def. of broadcasting pointwise ops applied to g, f and h)
|
|
// (Lemma 2. for a closed formula for the size of g = gs)
|
|
//
|
|
// let gs = B(a.s, b.s, c.s) in
|
|
// let d' = E(f^(E(a, B(a.s, b.s)), E(b, B(a.s, b.s))), gs) in
|
|
// let e' = E(h^(E(b, B(b.s, c.s)), E(c, B(b.s, c.s))), gs) in
|
|
// g^(d', e')
|
|
//
|
|
// (Lemma 3.)
|
|
//
|
|
// let gs = B(a.s, b.s, c.s) in
|
|
// let d' = f^(E(E(a, B(a.s, b.s)), gs), E(E(b, B(a.s, b.s)), gs)) in
|
|
// let e' = h^(E(E(b, B(b.s, c.s)), gs), E(E(c, B(b.s, c.s)), gs)) in
|
|
// g^(d', e')
|
|
//
|
|
// (Lemma 4. + Lemma 1. to simplify broadcasting function)
|
|
//
|
|
// let gs = B(a.s, b.s, c.s) in
|
|
// let d' = f^(E(a, gs), E(b, gs)) in
|
|
// let e' = h^(E(b, gs), E(c, gs)) in
|
|
// g^(d', e')
|
|
//
|
|
// (Simple rewrite)
|
|
//
|
|
// let gs = B(a.s, b.s, c.s) in
|
|
// let a' = E(a, gs) in
|
|
// let b' = E(b, gs) in
|
|
// let c' = E(c, gs) in
|
|
// let d' = f^(a', b') in
|
|
// let e' = h^(b', c') in
|
|
// g^(d', e')
|
|
//
|
|
// This example can be easily formalized to arbitrary DAGs using induction
|
|
// over topological ordering, similar to Lemma 2. Now, if broadcasting groups
|
|
// for all outputs have the same shape, then performing an expand to this size
|
|
// on all inputs will ensure that all intermediates on all paths to outputs
|
|
// will have the same shape, proving that the body of the kernel is valid.
|
|
//
|
|
// This shows the part until post-chunk-inputs. Extending it to pre-chunk-inputs
|
|
// is straightforward (needs a simple lemma for moving expands through chunks).
|
|
|
|
// Register implementations of fused operators, so that we can reuse the fused graph
|
|
// to generate fallback code.
|
|
RegisterOperators reg_fused_operators({
|
|
Operator(
|
|
prim::FusedChunk,
|
|
[](Node* node) {
|
|
int64_t dim = node->i(attr::dim);
|
|
int64_t chunks = node->outputs().size();
|
|
return [dim, chunks](Stack& stack) {
|
|
auto result = at::chunk(std::move(peek(stack, 0, 1)).toTensor(), chunks, dim);
|
|
drop(stack, 1);
|
|
pack(stack, std::move(result));
|
|
return 0;
|
|
};
|
|
}),
|
|
Operator(
|
|
prim::FusedConcat,
|
|
[](Node* node) {
|
|
int64_t dim = node->i(attr::dim);
|
|
int64_t num_inputs = node->inputs().size();
|
|
return [dim, num_inputs](Stack& stack) {
|
|
auto result = at::cat(
|
|
fmap(last(stack, num_inputs), [](const IValue& i) { return i.toTensor(); }),
|
|
dim
|
|
);
|
|
drop(stack, num_inputs);
|
|
pack(stack, std::move(result));
|
|
return 0;
|
|
};
|
|
})
|
|
});
|
|
|
|
FusedKernelCache::FusedKernelCache(FusionCompiler& compiler, std::shared_ptr<Graph> _graph, int device)
|
|
: device(device)
|
|
, fallback_code(_graph)
|
|
, compiler(compiler)
|
|
, graph(std::move(_graph))
|
|
, input_broadcast_groups(getInputBroadcastGroups())
|
|
, input_chunks(getInputChunkDescriptors())
|
|
, kernels() {}
|
|
|
|
std::atomic<size_t> FusedKernelCache::next_kernel_id {0};
|
|
|
|
auto FusedKernelCache::getInputChunkDescriptors() -> std::vector<PartitionInfo> {
|
|
std::vector<PartitionInfo> descs;
|
|
descs.reserve(graph->inputs().size());
|
|
for (Value * input : graph->inputs()) {
|
|
if (Node * chunk = usedInFusedChunk(input)) {
|
|
descs.emplace_back(chunk->i(attr::chunks), chunk->i(attr::dim));
|
|
} else {
|
|
descs.emplace_back(1, 0);
|
|
}
|
|
}
|
|
return descs;
|
|
}
|
|
|
|
// NB: this vector is really a set, but we want to keep it contiguous in memory for faster access
|
|
static std::vector<int64_t> getInputDependencies(Value* output) {
|
|
// Run a DFS traversal to find all inputs that affect a given output value
|
|
std::vector<Value*> queue { output };
|
|
std::unordered_set<Value*> inputs;
|
|
std::unordered_set<Value*> seen;
|
|
while (!queue.empty()) {
|
|
Value * val = queue.back(); queue.pop_back();
|
|
Node * producer = val->node();
|
|
if (producer->kind() == prim::Param) {
|
|
inputs.insert(val);
|
|
continue;
|
|
}
|
|
for (Value * input : producer->inputs()) {
|
|
if (/*bool inserted = */seen.insert(input).second) {
|
|
queue.push_back(input);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Convert Value* into offsets into the graph's input list
|
|
std::vector<int64_t> offsets;
|
|
offsets.reserve(inputs.size());
|
|
for (Value * input : inputs) {
|
|
offsets.push_back(input->offset());
|
|
}
|
|
std::sort(offsets.begin(), offsets.end());
|
|
return offsets;
|
|
}
|
|
|
|
std::vector<std::vector<int64_t>> FusedKernelCache::getInputBroadcastGroups() {
|
|
std::unordered_set<std::vector<int64_t>, torch::hash<std::vector<int64_t>>> broadcast_groups;
|
|
for (Value * output : graph->outputs()) {
|
|
broadcast_groups.insert(getInputDependencies(output));
|
|
}
|
|
return std::vector<std::vector<int64_t>>{ broadcast_groups.begin(), broadcast_groups.end() };
|
|
}
|
|
|
|
void FusedKernelCache::run(Stack& stack) {
|
|
int64_t num_inputs = graph->inputs().size();
|
|
auto args = fmap(last(stack, num_inputs), [](const IValue& i) {
|
|
return i.toTensor();
|
|
});
|
|
|
|
auto maybe_map_size = canRunKernel(args);
|
|
if (!maybe_map_size) {
|
|
return runFallback(stack);
|
|
}
|
|
expandArgs(args, *maybe_map_size);
|
|
|
|
FusedKernelArgSpec spec { args };
|
|
auto it = kernels.find(spec);
|
|
if (it == kernels.end()) {
|
|
std::tie(it, std::ignore) = kernels.emplace(spec, compileSpec(spec, *maybe_map_size));
|
|
}
|
|
auto & fn = it->second;
|
|
|
|
std::vector<at::Tensor> outputs;
|
|
fn->launch(args, outputs);
|
|
drop(stack, num_inputs);
|
|
pack(stack, std::move(outputs));
|
|
}
|
|
|
|
at::optional<std::vector<int64_t>> FusedKernelCache::getMapSize(at::TensorList args, at::IntList arg_subset) {
|
|
int64_t dim_after_broadcast = 0;
|
|
for (int64_t arg_idx : arg_subset) {
|
|
dim_after_broadcast = std::max(dim_after_broadcast, args[arg_idx].dim());
|
|
}
|
|
// TODO: this keeps reallocating map_size at every iteration, but we know
// exactly how much storage we need, so this could be fixed in-place at
// every step. We're just missing a few functions in ATen, but the fix
// should be straightforward.
|
|
// NB: we leave this uninitialized, because an empty size is trivially
|
|
// broadcastable to any other size.
|
|
std::vector<int64_t> map_size;
|
|
for (size_t i = 0; i < arg_subset.size(); ++i) {
|
|
auto & arg = args.at(arg_subset[i]);
|
|
auto & chunk_desc = input_chunks.at(arg_subset[i]);
|
|
if (chunk_desc.nSubtensors == 1) {
|
|
try {
|
|
map_size = at::infer_size(map_size, arg.sizes());
|
|
} catch (std::exception& e) {
|
|
return at::nullopt;
|
|
}
|
|
} else {
|
|
auto tensor_sizes = arg.sizes().vec();
|
|
int64_t num_chunks = chunk_desc.nSubtensors;
|
|
int64_t dim = at::maybe_wrap_dim(chunk_desc.dim, tensor_sizes.size());
|
|
if (tensor_sizes[dim] % num_chunks != 0) {
|
|
return at::nullopt;
|
|
}
|
|
tensor_sizes[dim] /= num_chunks;
|
|
try {
|
|
map_size = at::infer_size(map_size, tensor_sizes);
|
|
} catch (std::exception& e) {
|
|
return at::nullopt;
|
|
}
|
|
}
|
|
}
|
|
|
|
return {map_size};
|
|
}
|
|
|
|
// See Note [Run-time shape checking code] for more explanation on the algorithm.
|
|
at::optional<std::vector<int64_t>> FusedKernelCache::canRunKernel(at::TensorList args) {
|
|
AT_CHECK(args.size() == input_chunks.size(),
|
|
"Expected ", input_chunks.size(), " arguments, but got ", args.size());
|
|
|
|
at::optional<std::vector<int64_t>> map_size;
|
|
for (const auto & broadcast_group : input_broadcast_groups) {
|
|
if (!map_size) {
|
|
map_size = getMapSize(args, broadcast_group);
|
|
if (!map_size) {
|
|
return at::nullopt;
|
|
}
|
|
} else {
|
|
auto group_map_size = getMapSize(args, broadcast_group);
|
|
// NB: this checks that group_map_size is defined AND equal to map_size
|
|
if (map_size != group_map_size) {
|
|
return at::nullopt;
|
|
}
|
|
}
|
|
}
|
|
return map_size;
|
|
}
|
|
|
|
void FusedKernelCache::runFallback(Stack& stack) {
|
|
InterpreterState(fallback_code).runOneStage(stack);
|
|
}
|
|
|
|
// NB: args are mutated in this call. map_size is mutated too, but is restored to its original
|
|
// value before this function returns (it's an optimization).
|
|
void FusedKernelCache::expandArgs(std::vector<at::Tensor>& args, std::vector<int64_t>& map_size) {
|
|
for (size_t i = 0; i < args.size(); ++i) {
|
|
auto & arg = args[i];
|
|
auto & pdesc = input_chunks[i];
|
|
if (pdesc.nSubtensors == 1) {
|
|
if (arg.sizes().equals(map_size)) continue;
|
|
arg = arg.expand(map_size);
|
|
} else {
|
|
map_size.at(pdesc.dim) *= pdesc.nSubtensors;
|
|
if (!arg.sizes().equals(map_size)) {
|
|
arg = arg.expand(map_size);
|
|
}
|
|
map_size.at(pdesc.dim) /= pdesc.nSubtensors;
|
|
}
|
|
}
|
|
}
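// Illustrative example (hypothetical shapes): with map_size = [4, 6] and an
// input that is chunked into 3 pieces along dim 1, the pre-chunk input is
// expanded to [4, 18] (map_size is scaled up for the comparison/expand and then
// restored), while an un-chunked input is expanded directly to [4, 6].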
|
|
|
|
std::unique_ptr<FusedKernel> FusedKernelCache::compileSpec(
|
|
const FusedKernelArgSpec& spec, const std::vector<int64_t>& map_size) {
|
|
AnnotatedGraph agraph {*graph, device};
|
|
|
|
agraph.input_desc = spec.descs();
|
|
// XXX: this assumes that fused kernels only operate on floating-point values inside
|
|
at::optional<at::ScalarType> scalar_type;
|
|
for (TensorDesc& desc : agraph.input_desc) {
|
|
if (isFloatingType(desc.scalar_type)) {
|
|
scalar_type = desc.scalar_type;
|
|
break;
|
|
}
|
|
}
|
|
JIT_ASSERT(scalar_type);
|
|
|
|
for (Value * output : graph->outputs()) {
|
|
std::vector<int64_t> sizes = map_size;
|
|
if (output->node()->kind() == prim::FusedConcat) {
|
|
sizes.at(output->node()->i(attr::dim)) *= output->node()->inputs().size();
|
|
}
|
|
auto type = CompleteTensorType::create(*scalar_type, device, sizes);
|
|
agraph.output_desc.emplace_back(std::move(type));
|
|
}
|
|
|
|
std::string name = "kernel_" + std::to_string(next_kernel_id++);
|
|
FusedKernel * raw_func;
|
|
if (device != kCPUDevice) {
|
|
#ifdef USE_CUDA
|
|
raw_func = new CUDAFusedKernel(name, agraph);
|
|
#else
|
|
throw std::runtime_error("cannot compile a CUDA fusion group, CUDA is not enabled.");
|
|
#endif
|
|
} else {
|
|
JIT_ASSERT(compiler.canCompileOnCPU());
|
|
raw_func = new CPUFusedKernel(name, agraph, compiler.config_);
|
|
}
|
|
return std::unique_ptr<FusedKernel>(raw_func);
|
|
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
// FusionCompiler
|
|
|
|
std::shared_ptr<FusedKernelCache> FusionCompiler::getOrCompile(Node* fusion_group) {
|
|
int device = fusion_group->i(attr::device);
|
|
if (device == kCPUDevice) {
|
|
JIT_ASSERT(canCompileOnCPU());
|
|
} else {
|
|
#ifndef USE_CUDA
|
|
throw std::runtime_error("cannot compile a CUDA fusion group - CUDA is not enabled.");
|
|
#endif
|
|
}
|
|
auto graph = fusion_group->g(attr::Subgraph)->copy();
|
|
EraseShapeInformation(*graph);
|
|
std::stringstream key;
|
|
key << "device " << device << "\n";
|
|
key << *graph << "\n";
|
|
std::string key_ = key.str();
|
|
auto it = cache_map.find(key_);
|
|
if (it == cache_map.end()) {
|
|
std::tie(it, std::ignore) = cache_map.emplace(key_, std::make_shared<FusedKernelCache>(*this, graph, device));
|
|
}
|
|
return it->second;
|
|
}
|
|
|
|
std::vector<at::Tensor> FusionCompiler::debugLaunchGraph(Graph & graph, int device, at::ArrayRef<at::Tensor> inputs) {
|
|
auto wrapper_graph = std::make_shared<Graph>();
|
|
Node * fusion_group = wrapper_graph->insertNode(wrapper_graph->createFusionGroup(device));
|
|
fusion_group->g_(attr::Subgraph, graph.copy());
|
|
for (size_t i = 0; i < graph.inputs().size(); ++i) {
|
|
fusion_group->addInput(wrapper_graph->addInput());
|
|
}
|
|
for (size_t i = 0; i < graph.outputs().size(); ++i) {
|
|
wrapper_graph->registerOutput(fusion_group->addOutput());
|
|
}
|
|
auto cache = getOrCompile(fusion_group);
|
|
Stack stack = fmap<IValue>(inputs);
|
|
cache->run(stack);
|
|
return fmap(stack, [](const IValue& iv) { return iv.toTensor(); });
|
|
}
|
|
|
|
static const std::string check_exists_string =
|
|
"which '${program}' > /dev/null";
|
|
|
|
static bool programExists(const std::string & program) {
|
|
TemplateEnv env;
|
|
env.s("program", program);
|
|
std::string cmd = format(check_exists_string, env);
|
|
return 0 == system(cmd.c_str());
|
|
}
|
|
|
|
FusionCompiler::FusionCompiler() {
|
|
const char * cxx_env = getenv("CXX");
|
|
if(cxx_env != nullptr) {
|
|
config_.cxx = cxx_env;
|
|
}
|
|
if(!programExists(config_.cxx)) {
|
|
config_.cxx = "";
|
|
}
|
|
const char * debug_env = getenv("PYTORCH_FUSION_DEBUG");
|
|
config_.debug = debug_env && atoi(debug_env) != 0;
|
|
}
|
|
|
|
//TODO: thread safety
|
|
FusionCompiler & sharedFusionCompiler() {
|
|
static FusionCompiler compiler;
|
|
return compiler;
|
|
}
|
|
|
|
}}
|
|
|
|
# else
|
|
// Dummy implementations for Windows
|
|
|
|
#include "torch/csrc/jit/fusion_compiler.h"
|
|
#include "torch/csrc/jit/ir.h"
|
|
#include "torch/csrc/jit/code_template.h"
|
|
#include "torch/csrc/jit/resource_guard.h"
|
|
#include "torch/csrc/utils/disallow_copy.h"
|
|
#include "ATen/ATen.h"
|
|
#ifdef USE_CUDA
|
|
#include "torch/csrc/cuda/cuda_check.h"
|
|
#include <nvrtc.h>
|
|
#include <cuda.h>
|
|
#include <cuda_runtime.h>
|
|
#endif
|
|
#include <string>
|
|
#include <algorithm>
|
|
#include <unordered_map>
|
|
#include <vector>
|
|
#include <sstream>
|
|
#include <iostream>
|
|
|
|
namespace torch { namespace jit {
|
|
|
|
struct FusedKernel {
|
|
char padding;
|
|
};
|
|
|
|
FusedKernelCache::FusedKernelCache(FusionCompiler& compiler, std::shared_ptr<Graph> graph, int device)
|
|
: compiler(compiler) {}
|
|
void FusedKernelCache::run(Stack& inputs) {}
|
|
void FusedKernelCache::runFallback(Stack& stack) {}
|
|
void FusedKernelCache::expandArgs(std::vector<at::Tensor>& args, std::vector<int64_t>& map_size) {}
|
|
at::optional<std::vector<int64_t>> FusedKernelCache::canRunKernel(at::TensorList args) { return at::nullopt; }
|
|
at::optional<std::vector<int64_t>> FusedKernelCache::getMapSize(at::TensorList args, at::IntList arg_subset) { return at::nullopt; }
|
|
std::vector<std::vector<int64_t>> FusedKernelCache::getInputBroadcastGroups() { return {}; }
|
|
auto FusedKernelCache::getInputChunkDescriptors() -> std::vector<PartitionInfo> { return {}; }
|
|
std::unique_ptr<FusedKernel> FusedKernelCache::compileSpec(
|
|
const FusedKernelArgSpec& spec, const std::vector<int64_t>& map_size) { return nullptr; }
|
|
std::atomic<size_t> FusedKernelCache::next_kernel_id {0};
|
|
|
|
FusionCompiler::FusionCompiler() {}
|
|
std::shared_ptr<FusedKernelCache> FusionCompiler::getOrCompile(Node* fusion_group) { return nullptr; }
|
|
std::vector<at::Tensor> FusionCompiler::debugLaunchGraph(Graph & graph, int device, at::ArrayRef<at::Tensor> inputs) { return {}; }
|
|
|
|
FusionCompiler & sharedFusionCompiler() {
|
|
throw std::runtime_error("NYI: fuser is not supported on Windows.");
|
|
}
|
|
|
|
}}
|
|
|
|
# endif
|