pytorch/torch/csrc/jit/codegen/cuda/kernel.cpp
jjsjann123 0e582fbfcc [NVFuser] Upstream push 0907 (#84626)
Syncing nvfuser devel branch to upstream master. https://github.com/csarofeen/pytorch/

Codegen changes include:

- codegen improvement:
i. improved view support on pointwise and transpose scheduler
ii. grouped grid welford added for better outer-norm grid persistence in normalization

- misc:
i. new composite ops added: variance_mean, arange
ii. fixed a misaligned address issue in the transpose scheduler
iii. separated the compilation API from the execution API to prepare for async compilation
iv. double type support on expression evaluator
v. PYTORCH_NVFUSER_DUMP refactor to save PTX and CUBIN

Commits in this PR from the devel branch:
```
89330aa23aa804340b2406ab58899d816e3dc3d2 Tensor factories must set the output shape as its input (#1939)
b2fd01ea9346712c6d6f623ca6addbc4888d008e arange support (#1933)
56c00fd3922dad7dfc57351ad7d780f0f2f8e4ed Double support on all expression evaluators (#1937)
371f28223e57fe3f6b5e50a0a45177e6a5c0785c Improve trivial reduction merge support (#1931)
1d0c26790e5647920b40d419d26815bbe310b3a6 Test `rand` in a fusion with zero tensor input (#1932)
0dab160fb2177d178eef3148c6a529e0855009e9 Fix softmax bwd sizes. (#1890)
ef98f360f6d3e3e1cc662ecb65202d88150f128d Fix a bug (#1936)
63132a0c56508c550084b07fb76a3df865102d00 Propagate permissive mapping information into indexing pass (#1929)
b4ac2c88d78078ee4d8b21c4fc51645b5710a282 Map IterationDomains through view operations. (#1919)
c0a187a7619d7cf9dc920294e15461791e8d6d4d do not use deprecated functions (#1935)
88de85e758c5e4afb7b6e746573c0d9a53b4cea7 Upstream cherry pick fixes 0811 (#1934)
b247dcf7c57dc6ac3f7a799b0a6beb7770536a74 Separate kernel compilation API from kernel execution API (#1914)
b34e3b93ee1a8030730c14af3995dd95665af07d Fix `ir_utils::hasBlockSync` + misc fixes in transpose scheduler (#1924)
14a53e6707f43bf760494c238a46386d69830822 Nullary RNGOp (#1892)
3c3c89e638f5172cafb0761f22bacd1fd695eec3 Misc fixes/tuning for transpose scheduler (#1912)
20cf109c8b44d48f61977e35bae94368985144ac Grouped grid welford (#1921)
6cf7eb024c9e53c358cbe56597e117bad56efefd Transpose scheduler small dim sizes better support (#1910)
9341ea9a5bf42f9b14ccad0c94edbc79fc5bb552 Disabled ViewPersistentShmoo sizes that results in NAN (#1922)
057237f66deeea816bb943d802a97c1b7e4414ab Fix CUDA driver error: misaligned address for transpose scheduler  (#1918)
3fb3d80339e4f794767a53eb8fdd61e64cf404a2 Add variance_mean function using Welford (#1907)
98febf6aa3b8c6fe4fdfb2864cda9e5d30089262 Remove DisableOption::UnrollWithRng (#1913)
ee8ef33a5591b534cf587d347af11e48ba7a15d4 Minor fix for the debug interface of using PTX directly (#1917)
6e8f953351f9dabfd1f991d8431cecb6c2ce684d Add PYTORCH_NVFUSER_DUMP options to save PTX and CUBIN (#1916)
5eefa9a72385f6a4b145680a9dcc52d7e8293763 dopt is only available since nvrtc 11.7 (#1915)
2ec8fc711eafc72451eebf0f5e2a98a38bf3f6ef Kill computeAtBetween (#1911)
d0d106a1d9af118d71673173674e875be35d259d Improve view support on pointwise and transpose scheduler (#1906)
e71e1ecefe67219846070590bbed54bbc7416b79 Fix name clash of RNG with shared memory (#1904)
3381793a253689abf224febc73fd3fe2a0dbc921 Fix mutator and sameAs for expanded IterDomain (#1902)
```

RUN_TORCHBENCH: nvfuser

Differential Revision: [D39324552](https://our.internmc.facebook.com/intern/diff/D39324552)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/84626
Approved by: https://github.com/malfet
2022-09-23 20:29:48 +00:00

#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
#include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
#include <torch/csrc/jit/codegen/cuda/kernel.h>
#include <torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.h>
#include <torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>

#include <ATen/cuda/CUDAContext.h>

#include <iostream>
#include <unordered_set>

namespace torch {
namespace jit {
namespace fuser {
namespace cuda {

IrBuilderPasskey::IrBuilderPasskey(IrContainer* ir_container)
    : ir_container_(ir_container) {}

namespace kir {

namespace {

//! Scan all primary expressions in the Kernel IR and build
//! lists of specialized nodes and other interesting information
class KernelIrScanner : private IrVisitor {
 public:
  explicit KernelIrScanner(const Kernel* kernel) {
    IrVisitor::handle(kernel->topLevelExprs());
    const auto gpu_lower = GpuLower::current();
    for (auto split : gpu_lower->nonDivisibleSplitInfo().splitsToValidate()) {
      auto extent = split->in()->extent();
      auto factor = split->factor();
      summary_.splits_to_validate.emplace_back(extent, factor);
    }
  }

  const auto& summary() const {
    return summary_;
  }

 private:
  using IrVisitor::handle;

  void handle(Expr* expr) final {
    IrVisitor::handle(expr);
    for (auto inp : expr->inputs()) {
      handle(inp);
    }
    for (auto out : expr->outputs()) {
      handle(out);
    }
  }

  void handle(BlockSync* sync) final {
    // TODO: Move to a dedicated validation pass
    // which is not on the common execution/compilation path
    if (sync->isWarHazardSync()) {
      ++summary_.war_hazard_syncs_count;
    }
  }

  void handle(GridSync* sync) final {
    summary_.has_cooperative_grid_reduction = true;
  }

  void handle(Allocate* allocate) final {
    switch (allocate->memoryType()) {
      case MemoryType::Global:
        summary_.global_allocations.push_back(allocate);
        break;
      case MemoryType::Shared:
        summary_.dynamic_smem_allocations.push_back(allocate);
        break;
      case MemoryType::Local:
        if (!ExpressionEvaluator::isConst(allocate->size())) {
          summary_.has_dynamic_local_memory_allocations = true;
          summary_.dynamic_lmem_allocations.emplace_back(allocate);
        }
        break;
    }
  }

  void handle(RNGOp* rng_op) final {
    summary_.max_rng_offsets =
        std::max<int>(summary_.max_rng_offsets, rng_op->getRNGOffset());
  }

  void handle(TensorIndex* tensor_index) final {
    const auto tv = tensor_index->view();
    const auto domain = tv->domain();

    // Do we have any reductions?
    summary_.has_block_reductions =
        summary_.has_block_reductions || domain->hasBlockReduction();

    // Update the largest smem data type
    if (domain->hasBlockReduction() || domain->hasGridReduction() ||
        tv->getMemoryType() == MemoryType::Shared) {
      const auto data_type = tv->dtype();
      const size_t type_size = dataTypeSize(data_type);
      if (type_size > max_smem_type_size_) {
        max_smem_type_size_ = type_size;
        summary_.largest_smem_data_type = data_type;
      }
    }
  }

  void handle(WelfordOp* welford_op) final {
    summary_.has_welford = true;
    TORCH_INTERNAL_ASSERT(welford_op->outAvg()->isA<TensorIndex>());
    auto out_dom = welford_op->outAvg()->as<TensorIndex>()->view()->domain();
    summary_.has_block_welford =
        summary_.has_block_welford || out_dom->hasBlockReduction();
  }

  void handle(GridWelford* grid_welford) final {
    summary_.has_welford = true;
    summary_.has_grid_welford = true;
    summary_.has_grid_reductions = true;
    if (grid_welford->welford_op()->isAllreduce()) {
      summary_.has_cooperative_grid_reduction = true;
    }
  }

  void handle(GridReduction* grid_reduction) final {
    summary_.has_grid_reductions = true;
    if (grid_reduction->isAllreduce()) {
      summary_.has_cooperative_grid_reduction = true;
    }
  }

  void handle(GroupedGridReduction* grid_reduction) final {
    summary_.has_grid_reductions = true;
    if (grid_reduction->isAllreduce()) {
      summary_.has_cooperative_grid_reduction = true;
    }
  }

  void handle(GroupedGridWelford* grid_welford) final {
    summary_.has_welford = true;
    summary_.has_grid_welford = true;
    summary_.has_grid_reductions = true;
    if (grid_welford->isAllreduce()) {
      summary_.has_cooperative_grid_reduction = true;
    }
  }

  void handle(GridBroadcast* grid_broadcast) final {
    summary_.has_cooperative_grid_reduction = true;
    handle(grid_broadcast->broadcast_op());
  }

  void handle(BroadcastOp* bop) final {
    const ParallelTypeBitmap parallel_types =
        GpuLower::current()->threadPredMap().getParallelBroadcastDomains(
            bop->out()->as<TensorIndex>()->view());
    summary_.broadcast_parallel_types.emplace(bop, parallel_types);
    // Do we have block broadcasts?
    summary_.has_block_broadcasts =
        summary_.has_block_broadcasts || parallel_types.hasTID();
    // Do we have grid broadcasts?
    summary_.has_grid_broadcasts =
        summary_.has_grid_broadcasts || parallel_types.hasBID();
  }

 private:
  size_t max_smem_type_size_ = 0;
  KernelSummary summary_;
};

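// A minimal sketch of how the scan results are typically consumed once
// lowering has produced a kir::Kernel (illustrative only; `kernel` is an
// assumed variable, and the fields are the ones populated above):
//
//   const KernelSummary& summary = kernel->summary();
//   if (summary.has_cooperative_grid_reduction) {
//     // grid-synchronizing reductions/welfords need a cooperative launch
//     // so that every block is resident for the grid-wide sync
//   }
//   for (auto alloc : summary.dynamic_smem_allocations) {
//     // dynamic shared-memory allocations feed into the launch-time
//     // dynamic smem size computation
//   }
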
//! Make sure tensors have valid allocations even when parallelized
//! loops potentially have larger iteration counts than the number of
//! threads.
//!
//! When an IterDomain of a tensor is parallelized, the IterDomain
//! may not contribute to the allocation of the tensor. For example,
//! it is assumed that an allocation of a local-memory tensor does not
//! need to account for a parallelized IterDomain. This is true
//! when it is guaranteed that each thread only needs to execute the
//! loop body once. However, if not, the allocation is invalid as it
//! only has a space for one value per thread.
//!
//! ValidateAllocation checks all tensor allocations and sees if any
//! tensor may have a parallelized loop whose iteration count may
//! be larger than the number of threads. If so, an error is thrown if
//! the tensor is not allocated on thread-shared memories. Note that
//! when allocated on a shared memory (i.e., MemoryType::Shared or
//! MemoryType::Global for tensors parallelized with threadIdx, or
//! MemoryType::Global for tensors parallelized with blockIdx), it is
//! assumed that allocation is properly extended for the iteration
//! count.
class ValidateAllocation : private OptOutConstDispatch {
 public:
  static void validate(const Kernel* kernel) {
    ValidateAllocation validate_allocation(kernel);
  }

 private:
  explicit ValidateAllocation(const Kernel* kernel) {
    live_allocations_.emplace_back(std::vector<const Allocate*>());
    for (const auto& expr : kernel->topLevelExprs()) {
      OptOutConstDispatch::handle(expr);
    }
    live_allocations_.pop_back();
    TORCH_INTERNAL_ASSERT(live_allocations_.empty());
  }

  void handle(const Allocate* allocate) final {
    TORCH_INTERNAL_ASSERT(!live_allocations_.empty());
    live_allocations_.back().push_back(allocate);
  }

  // for_loop is parallelized and its stop value is not guaranteed to
  // be <= the number of threads, which breaks an assumption made
  // during allocation lowering if the tensor is thread-parallel and not
  // allocated on shared or global memory, or if it is block-parallel
  // and not allocated on global memory.
  void validate(const ForLoop* for_loop) {
    const auto loop_id = for_loop->iter_domain();
    for (const auto& allocations : live_allocations_) {
      for (const auto& allocate : allocations) {
        const auto tv = dynamic_cast<TensorView*>(allocate->buffer());
        if (tv == nullptr) {
          continue;
        }
        for (const auto& axis : tv->domain()->domain()) {
          if (!GpuLower::current()->caMap()->areMapped(
                  loop_id, axis, IdMappingMode::LOOP)) {
            continue;
          }
          if (isParallelTypeThreadDim(loop_id->getParallelType())) {
            TORCH_INTERNAL_ASSERT(
                tv->getMemoryType() == MemoryType::Shared ||
                    tv->getMemoryType() == MemoryType::Global,
                "Tensor t",
                tv->name(),
                " must be allocated on SMEM or GMEM.");
          } else if (isParallelTypeBlockDim(loop_id->getParallelType())) {
            TORCH_INTERNAL_ASSERT(tv->getMemoryType() == MemoryType::Global);
          }
        }
      }
    }
  }

  void handle(const ForLoop* for_loop) final {
    if (for_loop->stop() != for_loop->iter_domain()->extent() &&
        isParallelTypeThread(for_loop->iter_domain()->getParallelType())) {
      validate(for_loop);
    }

    live_allocations_.emplace_back(std::vector<const Allocate*>());
    for (const auto& expr : for_loop->body().exprs()) {
      OptOutConstDispatch::handle(expr);
    }
    live_allocations_.pop_back();
  }

  void handle(const IfThenElse* ite) final {
    for (const auto& expr : ite->thenBody().exprs()) {
      OptOutConstDispatch::handle(expr);
    }
    for (const auto& expr : ite->elseBody().exprs()) {
      OptOutConstDispatch::handle(expr);
    }
  }

 private:
  std::vector<std::vector<const Allocate*>> live_allocations_;
};

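// Illustrative failure mode for the check above (hypothetical sketch, not
// generated code): suppose an axis is bound to threadIdx.x but each thread
// may execute the loop body more than once, e.g.
//
//   for (int i = threadIdx.x; i < N; i += blockDim.x) {
//     T_local[0] = ...;  // a Local tensor with one slot per thread
//   }
//
// The single local slot is overwritten across iterations, so the Local
// allocation is too small; the assertions above require such a tensor to be
// placed in Shared (or Global) memory, where the allocation is assumed to be
// properly extended for the iteration count.
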
} // namespace

// TODO(kir): Kernel IR validation
void Kernel::finalize(std::vector<Expr*> top_level_exprs) {
  TORCH_INTERNAL_ASSERT(top_level_exprs_.empty());
  top_level_exprs_ = std::move(top_level_exprs);
  warp_padded_parallel_info_ = GpuLower::current()->getWarpPaddedParallelInfo();
  profile_ = GpuLower::current()->profile();
  ValidateAllocation::validate(this);
  analyze();
  // Make sure this is after analyze as it sets summary_
  summary_.vectorized_accesses = GpuLower::current()->vectorizedAccesses();
  summary_.vectorized_set_info = GpuLower::current()->vectorizedSetInfo();
  summary_.sync_map = GpuLower::current()->syncMap();
  summary_.parallel_dimension_map_ =
      GpuLower::current()->parallelDimensionMap();
}

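// Call-order sketch (illustrative; the variable name is assumed): finalize()
// is expected to run exactly once, at the end of lowering, after the
// top-level expressions have been produced:
//
//   kernel->finalize(std::move(lowered_top_level_exprs));
//   // summary_ now holds both the scan results from analyze() and the
//   // GpuLower-derived maps copied in above
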
void Kernel::analyze() {
  FUSER_PERF_SCOPE("Kernel::analyze");

  const KernelIrScanner ir_scanner(this);
  summary_ = ir_scanner.summary();
}

void Kernel::print() const {
  IrPrinter ir_printer(std::cout);
  ir_printer.handle(this);
}

//! Register the Val with this fusion
void Kernel::registerVal(Val* val) {
  if (inContainer(val)) {
    return;
  }
  if (val->kernel()) {
    TORCH_CHECK(
        val->kernel() == this,
        val->toString(),
        " was not found in the active kernel.");
  }

  Fusion::registerVal(val);
}

//! Register expr with this fusion.
//! When we register an expression, we want to update the dependency tracking
//! of Vals. We add expr to our general expr_set_.
void Kernel::registerExpr(Expr* expr) {
  if (inContainer(expr)) {
    return;
  }

  if (expr->kernel()) {
    TORCH_CHECK(
        expr->kernel() == this,
        expr->toString(),
        " was not found in the active kernel.");
  }

  for (Val* input : expr->inputs()) {
    TORCH_INTERNAL_ASSERT(
        inContainer(input),
        "Input\n",
        input->toString(),
        " to expr,\n",
        expr->toString(),
        ",\n is invalid because it is not in the same kernel.");
  }

  for (Val* output : expr->outputs()) {
    TORCH_INTERNAL_ASSERT(
        inContainer(output),
        "Output\n",
        output->toString(),
        " to expr,\n",
        expr->toString(),
        ",\n is invalid because it is not in the same kernel.");
  }

  // Register expr is explicitly non-SSA when coming from a kernel. This is
  // detected inside Fusion::registerExpr
  Fusion::registerExpr(expr);
}

std::vector<Expr*>& KernelInternalProxy::topLevelExprs() {
  return kernel_->top_level_exprs_;
}

void KernelPerformanceProfile::registerExpr(const Expr* expr) {
  if (expr_entry_map_.find(expr) != expr_entry_map_.end()) {
    return;
  }

  auto slot = getNewIndex();
  expr_entry_map_.emplace(expr, slot);
}

int KernelPerformanceProfile::getNewIndex() {
  return num_profile_entries_++;
}

bool KernelPerformanceProfile::isProfiled(const Expr* expr) const {
  return expr_entry_map_.find(expr) != expr_entry_map_.end();
}

c10::optional<int> KernelPerformanceProfile::getIndex(const Expr* expr) const {
  auto it = expr_entry_map_.find(expr);
  if (it == expr_entry_map_.end()) {
    return c10::optional<int>();
  } else {
    return it->second;
  }
}

std::array<int, 2> KernelPerformanceProfile::getIndicesInProfileBuffer(
    const Expr* expr) const {
  TORCH_INTERNAL_ASSERT(
      isProfiled(expr), "Not a profiled expression: ", expr->toString());
  int cycle_index = getIndex(expr).value() * 2;
  int count_index = cycle_index + 1;
  return {cycle_index, count_index};
}

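// Layout implied by the two functions above: each registered expression i
// owns a pair of slots holding (accumulated cycles, invocation count).
// getIndicesInProfileBuffer() maps entry i to flat indices 2*i and 2*i + 1,
// while toString() below reads the same data as row i, columns 0 and 1 of
// the 2-D host-side buffer.
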
std::string KernelPerformanceProfile::toString(const at::Tensor& buffer) const {
  std::stringstream ss;
  ss << "Kernel performance profile:\n";
  if (!buffer.defined()) {
    ss << "No profile found\n";
    return ss.str();
  }

  double kilo_freq = at::cuda::getCurrentDeviceProperties()->clockRate;

  ss << std::setprecision(3) << std::fixed;

  for (const auto& kv : expr_entry_map_) {
    auto expr = kv.first;
    auto index = kv.second;
    auto out_tv = ir_utils::getTvOutput(expr);
    double cycles = static_cast<double>(buffer[index][0].item<int64_t>());
    auto count = buffer[index][1].item<int64_t>();
    auto cycles_per_call = count == 0 ? 0.0 : cycles / count;
    auto us_per_call = cycles_per_call / kilo_freq * 1000.0;
    ss << expr->getExprType().value() << ", T" << out_tv->name() << ", "
       << us_per_call << " us, " << count << "\n";
  }

  return ss.str();
}

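// Example of the report produced above (values made up for illustration;
// the expression-type spelling comes from ExprType's stream operator):
//
//   Kernel performance profile:
//   ReductionOp, T3, 1.234 us, 128
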
} // namespace kir
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch