pytorch/torch/csrc/jit/codegen/cuda/kernel.h
jjsjann123 b21a6ff639 [NVFuser] Upstream push 0811 (#83239)
Syncing nvfuser devel branch to upstream master. https://github.com/csarofeen/pytorch/

Code changes include:

- codegen improvements:
  1. double support in expression evaluator
- bug fixes:
  1. dropout fix - rework RNG to support broadcasted dropout (Fixes #82784)
  2. expand fix - Patch expand+reduction, expand+view, rework view analysis and guard
- scheduler:
  1. manual transpose schedule example
  2. WIP transpose scheduler

Commits in this PR from the devel branch:

```
b7435afcd22c917713c2f41a7237bc26e1183f14 Transpose scheduler, step 1 (#1854)
8a45dbf72034684eb8e18b1835b533e90b68f184 Add an example on how to manually schedule transpose (#1889)
83dbf56a9554b2efbd5416461d938fff477b0b27 Patch dropout fix (#1898)
69d3519a532250719b1aa8341b50e067b181b42d Expand+Reduction, Expand+View support, rework View analysis and guards (#1883)
15091c488e96343bdc49e3990acbf238a3b3da51 Rework RNG to correctly support broadcasted dropout (#1888)
aafe2d048aaac596e503596a41303423619f3954 Make ExpressionEvaluator support Double (#1885)
```

RUN_TORCHBENCH: nvfuser

Differential Revision: [D38657074](https://our.internmc.facebook.com/intern/diff/D38657074)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/83239
Approved by: https://github.com/davidberard98
2022-08-25 02:23:22 +00:00


#pragma once
#include <c10/macros/Export.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_base_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/lower_sync_information.h>
#include <torch/csrc/jit/codegen/cuda/lower_warp_reduce.h>
#include <torch/csrc/jit/codegen/cuda/parallel_dimension_map.h>
#include <torch/csrc/jit/codegen/cuda/utils.h>
#include <torch/csrc/jit/codegen/cuda/vectorization_info.h>
#include <array>
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
namespace kir {
//! Summary of interesting facts about the kernel
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
struct KernelSummary {
  //! Count of WAR (write-after-read) hazard barriers
  int war_hazard_syncs_count = 0;

  //! List of global buffers
  std::vector<const kir::Allocate*> global_allocations;

  //! List of dynamic shared memory buffers
  std::vector<const kir::Allocate*> dynamic_smem_allocations;

  //! List of static shared memory buffers
  std::vector<const kir::Allocate*> static_smem_allocations;

  //! Largest RNG offset used by the kernel; -1 indicates that no random
  //! numbers need to be generated
  int max_rng_offsets = -1;

  //! Do we have any block reductions?
  bool has_block_reductions = false;

  //! Do we have any grid reductions?
  bool has_grid_reductions = false;

  //! Do we have any grid reduction in a loop, or grid reductions dependent on
  //! grid reductions?
  bool has_cooperative_grid_reduction = false;

  //! Do we have any block broadcasts?
  bool has_block_broadcasts = false;

  //! Do we have any grid broadcasts?
  bool has_grid_broadcasts = false;

  //! Do we have any welford ops?
  bool has_welford = false;

  //! Do we have any block-level welford ops?
  bool has_block_welford = false;

  //! Do we have any grid-level welford ops?
  bool has_grid_welford = false;

  //! Largest shared memory buffer base type
  DataType largest_smem_data_type = DataType::Null;

  //! Do we have allocations of dynamic local memory?
  bool has_dynamic_local_memory_allocations = false;

  //! List of dynamic local memory buffers.
  //! Only used for debugging.
  std::vector<const kir::Allocate*> dynamic_lmem_allocations;

  //! ceilDiv extents that must be divisible
  std::vector<std::pair<const Val*, const Val*>> splits_to_validate;

  //! Effective ParallelTypes of broadcast ops
  std::unordered_map<const BroadcastOp*, ParallelTypeBitmap>
      broadcast_parallel_types;

  //! Track which tensor views are inputs or outputs of a vectorized operation
  //! and their maximum vectorized access size
  std::unordered_map<TensorView*, int> vectorized_accesses;

  //! Sync map is needed to figure out if global memory buffers need to be
  //! marked as volatile because they're used for communication.
  SyncMap sync_map;

  //! Parallel dimension map needed to set the correct properties of grid
  //! buffers (is a dim inactive)
  ParallelDimensionMap parallel_dimension_map_;

  //! Track information on vectorized set operations for runtime validation
  std::vector<VectorizedSetInfo> vectorized_set_info;
};
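//! Runtime performance profile of a lowered kernel.
//!
//! The profile is backed by an Nx2 integer tensor (see buffer_ below): row i
//! holds the cycles spent and the execution count for the i-th profiled
//! expression.
//!
//! Illustrative sketch of the interface only; `expr` is a placeholder and the
//! snippet is not taken from actual call sites:
//!
//!   KernelPerformanceProfile profile;
//!   profile.registerExpr(expr);        // mark expr for profiling
//!   if (profile.isProfiled(expr)) {
//!     // positions of expr's {cycles, count} entries in the backing buffer
//!     auto indices = profile.getIndicesInProfileBuffer(expr);
//!   }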
class TORCH_CUDA_CU_API KernelPerformanceProfile {
 public:
  //! Register an expression to profile
  void registerExpr(const Expr* expr);

  //! Query if an expression is profiled
  bool isProfiled(const Expr* expr) const;

  //! Get the number of profiled expressions
  int getNumberOfProfileEntries() const {
    return num_profile_entries_;
  }

  //! Set the backing buffer of profile.
  void setBuffer(TensorView* buffer) {
    buffer_ = buffer;
  }

  //! Get the backing buffer
  TensorView* getBuffer() const {
    return buffer_;
  }

  //! Get the indices of the profile of an expression in the backing buffer
  std::array<int, 2> getIndicesInProfileBuffer(const Expr* expr) const;

  std::string toString(const at::Tensor& buffer) const;

 private:
  //! Get the new profile index
  int getNewIndex();

  //! Get the profile index
  c10::optional<int> getIndex(const Expr* expr) const;

 private:
  int num_profile_entries_ = 0;

  //! Backing buffer of Nx2 integer tensor, where N is the number of profiled
  //! regions. Each region has two integer values, one representing
  //! the cycles spent, and another the count.
  TensorView* buffer_ = nullptr;

  //! Map profiled expressions to profile entry offsets
  std::unordered_map<const Expr*, int> expr_entry_map_;

  // TODO: Allow profiling of ForLoops
  //! Map profiled ForLoop to profile entry offsets
  // std::unordered_map<const kir::ForLoop*, int> loop_entry_map_;
};
class KernelInternalProxy;
//! Container for a lowered Kernel IR
//!
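//! A hedged usage sketch (`fusion`, `top_level_exprs`, and the launch-config
//! step are placeholders, not taken from the actual lowering driver):
//!
//!   Kernel kernel(fusion, DataType::Int); // grabs nodes from the Fusion
//!   kernel.finalize(top_level_exprs);     // runs analysis, builds summary
//!   if (kernel.summary().has_block_reductions) {
//!     // ... e.g. adjust launch parameters accordingly
//!   }
//!   kernel.print();                       // debug dump of the Kernel IR
//!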
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
class TORCH_CUDA_CU_API Kernel final : public Fusion {
  friend KernelInternalProxy;

 public:
  // Kernel starts by grabbing all the nodes from the provided fusion.
  // Kernel is not SSA: if a definition is not set, we should update it, but
  // we should not remove a previous definition if it is set. This is primarily
  // because when we do something like generate an initialization statement for
  // a reduction TV, we may want to continue to do fusion-like analysis on the
  // original expression.
  // TODO: Assert index type is int or int32
  Kernel(Fusion* fusion, DataType index_type = DataType::Int)
      : Fusion(*fusion), index_type_(index_type) {}

  Kernel() = delete;

  // No move or copy semantics
  Kernel(const Kernel&) = delete;
  Kernel& operator=(const Kernel&) = delete;

  //! Finalize a kernel definition
  //!
  //! At this point we have a complete kernel definition and we can
  //! run analysis passes to build a KernelSummary.
  void finalize(std::vector<Expr*> top_level_exprs);

  const std::vector<Expr*>& topLevelExprs() const {
    return top_level_exprs_;
  }

  const KernelSummary& summary() const {
    return summary_;
  }

  DataType indexType() const {
    return index_type_;
  }

  //! Checks if parallel type is padded
  bool isParallelTypePadded(ParallelType ptype) const {
    return ptype == ParallelType::TIDx &&
        warp_padded_parallel_info_.is_tidx_padded;
  }

  const WarpPaddedParallelInfo& getWarpPaddedParallelInfo() const {
    return warp_padded_parallel_info_;
  }

  const KernelPerformanceProfile& profile() const {
    return profile_;
  }

  //! Debug dump of the Kernel IR
  void print() const;

 protected:
  //! Register the Val with this fusion
  void registerVal(Val* val) override;

  //! Register expr with this fusion.
  //! When we register an expression, we want to update the dependency tracking
  //! of Vals. We add expr to our general expr_set_.
  void registerExpr(Expr* expr) override;

 private:
  // Analyze the kernel IR and cache a summary of interesting data
  void analyze();

  // Top level statements
  std::vector<Expr*> top_level_exprs_;

  // Summary of interesting kernel data
  KernelSummary summary_;

  // Is this kernel being compiled with int32 or int64 indexing? This
  // information is required to resolve DataType::Index.
  DataType index_type_ = DataType::Int;

  WarpPaddedParallelInfo warp_padded_parallel_info_;

  KernelPerformanceProfile profile_;
};
//! A special debugging proxy for Kernel.
//!
//! Should not be used for anything other than testing and debugging.
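//!
//! Hypothetical use from a test (`kernel` and `some_expr` are placeholders):
//!
//!   KernelInternalProxy proxy(kernel);
//!   proxy.topLevelExprs().push_back(some_expr); // mutate the kernel IR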
class TORCH_CUDA_CU_API KernelInternalProxy {
 public:
  KernelInternalProxy(Kernel* kernel) : kernel_(kernel) {}

  std::vector<Expr*>& topLevelExprs();

 private:
  Kernel* kernel_ = nullptr;
};
} // namespace kir
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch