#pragma once

#include <c10/macros/Export.h>

#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_base_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/lower_sync_information.h>
#include <torch/csrc/jit/codegen/cuda/lower_warp_reduce.h>
#include <torch/csrc/jit/codegen/cuda/parallel_dimension_map.h>
#include <torch/csrc/jit/codegen/cuda/utils.h>
#include <torch/csrc/jit/codegen/cuda/vectorization_info.h>

#include <memory>
#include <unordered_map>
#include <utility>
#include <vector>

namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
namespace kir {

//! Summary of interesting facts about the kernel
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
struct KernelSummary {
  //! Count of WAR (write-after-read) hazard barriers
  int war_hazard_syncs_count = 0;

  //! List of global buffers
  std::vector<const kir::Allocate*> global_allocations;

  //! List of dynamic shared memory buffers
  std::vector<const kir::Allocate*> dynamic_smem_allocations;

  //! List of static shared memory buffers
  std::vector<const kir::Allocate*> static_smem_allocations;

  //! Largest RNG offset used; -1 indicates random numbers are not needed
  int max_rng_offsets = -1;

  //! Do we have any block reductions?
  bool has_block_reductions = false;

  //! Do we have any grid reductions?
  bool has_grid_reductions = false;

  //! Do we have any grid reduction in a loop, or grid reductions dependent
  //! on grid reductions?
  bool has_cooperative_grid_reduction = false;

  //! Do we have any block broadcasts?
  bool has_block_broadcasts = false;

  //! Do we have any grid broadcasts?
  bool has_grid_broadcasts = false;

  //! Do we have any welford op?
  bool has_welford = false;

  //! Do we have any block-level welford op?
  bool has_block_welford = false;

  //! Do we have any grid-level welford op?
  bool has_grid_welford = false;

  //! Largest shared memory buffer base type
  DataType largest_smem_data_type = DataType::Null;

  //! Do we have allocations of dynamic local memory?
  bool has_dynamic_local_memory_allocations = false;

  //! List of dynamic local memory buffers.
  //! Only used for debugging.
  std::vector<const kir::Allocate*> dynamic_lmem_allocations;

  //! ceilDiv extents that must be divisible
  std::vector<std::pair<const Val*, const Val*>> splits_to_validate;

  //! Effective ParallelTypes of broadcast ops
  std::unordered_map<const BroadcastOp*, ParallelTypeBitmap>
      broadcast_parallel_types;

  //! Track which tensor views are inputs or outputs of a vectorized operation
  //! and their maximum vectorized access size
  std::unordered_map<TensorView*, int> vectorized_accesses;

  // Sync map is needed to figure out if global memory buffers need to be
  // marked as volatile because they're used for communication.
  SyncMap sync_map;

  // Parallel dimension map needed to set the correct properties of grid
  // buffers (e.g., whether a dimension is inactive)
  ParallelDimensionMap parallel_dimension_map_;

  //! Track information on vectorized set operations for runtime validation
  std::vector<VectorizedSetInfo> vectorized_set_info;
};

class TORCH_CUDA_CU_API KernelPerformanceProfile {
 public:
  //! Register an expression to profile
  void registerExpr(const Expr* expr);

  //! Query if an expression is profiled
  bool isProfiled(const Expr* expr) const;

  //! Get the number of profiled expressions
  int getNumberOfProfileEntries() const {
    return num_profile_entries_;
  }

  //! Set the backing buffer of the profile
  void setBuffer(TensorView* buffer) {
    buffer_ = buffer;
  }

  //! Get the backing buffer
  TensorView* getBuffer() const {
    return buffer_;
  }

  //! Get the indices of an expression's profile in the backing buffer
  std::array<int, 2> getIndicesInProfileBuffer(const Expr* expr) const;

  std::string toString(const at::Tensor& buffer) const;

 private:
  //! Get the new profile index
  int getNewIndex();

  //! Get the profile index
  c10::optional<int> getIndex(const Expr* expr) const;

 private:
  int num_profile_entries_ = 0;

  //! Backing buffer of an Nx2 integer tensor, where N is the number of
  //! profiled regions. Each region has two integer values, one representing
  //! the cycles spent and the other the count.
  TensorView* buffer_ = nullptr;
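  //
  // A hedged usage sketch (illustrative only; `profile_tensor` is a
  // hypothetical at::Tensor realized from the backing buffer, and how the
  // two returned indices map onto the Nx2 layout is an assumption, not
  // something this header specifies):
  //
  //   auto idx = profile.getIndicesInProfileBuffer(expr);
  //   auto value = profile_tensor[idx[0]][idx[1]];
  //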

  //! Map profiled expressions to profile entry offsets
  std::unordered_map<const Expr*, int> expr_entry_map_;

  // TODO: Allow profiling of ForLoops
  //! Map profiled ForLoops to profile entry offsets
  // std::unordered_map<const kir::ForLoop*, int> loop_entry_map_;
};

class KernelInternalProxy;

//! Container for a lowered Kernel IR
//!
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
class TORCH_CUDA_CU_API Kernel final : public Fusion {
  friend KernelInternalProxy;

 public:
  // Kernel starts by grabbing all the nodes from the provided fusion.
  // Kernel is not SSA: if a definition is not set, we should update it, but
  // we should not remove a previous definition if it is set. This is
  // primarily because when we do something like generate an initialization
  // statement for a reduction TV, we may want to continue doing fusion-like
  // analysis on the original expression.
  // TODO: Assert index type is int or int32
  Kernel(Fusion* fusion, DataType index_type = DataType::Int)
      : Fusion(*fusion), index_type_(index_type) {}

  Kernel() = delete;

  // No move or copy semantics
  Kernel(const Kernel&) = delete;
  Kernel& operator=(const Kernel&) = delete;

  //! Finalize a kernel definition
  //!
  //! At this point we have a complete kernel definition and we can
  //! run analysis passes to build a KernelSummary.
  void finalize(std::vector<Expr*> top_level_exprs);
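  //
  // A hedged usage sketch (illustrative only; kernels are normally produced
  // by the lowering pass rather than built by hand, and `fusion` / `exprs`
  // below are hypothetical names):
  //
  //   kir::Kernel kernel(fusion, DataType::Int32);
  //   kernel.finalize(std::move(exprs)); // runs analysis, fills the summary
  //   if (kernel.summary().has_block_reductions) { /* ... */ }
  //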

  const std::vector<Expr*>& topLevelExprs() const {
    return top_level_exprs_;
  }

  const KernelSummary& summary() const {
    return summary_;
  }

  DataType indexType() const {
    return index_type_;
  }

  //! Checks if parallel type is padded
  bool isParallelTypePadded(ParallelType ptype) const {
    return ptype == ParallelType::TIDx &&
        warp_padded_parallel_info_.is_tidx_padded;
  }

  const WarpPaddedParallelInfo& getWarpPaddedParallelInfo() const {
    return warp_padded_parallel_info_;
  }

  const KernelPerformanceProfile& profile() const {
    return profile_;
  }

  //! Debug dump of the Kernel IR
  void print() const;

 protected:
  //! Register the Val with this fusion
  void registerVal(Val* val) override;

  //! Register expr with this fusion.
  //! When we register an expression, we want to update the dependency
  //! tracking of Vals. We add expr to our general expr_set_.
  void registerExpr(Expr* expr) override;

 private:
  // Analyzes the kernel IR and caches the summary of interesting data
  void analyze();

  // Top level statements
  std::vector<Expr*> top_level_exprs_;

  // Summary of interesting kernel data
  KernelSummary summary_;

  // Is this kernel being compiled with int32 or int64 indexing? This
  // information is required to resolve DataType::Index.
  DataType index_type_ = DataType::Int;

  WarpPaddedParallelInfo warp_padded_parallel_info_;

  KernelPerformanceProfile profile_;
};

//! A special debugging proxy for Kernel.
//!
//! Should not be used for anything other than testing and debugging.
class TORCH_CUDA_CU_API KernelInternalProxy {
 public:
  KernelInternalProxy(Kernel* kernel) : kernel_(kernel) {}

  std::vector<Expr*>& topLevelExprs();

 private:
  Kernel* kernel_ = nullptr;
};
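//
// A hedged usage sketch (testing/debugging only; `kernel` is a hypothetical
// lowered kir::Kernel):
//
//   KernelInternalProxy proxy(&kernel);
//   proxy.topLevelExprs().clear(); // mutate the kernel body directly
//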

} // namespace kir
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch