Summary:
Things changed in this PR that require review:
1. aten/src/ATen/core/interned_strings.h
2. torch/csrc/jit/ir/alias_analysis.h : exposing createValue to allow efficient mutation
3. torch/csrc/jit/runtime/symbolic_shape_registry.cpp : added gelu/tanh/erf in registry
4. torch/jit/_script.py : throws when scripting a model that uses autocast as a decorator, since that's not supported
nvfuser code update:
1. codegen improvements and performance tuning
2. integration bug fixes for shape expression logic
3. kernel segmentation update to address perf regression from horizontal fusion
4. scalar CPU tensor promotion to support inter-device operations between a CPU scalar tensor and a CUDA tensor (see the sketch below)
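For context, this is the shape of mixed-device expression the promotion targets; a minimal sketch using the PyTorch C++ frontend (shapes and values are illustrative):

    // A 0-dim CPU scalar tensor combined with a CUDA tensor; the scalar is
    // promoted so the multiply can run on the CUDA device.
    #include <torch/torch.h>

    int main() {
      torch::Tensor cpu_scalar = torch::scalar_tensor(2.0f); // 0-dim, on CPU
      torch::Tensor cuda_t = torch::rand({4, 4}, torch::kCUDA);
      torch::Tensor out = cuda_t * cpu_scalar; // result lives on the GPU
      return 0;
    }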
Things reverted from local changes:
aten::gelu with approximation (tracked in PR: https://github.com/pytorch/pytorch/pull/61439)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/72127
Reviewed By: HamidShojanazeri
Differential Revision: D34113233
Pulled By: jbschlosser
fbshipit-source-id: b82cde32b71e324eca0ea57cb8c9f9647278ca74
(cherry picked from commit e009bc5c4e)
177 lines · 6.1 KiB · C++
#include "utils.h"
|
|
|
|
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
|
|
|
|
#include <sstream>
|
|
|
|
using namespace torch::jit::fuser::cuda;
|
|
|
|

// Builds a human-readable summary of the reduction scheduler parameters,
// used below as part of the benchmark label.
std::string toString(ReductionParams rparams) {
  std::stringstream ss;
  ss << (rparams.fastest_dim ? "Red On Fastest Dim // " : "Red On Slow Dim // ")
     << (rparams.persistent_kernel ? "Persistent Kernel // " : "")
     << (rparams.project_persistent_buffers ? "Project Persistent Buffers // "
                                            : "");

  if (rparams.schedule_3D) {
    ss << "3D Schedule // "
       << "Outer Reduction: "
       << (rparams.cross_block_outer_reduction ? "cross block / " : "")
       << (rparams.cross_grid_outer_reduction ? "cross grid / " : "")
       << (rparams.split_grid_dim_outer_reduction ? "split grid dim / " : "");
    if (rparams.batches_per_block_outer_reduction > 1 ||
        rparams.persistent_kernel) {
      ss << "persistent batch - " << rparams.batches_per_block_outer_reduction
         << " / ";
    }
  }

  ss << " // Iteration Domain: "
     << (rparams.multiple_reds_per_blk ? "multiple reductions per block / "
                                       : "")
     << (rparams.split_grid_dim_iter_dom ? "split grid dimension / " : "")
     << (rparams.vectorize_iter_dom ? "vectorize / " : "")
     << (rparams.unroll_iter_dom && !rparams.vectorize_iter_dom ? "unroll / "
                                                                : "");
  if (rparams.unroll_iter_dom || rparams.vectorize_iter_dom) {
    ss << "factor " << rparams.unroll_factor_iter_dom;
  }

  ss << " // Inner Reduction Domain: "
     << (rparams.cross_block_inner_reduction ? "cross block reduction / " : "")
     << (rparams.pad_inner_reduction_to_warp ? "pad to warp / " : "")
     << (rparams.cross_grid_inner_reduction ? "cross grid reduction / " : "");

  if (rparams.batches_per_block_inner_reduction > 1 ||
      rparams.persistent_kernel) {
    ss << "persistent batch - " << rparams.batches_per_block_inner_reduction
       << " / ";
  }

  ss << (rparams.cross_grid_inner_reduction &&
                 rparams.split_grid_dim_inner_reduction
             ? "split grid dimension / "
             : "")
     << (rparams.vectorize_inner_reduction ? "vectorize / " : "")
     << (rparams.unroll_inner_reduction && !rparams.vectorize_inner_reduction
             ? "unroll / "
             : "");
  if (rparams.unroll_inner_reduction || rparams.vectorize_inner_reduction) {
    ss << "factor " << rparams.unroll_factor_inner_reduction;
  }
  return ss.str();
}

// Builds a human-readable summary of the pointwise scheduler parameters.
std::string toString(PointwiseParams params) {
  std::stringstream ss;
  if (params.break_point) {
    ss << "2D Schedule at " << params.break_point << "/";
    if (params.split_block) {
      ss << " Split block into y-dim/";
    }
    if (params.split_grid_y_dim) {
      ss << " Split y grid dim/";
    }
  } else {
    ss << "1D/";
  }
  if (params.inner_factor > 1) {
    if (params.vectorize) {
      ss << "Vectorize, Factor: " << params.inner_factor;
    } else {
      ss << "Unroll, Factor: " << params.inner_factor;
    }
  }
  return ss.str();
}

// Formats the launch configuration (block/grid dimensions and shared
// memory size) for the benchmark label.
std::string toString(LaunchParams lparams) {
  std::stringstream ss;
  ss << "/Launch_Parameters["
     << "block(" << lparams.bdimz() << "/" << lparams.bdimy() << "/"
     << lparams.bdimx() << ")/grid(" << lparams.gdimz() << "/"
     << lparams.gdimy() << "/" << lparams.gdimx() << ")/" << lparams.smem()
     << "]";
  return ss.str();
}

// Evicts the GPU L2 cache between timed iterations by allocating a buffer
// the size of L2 and copying it, so every iteration starts cold.
void clearL2Cache() {
  torch::NoGradGuard no_grad;
  auto l2_cache_size = at::cuda::getCurrentDeviceProperties()->l2CacheSize;
  auto options =
      torch::TensorOptions().dtype(torch::kFloat32).device(at::kCUDA, 0);

  auto l2_elems = l2_cache_size / 4; // 4 bytes per float element
  torch::Tensor t0 = torch::empty(l2_elems, options);
  torch::Tensor t1 = torch::clone(t0);
}

// Builds a symbolic, fully contiguous TensorView input with the given rank
// and element type.
TensorView* makeContigTensor(size_t ndims, DataType dtype) {
  return TensorViewBuilder()
      .ndims(ndims)
      .dtype(dtype)
      .contiguity(std::vector<bool>(ndims, true))
      .build();
}
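
// For illustration only: a minimal sketch of a fusion definition consuming
// makeContigTensor. The pointwise add is an assumption for the example,
// not an op this file defines; real benchmarks build their own fusions.
static void exampleFusionSetup(Fusion& fusion) {
  FusionGuard fg(&fusion);
  TensorView* tv0 = makeContigTensor(2, DataType::Float);
  fusion.addInput(tv0);
  TensorView* tv1 = add(tv0, tv0);
  fusion.addOutput(tv1);
}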

// Runs the fusion once to compile and warm up, then times each benchmark
// iteration. Unsegmented fusions are timed with the executor's own kernel
// timer; segmented fusions fall back to a host-side CUDA timer.
void runBenchmarkIterations(
    benchmark::State& benchmark_state,
    FusionExecutorCache* fusion_executor_cache,
    std::vector<c10::IValue>& aten_inputs) {
  fusion_executor_cache->runFusionWithInputs(aten_inputs);
  bool segmented =
      fusion_executor_cache->getMostRecentKernelRuntime()->isSegmented();

  if (!segmented) {
    fusion_executor_cache->profile(true);
    fusion_executor_cache->runFusionWithInputs(aten_inputs);
    auto compile_log = fusion_executor_cache->getMostRecentExecutorInfo();
    auto executor_instance = compile_log.fusion_executor;
    TORCH_INTERNAL_ASSERT(compile_log.reduction_params.has_value());
    TORCH_INTERNAL_ASSERT(compile_log.launch_constraints.has_value());
    auto rparams = toString(compile_log.reduction_params.value());
    auto lparams = toString(compile_log.launch_constraints.value());
    benchmark_state.SetLabel(rparams + lparams);
    executor_instance->setMeasureKernelTimeFlag(true);

    // Sync everything up before we start
    cudaDeviceSynchronize();
    for (auto _ : benchmark_state) {
      clearL2Cache();
      auto cg_outputs = fusion_executor_cache->runFusionWithInputs(aten_inputs);
      benchmark_state.SetIterationTime(
          executor_instance->kernelTimeMs() / 1000.0);
    }
    // Sync everything up before we're finished; don't want to run ahead on
    // the CPU while benchmarking.
    cudaDeviceSynchronize();
  } else {
    // Segmented
    {
      // Compile/warmup
      auto cg_outputs = fusion_executor_cache->runFusionWithInputs(aten_inputs);
    }
    // Sync everything up before we start
    cudaDeviceSynchronize();
    CudaKernelTimer timer;
    for (auto _ : benchmark_state) {
      clearL2Cache();
      timer.restart();
      auto cg_outputs = fusion_executor_cache->runFusionWithInputs(aten_inputs);
      benchmark_state.SetIterationTime(timer.elapsed() / 1000.0);
    }
    // Sync everything up before we're finished; don't want to run ahead on
    // the CPU while benchmarking.
    cudaDeviceSynchronize();
  }
}
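
// A minimal sketch of wiring runBenchmarkIterations into Google Benchmark;
// the fusion body (exampleFusionSetup above), the shapes, and the benchmark
// name are hypothetical, not part of the original suite.
static void ExampleBenchmark_NvFuser(benchmark::State& benchmark_state) {
  auto fusion_ptr = std::make_unique<Fusion>();
  exampleFusionSetup(*fusion_ptr);
  FusionExecutorCache executor_cache(std::move(fusion_ptr));

  at::Tensor t0 =
      at::randn({1024, 1024}, at::TensorOptions().device(at::kCUDA, 0));
  std::vector<c10::IValue> inputs({t0});
  runBenchmarkIterations(benchmark_state, &executor_cache, inputs);
}
// UseManualTime() pairs with the SetIterationTime calls above.
BENCHMARK(ExampleBenchmark_NvFuser)->UseManualTime();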

namespace executorCache {
// One executor map per thread, so concurrent benchmark threads can cache
// executors without synchronization.
thread_local ExecutorMap executor_map_;
ExecutorMap& getGlobalMap() {
  return executor_map_;
}
} // namespace executorCache