Summary:
Things changed in this PR that require review:
1. aten/src/ATen/core/interned_strings.h
2. torch/csrc/jit/ir/alias_analysis.h : exposing createValue to allow efficient mutation
3. torch/csrc/jit/runtime/symbolic_shape_registry.cpp : added gelu/tanh/erf to the registry
4. torch/jit/_script.py : throws an error when scripting a model that uses autocast as a decorator, since this is not supported
nvfuser code update:
1. codegen improvements and performance tuning
2. integration bug fixes for shape expression logic
3. kernel segmentation update to address perf regression from horizontal fusion
4. scalar CPU tensor promotion to support inter-device operations between a CPU scalar tensor and a CUDA tensor (see the sketch below)
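For context on item 4, a minimal sketch of the eager-mode behavior the fuser now matches (illustration only, not part of this PR's diff; the function name is hypothetical):

#include <ATen/ATen.h>

// Hypothetical illustration: ATen allows a 0-dim CPU tensor to act as a
// scalar operand to a CUDA op, and fused kernels now handle the same pattern.
void cpu_scalar_promotion_example() {
  at::Tensor cuda_t = at::randn({8}, at::kCUDA);
  at::Tensor cpu_scalar = at::scalar_tensor(2.0f); // 0-dim CPU tensor
  // Allowed: the CPU scalar tensor is treated as a scalar, and the result
  // stays on the CUDA device.
  at::Tensor out = cuda_t * cpu_scalar;
}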
Things reverted from local changes:
aten::gelu with approximation (tracked in PR: https://github.com/pytorch/pytorch/pull/61439)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/72127
Reviewed By: HamidShojanazeri
Differential Revision: D34113233
Pulled By: jbschlosser
fbshipit-source-id: b82cde32b71e324eca0ea57cb8c9f9647278ca74
(cherry picked from commit e009bc5c4e)
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>

#include <benchmark/benchmark.h>

#include <cuda_runtime.h>

#include "utils.h"

using namespace torch::jit::fuser::cuda;
namespace {

// Make a tensor of dimensionality ndims that is known to be non-contiguous
// but has unknown sizes
TensorView* makeSymbolicTensor(size_t ndims, DataType dtype = DataType::Float) {
  return TensorViewBuilder().ndims(ndims).dtype(dtype).build();
}

// Make a non-contiguous tensor of compile-time known sizes
TensorView* makeConcreteTensor(
    std::vector<int64_t> shape,
    DataType dtype = DataType::Float) {
  return TensorViewBuilder().shape(shape).dtype(dtype).build();
}

} // namespace
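// Build a layer_norm_backward fusion, compile and run it once through a
// FusionExecutorCache (populating the cache), and return the most recent
// kernel runtime so the caller can query heuristics for these inputs.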
static auto getLayerBackwardNormRuntime(
    std::unique_ptr<Fusion> fusion_ptr,
    std::unique_ptr<FusionExecutorCache>& fec,
    std::vector<at::IValue>& aten_inputs,
    std::vector<int64_t>& shape,
    std::vector<int64_t>& norm_shape) {
  Fusion& fusion = *fusion_ptr.get();

  const size_t kM = shape.size();
  const size_t kN = norm_shape.size();
  const size_t kOuterNumDims = kM - kN;

  std::vector<int64_t> outer_shape;
  for (size_t idx = 0; idx < kOuterNumDims; ++idx) {
    outer_shape.push_back(shape[idx]);
  }
  for (size_t idx = kOuterNumDims; idx < kM; ++idx) {
    outer_shape.push_back(1);
  }

  auto grad_out = makeSymbolicTensor(shape.size());
  auto input = makeSymbolicTensor(shape.size());
  auto mean = makeConcreteTensor(outer_shape);
  auto rstd = makeConcreteTensor(outer_shape);
  auto weight = makeSymbolicTensor(norm_shape.size());
  auto bias = makeSymbolicTensor(norm_shape.size());
  fusion.addInput(grad_out);
  fusion.addInput(input);
  fusion.addInput(mean);
  fusion.addInput(rstd);
  fusion.addInput(weight);
  fusion.addInput(bias);

  auto grads = layer_norm_backward(
      grad_out,
      input,
      norm_shape,
      mean,
      rstd,
      weight,
      bias,
      {true, true, true});

  fusion.addOutput(grads.grad_input);
  fusion.addOutput(grads.grad_weight);
  fusion.addOutput(grads.grad_bias);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_grad_out = at::randn(shape, options);
  at::Tensor aten_input = at::randn(shape, options);
  at::Tensor aten_weight = at::randn(norm_shape, options);
  at::Tensor aten_bias = at::randn(norm_shape, options);
  auto at_weight = c10::optional<at::Tensor>(aten_weight);
  auto at_bias = c10::optional<at::Tensor>(aten_bias);

  const float kEps = 1e-5;
  auto aten_results =
      at::native_layer_norm(aten_input, norm_shape, at_weight, at_bias, kEps);
  auto aten_output = std::get<0>(aten_results);
  auto aten_mean = std::get<1>(aten_results);
  auto aten_rstd = std::get<2>(aten_results);

  fec = std::make_unique<FusionExecutorCache>(std::move(fusion_ptr));
  aten_inputs = {
      aten_grad_out, aten_input, aten_mean, aten_rstd, aten_weight, aten_bias};
  auto cg_outputs = fec->runFusionWithInputs(aten_inputs);

  return fec->getMostRecentKernelRuntime();
}
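// Shared driver for the backward shape-inference benchmarks. When
// disable_launch_parameter_cache is true, launch parameters are recomputed
// (and shapes re-inferred) on every iteration; otherwise the cached values
// are reused, giving the baseline.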
void LayerNormBackward_ShapeInference_Base(
    benchmark::State& benchmark_state,
    bool disable_launch_parameter_cache) {
  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
  FusionGuard fg(fusion_ptr.get());

  // Pre-allocate the cache and input vector filled in by the helper.
  std::unique_ptr<FusionExecutorCache> fec;
  std::vector<at::IValue> aten_inputs;

  std::vector<int64_t> shape{20, 100, 35, 67};
  std::vector<int64_t> norm_shape{67};

  auto runtime = getLayerBackwardNormRuntime(
      std::move(fusion_ptr), fec, aten_inputs, shape, norm_shape);
  TORCH_INTERNAL_ASSERT(
      runtime->getMaybeHeuristicsFor(aten_inputs).has_value());

  fec->profile(true);
  fec->disableKernelLaunch();
  fec->runFusionWithInputs(aten_inputs);
  if (disable_launch_parameter_cache) {
    fec->disableLaunchParamCache();
  }

  for (auto _ : benchmark_state) {
    // With kernel launch disabled, only the host-side logic is measured.
    fec->runFusionWithInputs(aten_inputs);
  }
}
static void LayerNormBackward_ShapeInference(
    benchmark::State& benchmark_state) {
  LayerNormBackward_ShapeInference_Base(benchmark_state, true);
}

static void LayerNormBackward_NoShapeInferenceCachedBaseline(
    benchmark::State& benchmark_state) {
  LayerNormBackward_ShapeInference_Base(benchmark_state, false);
}
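// Build a forward layer_norm fusion (no affine weight/bias: nullptr is
// passed for both) and return the kernel runtime after one warm-up run,
// mirroring getLayerBackwardNormRuntime above.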
static auto getLayerForwardNormRuntime(
    std::unique_ptr<Fusion> fusion_ptr,
    std::unique_ptr<FusionExecutorCache>& fec,
    std::vector<at::IValue>& aten_inputs,
    std::vector<int64_t>& shape,
    std::vector<int64_t>& norm_shape) {
  Fusion& fusion = *fusion_ptr.get();

  const float kEps = 1e-5;
  Double* eps_ptr = IrBuilder::create<Double>(kEps);

  auto input = makeSymbolicTensor(shape.size());
  fusion.addInput(input);

  auto result = layer_norm(input, norm_shape, nullptr, nullptr, eps_ptr);

  fusion.addOutput(result.output);
  fusion.addOutput(result.mean);
  fusion.addOutput(result.invstd);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn(shape, options);

  fec = std::make_unique<FusionExecutorCache>(std::move(fusion_ptr));
  aten_inputs = {aten_input};
  auto cg_outputs = fec->runFusionWithInputs(aten_inputs);

  return fec->getMostRecentKernelRuntime();
}
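// Forward counterpart of LayerNormBackward_ShapeInference_Base; the
// measurement setup is identical.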
void LayerNormForward_ShapeInferenceBase(
    benchmark::State& benchmark_state,
    bool disable_launch_param_cache) {
  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
  FusionGuard fg(fusion_ptr.get());

  // Pre-allocate the cache and input vector filled in by the helper.
  std::unique_ptr<FusionExecutorCache> fec;
  std::vector<at::IValue> aten_inputs;

  std::vector<int64_t> shape{20, 100, 35, 67};
  std::vector<int64_t> norm_shape{67};

  auto runtime = getLayerForwardNormRuntime(
      std::move(fusion_ptr), fec, aten_inputs, shape, norm_shape);

  TORCH_INTERNAL_ASSERT(
      runtime->getMaybeHeuristicsFor(aten_inputs).has_value());

  fec->profile(true);
  fec->disableKernelLaunch();
  fec->runFusionWithInputs(aten_inputs);

  if (disable_launch_param_cache) {
    fec->disableLaunchParamCache();
  }

  for (auto _ : benchmark_state) {
    // With kernel launch disabled, only the host-side logic is measured.
    fec->runFusionWithInputs(aten_inputs);
  }
}
static void LayerNormForward_NoShapeInferenceCachedBaseline(
    benchmark::State& benchmark_state) {
  LayerNormForward_ShapeInferenceBase(benchmark_state, false);
}

static void LayerNormForward_ShapeInference(benchmark::State& benchmark_state) {
  LayerNormForward_ShapeInferenceBase(benchmark_state, true);
}
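// Register the four benchmarks. The *_ShapeInference variants pay the shape
// inference / launch parameter cost on every iteration, while the
// *_NoShapeInferenceCachedBaseline variants measure the fully cached path.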
BENCHMARK(LayerNormBackward_ShapeInference)->Unit(benchmark::kMicrosecond);
BENCHMARK(LayerNormForward_ShapeInference)->Unit(benchmark::kMicrosecond);
BENCHMARK(LayerNormBackward_NoShapeInferenceCachedBaseline)
    ->Unit(benchmark::kMicrosecond);
BENCHMARK(LayerNormForward_NoShapeInferenceCachedBaseline)
    ->Unit(benchmark::kMicrosecond);