// pytorch/benchmarks/cpp/nvfuser/shape_inference.cpp

#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>

#include <benchmark/benchmark.h>

#include <cuda_runtime.h>

#include "utils.h"

using namespace torch::jit::fuser::cuda;

namespace {

// Creates a TensorView with `ndims` dimensions and element type `dtype`.
// Only the rank is given to the builder: the sizes stay unknown and no
// contiguity is requested, so the tensor is treated as non-contiguous.
TensorView* makeSymbolicTensor(size_t ndims, DataType dtype = DataType::Float) {
  auto builder = TensorViewBuilder();
  builder.ndims(ndims);
  builder.dtype(dtype);
  return builder.build();
}

// Creates a TensorView whose sizes are fixed at compile time to `shape`,
// with element type `dtype`. No contiguity is requested from the builder, so
// the tensor is treated as non-contiguous.
TensorView* makeConcreteTensor(
    std::vector<int64_t> shape,
    DataType dtype = DataType::Float) {
  auto builder = TensorViewBuilder();
  builder.shape(shape);
  builder.dtype(dtype);
  return builder.build();
}

} // namespace

// Defines a layer-norm backward fusion in `fusion_ptr`, hands the fusion to a
// freshly created FusionExecutorCache, runs it once on random CUDA inputs and
// returns the kernel runtime reported for that run.
//
// Parameters:
//   fusion_ptr  - fusion to populate; ownership is moved into `fec`. The
//                 callers in this file install a FusionGuard on it before
//                 calling.
//   fec         - out: receives the newly created FusionExecutorCache.
//   aten_inputs - out: overwritten with the ATen inputs used for the run, in
//                 the order {grad_out, input, mean, rstd, weight, bias}.
//   shape       - sizes of the input and grad_out tensors.
//   norm_shape  - sizes of the normalized dimensions; must not have more
//                 dimensions than `shape`.
//
// Returns the result of fec->getMostRecentKernelRuntime() after the run.
static auto getLayerBackwardNormRuntime(
    std::unique_ptr<Fusion> fusion_ptr,
    std::unique_ptr<FusionExecutorCache>& fec,
    std::vector<at::IValue>& aten_inputs,
    std::vector<int64_t>& shape,
    std::vector<int64_t>& norm_shape) {
  Fusion& fusion = *fusion_ptr;

  const size_t kM = shape.size();
  const size_t kN = norm_shape.size();
  // The subtraction below is unsigned: without this guard a norm_shape with
  // more dimensions than shape would wrap around to a huge value.
  TORCH_INTERNAL_ASSERT(
      kN <= kM,
      "norm_shape has ",
      kN,
      " dimensions but shape only has ",
      kM);
  const size_t kOuterNumDims = kM - kN;

  // Shape used for mean/rstd: the leading (outer) sizes of `shape` are kept
  // and every normalized dimension is replaced by 1.
  std::vector<int64_t> outer_shape(
      shape.begin(), shape.begin() + kOuterNumDims);
  outer_shape.resize(kM, 1);

  // Fusion inputs. grad_out/input/weight/bias only fix their rank, while
  // mean/rstd are declared with the concrete outer_shape.
  auto grad_out = makeSymbolicTensor(shape.size());
  auto input = makeSymbolicTensor(shape.size());
  auto mean = makeConcreteTensor(outer_shape);
  auto rstd = makeConcreteTensor(outer_shape);
  auto weight = makeSymbolicTensor(norm_shape.size());
  auto bias = makeSymbolicTensor(norm_shape.size());
  fusion.addInput(grad_out);
  fusion.addInput(input);
  fusion.addInput(mean);
  fusion.addInput(rstd);
  fusion.addInput(weight);
  fusion.addInput(bias);

  // All three entries of the output mask are set, and all three gradients
  // are registered as fusion outputs below.
  auto grads = layer_norm_backward(
      grad_out,
      input,
      norm_shape,
      mean,
      rstd,
      weight,
      bias,
      {true, true, true});

  fusion.addOutput(grads.grad_input);
  fusion.addOutput(grads.grad_weight);
  fusion.addOutput(grads.grad_bias);

  // Random float inputs on CUDA device 0.
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_grad_out = at::randn(shape, options);
  at::Tensor aten_input = at::randn(shape, options);
  at::Tensor aten_weight = at::randn(norm_shape, options);
  at::Tensor aten_bias = at::randn(norm_shape, options);
  auto at_weight = c10::optional<at::Tensor>(aten_weight);
  auto at_bias = c10::optional<at::Tensor>(aten_bias);

  // Run the ATen forward layer norm only to obtain the mean and rstd tensors
  // that are fed to the backward fusion; its output tensor is not needed.
  const float kEps = 1e-5;
  auto aten_results =
      at::native_layer_norm(aten_input, norm_shape, at_weight, at_bias, kEps);
  auto aten_mean = std::get<1>(aten_results);
  auto aten_rstd = std::get<2>(aten_results);

  fec = std::make_unique<FusionExecutorCache>(std::move(fusion_ptr));
  aten_inputs = {
      aten_grad_out, aten_input, aten_mean, aten_rstd, aten_weight, aten_bias};
  // Run once so that a kernel runtime exists for these inputs; the outputs
  // themselves are not used.
  fec->runFusionWithInputs(aten_inputs);

  return fec->getMostRecentKernelRuntime();
}

// Shared body of the layer-norm backward shape-inference benchmarks.
//
// Builds the backward fusion for a fixed input size and runs it once via
// getLayerBackwardNormRuntime(). It then calls profile(true) and
// disableKernelLaunch() on the executor cache, runs the fusion once more,
// and finally times repeated runFusionWithInputs() calls on the same inputs.
//
// Parameters:
//   benchmark_state                - Google Benchmark state driving the
//                                    timed loop.
//   disable_launch_parameter_cache - when true, disableLaunchParamCache() is
//                                    called on the executor cache before the
//                                    timed loop starts.
void LayerNormBackward_ShapeInference_Base(
    benchmark::State& benchmark_state,
    bool disable_launch_parameter_cache) {
  // Fixed problem size: a 4-D input normalized over its last dimension.
  std::vector<int64_t> input_shape{20, 100, 35, 67};
  std::vector<int64_t> normalized_shape{67};

  auto fusion_holder = std::make_unique<Fusion>();
  FusionGuard fg(fusion_holder.get());

  // Filled in by getLayerBackwardNormRuntime().
  std::unique_ptr<FusionExecutorCache> executor_cache;
  std::vector<at::IValue> aten_inputs;

  auto runtime = getLayerBackwardNormRuntime(
      std::move(fusion_holder),
      executor_cache,
      aten_inputs,
      input_shape,
      normalized_shape);
  // The runtime produced by the first run must have heuristics for exactly
  // these inputs.
  TORCH_INTERNAL_ASSERT(
      runtime->getMaybeHeuristicsFor(aten_inputs).has_value());

  // One untimed run with profiling on and kernel launches disabled, before
  // the launch-parameter cache is (optionally) switched off.
  executor_cache->profile(true);
  executor_cache->disableKernelLaunch();
  executor_cache->runFusionWithInputs(aten_inputs);
  if (disable_launch_parameter_cache) {
    executor_cache->disableLaunchParamCache();
  }

  for (auto _ : benchmark_state) {
    // Timed region: this call is what the benchmark measures.
    executor_cache->runFusionWithInputs(aten_inputs);
  }
}

// Backward benchmark variant that runs the shared body with the
// launch-parameter cache disabled.
static void LayerNormBackward_ShapeInference(
    benchmark::State& benchmark_state) {
  constexpr bool kDisableLaunchParamCache = true;
  LayerNormBackward_ShapeInference_Base(
      benchmark_state, kDisableLaunchParamCache);
}

// Backward baseline variant that runs the shared body with the
// launch-parameter cache left enabled.
static void LayerNormBackward_NoShapeInferenceCachedBaseline(
    benchmark::State& benchmark_state) {
  constexpr bool kDisableLaunchParamCache = false;
  LayerNormBackward_ShapeInference_Base(
      benchmark_state, kDisableLaunchParamCache);
}

// Defines a layer-norm forward fusion (no weight, no bias) in `fusion_ptr`,
// hands the fusion to a freshly created FusionExecutorCache, runs it once on
// a random CUDA input and returns the kernel runtime reported for that run.
//
// Parameters:
//   fusion_ptr  - fusion to populate; ownership is moved into `fec`.
//   fec         - out: receives the newly created FusionExecutorCache.
//   aten_inputs - out: overwritten with the single ATen input tensor.
//   shape       - sizes of the input tensor.
//   norm_shape  - sizes of the normalized dimensions.
//
// Returns the result of fec->getMostRecentKernelRuntime() after the run.
static auto getLayerForwardNormRuntime(
    std::unique_ptr<Fusion> fusion_ptr,
    std::unique_ptr<FusionExecutorCache>& fec,
    std::vector<at::IValue>& aten_inputs,
    std::vector<int64_t>& shape,
    std::vector<int64_t>& norm_shape) {
  Fusion* fusion = fusion_ptr.get();

  // Epsilon is created as an IR scalar and passed to layer_norm().
  const float kEpsilon = 1e-5;
  Double* epsilon_val = IrBuilder::create<Double>(kEpsilon);

  // Single fusion input: a tensor of the same rank as `shape`.
  auto tv_input = makeSymbolicTensor(shape.size());
  fusion->addInput(tv_input);

  // Weight and bias are both passed as nullptr.
  auto norm_result =
      layer_norm(tv_input, norm_shape, nullptr, nullptr, epsilon_val);

  fusion->addOutput(norm_result.output);
  fusion->addOutput(norm_result.mean);
  fusion->addOutput(norm_result.invstd);

  // Random float input on CUDA device 0.
  const auto tensor_options =
      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor input_tensor = at::randn(shape, tensor_options);

  fec = std::make_unique<FusionExecutorCache>(std::move(fusion_ptr));
  aten_inputs = {input_tensor};
  // Run once so that a kernel runtime exists for these inputs. The outputs
  // are held until the function returns but are otherwise unused.
  auto first_run_outputs = fec->runFusionWithInputs(aten_inputs);

  return fec->getMostRecentKernelRuntime();
}

// Shared body of the layer-norm forward shape-inference benchmarks.
//
// Builds the forward fusion for a fixed input size and runs it once via
// getLayerForwardNormRuntime(). It then calls profile(true) and
// disableKernelLaunch() on the executor cache, runs the fusion once more,
// and finally times repeated runFusionWithInputs() calls on the same inputs.
//
// Parameters:
//   benchmark_state            - Google Benchmark state driving the timed
//                                loop.
//   disable_launch_param_cache - when true, disableLaunchParamCache() is
//                                called on the executor cache before the
//                                timed loop starts.
void LayerNormForward_ShapeInferenceBase(
    benchmark::State& benchmark_state,
    bool disable_launch_param_cache) {
  // Fixed problem size: a 4-D input normalized over its last dimension.
  std::vector<int64_t> input_shape{20, 100, 35, 67};
  std::vector<int64_t> normalized_shape{67};

  auto fusion_holder = std::make_unique<Fusion>();
  FusionGuard fg(fusion_holder.get());

  // Filled in by getLayerForwardNormRuntime().
  std::unique_ptr<FusionExecutorCache> executor_cache;
  std::vector<at::IValue> aten_inputs;

  auto runtime = getLayerForwardNormRuntime(
      std::move(fusion_holder),
      executor_cache,
      aten_inputs,
      input_shape,
      normalized_shape);

  // The runtime produced by the first run must have heuristics for exactly
  // these inputs.
  TORCH_INTERNAL_ASSERT(
      runtime->getMaybeHeuristicsFor(aten_inputs).has_value());

  // One untimed run with profiling on and kernel launches disabled, before
  // the launch-parameter cache is (optionally) switched off.
  executor_cache->profile(true);
  executor_cache->disableKernelLaunch();
  executor_cache->runFusionWithInputs(aten_inputs);

  if (disable_launch_param_cache) {
    executor_cache->disableLaunchParamCache();
  }

  for (auto _ : benchmark_state) {
    // Timed region: this call is what the benchmark measures.
    executor_cache->runFusionWithInputs(aten_inputs);
  }
}

// Forward baseline variant that runs the shared body with the
// launch-parameter cache left enabled.
static void LayerNormForward_NoShapeInferenceCachedBaseline(
    benchmark::State& benchmark_state) {
  constexpr bool kDisableLaunchParamCache = false;
  LayerNormForward_ShapeInferenceBase(
      benchmark_state, kDisableLaunchParamCache);
}

// Forward benchmark variant that runs the shared body with the
// launch-parameter cache disabled.
static void LayerNormForward_ShapeInference(benchmark::State& benchmark_state) {
  constexpr bool kDisableLaunchParamCache = true;
  LayerNormForward_ShapeInferenceBase(
      benchmark_state, kDisableLaunchParamCache);
}

// Register the four benchmarks with Google Benchmark. Each one reports its
// timings in microseconds. The *_ShapeInference variants run with the
// launch-parameter cache disabled; the *_NoShapeInferenceCachedBaseline
// variants leave it enabled.
BENCHMARK(LayerNormBackward_ShapeInference)->Unit(benchmark::kMicrosecond);
BENCHMARK(LayerNormForward_ShapeInference)->Unit(benchmark::kMicrosecond);
BENCHMARK(LayerNormBackward_NoShapeInferenceCachedBaseline)
    ->Unit(benchmark::kMicrosecond);
BENCHMARK(LayerNormForward_NoShapeInferenceCachedBaseline)
    ->Unit(benchmark::kMicrosecond);