#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>

#include <benchmark/benchmark.h>

#include <cuda_runtime.h>

#include <benchmarks/cpp/nvfuser/utils.h>

using namespace torch::jit::fuser::cuda;

// Builds a scale-bias-relu (SBR) fusion: y = relu(x * scale + bias), where
// scale and bias are per-channel tensors broadcast over N, H, W.
static void setupSBR(Fusion* fusion, DataType dtype) {
  TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);

  FusionGuard fg(fusion);

  const size_t kNumberOfDims = 4;
  // Broadcast shape {1, 1, 1, -1}: only the channel extent is symbolic.
  std::vector<int64_t> bcast_shape(kNumberOfDims, 1);
  bcast_shape[bcast_shape.size() - 1] = -1;

  std::vector<bool> bcast_contig(kNumberOfDims, false);
  bcast_contig[bcast_contig.size() - 1] = true;

  auto x = makeContigTensor(kNumberOfDims, dtype);

  auto scale = TensorViewBuilder()
                   .contiguity(bcast_contig)
                   .shape(bcast_shape)
                   .dtype(dtype)
                   .build();

  auto bias = TensorViewBuilder()
                  .contiguity(bcast_contig)
                  .shape(bcast_shape)
                  .dtype(dtype)
                  .build();

  fusion->addInput(x);
  fusion->addInput(scale);
  fusion->addInput(bias);

  // Compute in fp32 even when the inputs are fp16.
  if (dtype == DataType::Half) {
    x = castOp(DataType::Float, x);
    scale = castOp(DataType::Float, scale);
    bias = castOp(DataType::Float, bias);
  }

  auto scale_bias = add(mul(x, scale), bias);
  auto scale_bias_relu = unaryOp(UnaryOpType::Relu, scale_bias);

  if (dtype == DataType::Half) {
    scale_bias_relu = castOp(DataType::Half, scale_bias_relu);
  }

  fusion->addOutput(scale_bias_relu);
}
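// For reference, the fusion above is numerically equivalent to the following
// eager-mode ATen sketch (a hypothetical helper shown for illustration only;
// it is not part of the benchmark):
//
//   at::Tensor sbr_reference(
//       const at::Tensor& x, // {N, H, W, C}
//       const at::Tensor& scale, // {1, 1, 1, C}
//       const at::Tensor& bias) { // {1, 1, 1, C}
//     return at::relu(at::add(at::mul(x, scale), bias));
//   }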
// Builds the normalization variant: a per-channel scale and bias are first
// derived from batch-norm-style statistics (mean, var) and affine parameters
// (weight, bias), then broadcast and applied as y = relu(x * s + b).
static void setupSBRNorm(Fusion* fusion, DataType dtype) {
  TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);

  FusionGuard fg(fusion);

  const size_t kNumberOfDims = 4;

  auto x = makeContigTensor(kNumberOfDims, dtype);
  auto weight = makeContigTensor(1, dtype);
  auto bias = makeContigTensor(1, dtype);
  auto mean = makeContigTensor(1, dtype);
  auto var = makeContigTensor(1, dtype);

  fusion->addInput(x);
  fusion->addInput(weight);
  fusion->addInput(bias);
  fusion->addInput(mean);
  fusion->addInput(var);

  // Broadcast the 1-D channel tensors over all but the innermost dimension.
  std::vector<bool> broadcast_mask(kNumberOfDims, true);
  broadcast_mask[broadcast_mask.size() - 1] = false;

  if (dtype == DataType::Half) {
    x = castOp(DataType::Float, x);
    weight = castOp(DataType::Float, weight);
    bias = castOp(DataType::Float, bias);
    mean = castOp(DataType::Float, mean);
    var = castOp(DataType::Float, var);
  }

  // Fold the statistics into a single per-channel scale and bias:
  // s = weight / sqrt(var), b = (bias - mean) * s.
  auto rsqrt = unaryOp(UnaryOpType::Rsqrt, var);
  auto this_scale = mul(weight, rsqrt);
  auto this_bias = mul(sub(bias, mean), this_scale);

  auto bcast_scale = broadcast(this_scale, broadcast_mask);
  auto bcast_bias = broadcast(this_bias, broadcast_mask);

  auto scale_bias = add(mul(x, bcast_scale), bcast_bias);
  auto scale_bias_relu = unaryOp(UnaryOpType::Relu, scale_bias);

  if (dtype == DataType::Half) {
    scale_bias_relu = castOp(DataType::Half, scale_bias_relu);
  }

  fusion->addOutput(scale_bias_relu);
}

//------------------------------------------------------------------------------

static void NvFuserScheduler_SBR(
    benchmark::State& benchmark_state,
    FusionExecutorCache* fusion_executor_cache,
    DataType dtype) {
  // N, H, W, C format
  std::vector<int64_t> input_shape{
      benchmark_state.range(0),
      benchmark_state.range(1),
      benchmark_state.range(1),
      benchmark_state.range(2)};
  std::vector<int64_t> bcast_shape{1, 1, 1, -1};

  // inputs
  at::manual_seed(0);
  std::vector<int64_t> static_bcast_shape{1, 1, 1, benchmark_state.range(2)};
  auto options =
      at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
  at::Tensor at_x = at::randn(input_shape, options);
  at::Tensor at_scale = at::ones(static_bcast_shape, options);
  at::Tensor at_bias = at::zeros(static_bcast_shape, options);

  // inputs
  std::vector<c10::IValue> aten_inputs = {at_x, at_scale, at_bias};

  // Warm-up run with profiling enabled so the compiled scheduler and launch
  // parameters can be reported in the benchmark label.
  fusion_executor_cache->profile(true);
  fusion_executor_cache->runFusionWithInputs(aten_inputs);

  auto compile_log = fusion_executor_cache->getMostRecentExecutorInfo();
  auto executor_instance = compile_log.fusion_executor;
  auto params = toString(compile_log.params);
  auto lparams = toString(compile_log.fusion_executor->lastLaunchParams());
  benchmark_state.SetLabel(params + lparams);

  fusion_executor_cache->profile(false);
  executor_instance->setMeasureKernelTimeFlag(true);

  // Sync everything up before we start.
  cudaDeviceSynchronize();

  for (auto _ : benchmark_state) {
    clearL2Cache();
    auto cg_outputs = fusion_executor_cache->runFusionWithInputs(aten_inputs);
    benchmark_state.SetIterationTime(
        executor_instance->kernelTimeMs() / 1000.0);
  }
  // Sync everything up before we're finished; don't want to run ahead on the
  // cpu while benchmarking.
  cudaDeviceSynchronize();

  const size_t size =
      input_shape[0] * input_shape[1] * input_shape[2] * input_shape[3];
  const size_t channels = input_shape[3];

  benchmark_state.SetBytesProcessed(
      int64_t(benchmark_state.iterations()) * (channels * 2 + size * 2) *
      int64_t(dataTypeSize(dtype)));
}
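// A note on the SetBytesProcessed model used above and in the baselines: per
// iteration the kernel reads the {N, H, W, C} input once (size elements) and
// each {1, 1, 1, C} input once (channels elements apiece), and writes one
// {N, H, W, C} output, giving (2 * size + k * channels) elements of traffic,
// with k = 2 inputs here and k = 4 for the Norm variants below. As a worked
// example, for {8, 640, 640, 64} in fp16: size = 8 * 640 * 640 * 64 =
// 209,715,200 elements, so (2 * 209,715,200 + 2 * 64) * 2 bytes, i.e. about
// 0.84 GB of traffic per iteration.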
static void Baseline_SBR(benchmark::State& benchmark_state, DataType dtype) {
  // N, H, W, C format
  std::vector<int64_t> input_shape{
      benchmark_state.range(0),
      benchmark_state.range(1),
      benchmark_state.range(1),
      benchmark_state.range(2)};
  std::vector<int64_t> bcast_shape{benchmark_state.range(2)};

  // inputs
  at::manual_seed(0);
  auto options =
      at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
  at::Tensor at_x = at::randn(input_shape, options);
  at::Tensor at_y = at::randn(input_shape, options);
  at::Tensor at_scale = at::ones(bcast_shape, options);
  at::Tensor at_bias = at::zeros(bcast_shape, options);

  clearL2Cache();
  cudaDeviceSynchronize();

  for (auto _ : benchmark_state) {
    CudaKernelTimer timer;
    // Unfused eager-mode reference: three separate kernels.
    auto scale = at::mul(at_x, at_scale);
    auto bias = at::add(scale, at_bias);
    auto output = at::relu(bias);
    benchmark_state.SetIterationTime(timer.elapsed() / 1000.0);
    cudaDeviceSynchronize();
    clearL2Cache();
    cudaDeviceSynchronize();
  }

  const size_t size =
      input_shape[0] * input_shape[1] * input_shape[2] * input_shape[3];
  const size_t channels = input_shape[3];

  benchmark_state.SetBytesProcessed(
      int64_t(benchmark_state.iterations()) * (channels * 2 + size * 2) *
      int64_t(dataTypeSize(dtype)));
}

//------------------------------------------------------------------------------

static void NvFuserScheduler_SBR_Norm(
    benchmark::State& benchmark_state,
    FusionExecutorCache* fusion_executor_cache,
    DataType dtype) {
  // N, H, W, C format
  std::vector<int64_t> input_shape{
      benchmark_state.range(0),
      benchmark_state.range(1),
      benchmark_state.range(1),
      benchmark_state.range(2)};
  std::vector<int64_t> bcast_shape{benchmark_state.range(2)};

  // inputs
  at::manual_seed(0);
  auto options =
      at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
  at::Tensor at_x = at::randn(input_shape, options);
  at::Tensor at_weight = at::ones(bcast_shape, options);
  at::Tensor at_bias = at::zeros(bcast_shape, options);
  at::Tensor at_mean = at::zeros(bcast_shape, options);
  at::Tensor at_var = at::ones(bcast_shape, options);

  // inputs
  std::vector<c10::IValue> aten_inputs = {
      at_x, at_weight, at_bias, at_mean, at_var};

  fusion_executor_cache->profile(true);
  fusion_executor_cache->runFusionWithInputs(aten_inputs);

  auto compile_log = fusion_executor_cache->getMostRecentExecutorInfo();
  auto executor_instance = compile_log.fusion_executor;
  auto params = toString(compile_log.params);
  auto lparams = toString(compile_log.fusion_executor->lastLaunchParams());
  benchmark_state.SetLabel(params + lparams);

  fusion_executor_cache->profile(false);
  executor_instance->setMeasureKernelTimeFlag(true);

  // Sync everything up before we start.
  cudaDeviceSynchronize();

  for (auto _ : benchmark_state) {
    clearL2Cache();
    auto cg_outputs = fusion_executor_cache->runFusionWithInputs(aten_inputs);
    benchmark_state.SetIterationTime(
        executor_instance->kernelTimeMs() / 1000.0);
  }
  // Sync everything up before we're finished; don't want to run ahead on the
  // cpu while benchmarking.
  cudaDeviceSynchronize();

  const size_t size =
      input_shape[0] * input_shape[1] * input_shape[2] * input_shape[3];
  const size_t channels = input_shape[3];

  benchmark_state.SetBytesProcessed(
      int64_t(benchmark_state.iterations()) * (channels * 4 + size * 2) *
      int64_t(dataTypeSize(dtype)));
}

static void Baseline_SBR_Norm(
    benchmark::State& benchmark_state,
    DataType dtype) {
  // N, H, W, C format
  std::vector<int64_t> input_shape{
      benchmark_state.range(0),
      benchmark_state.range(1),
      benchmark_state.range(1),
      benchmark_state.range(2)};
  std::vector<int64_t> bcast_shape{1, 1, 1, benchmark_state.range(2)};

  // inputs
  at::manual_seed(0);
  auto options =
      at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
  at::Tensor at_x = at::randn(input_shape, options);
  at::Tensor at_weight = at::ones(bcast_shape, options);
  at::Tensor at_bias = at::zeros(bcast_shape, options);
  at::Tensor at_mean = at::zeros(bcast_shape, options);
  at::Tensor at_var = at::ones(bcast_shape, options);

  cudaDeviceSynchronize();

  for (auto _ : benchmark_state) {
    CudaKernelTimer timer;
    // Unfused eager-mode reference for the Norm variant.
    auto this_scale = at::mul(at_weight, at::rsqrt(at_var));
    auto this_bias = at::mul(at::sub(at_bias, at_mean), this_scale);
    auto scale = at::mul(at_x, this_scale);
    auto bias = at::add(scale, this_bias);
    auto output = at::relu(bias);
    benchmark_state.SetIterationTime(timer.elapsed() / 1000.0);
    cudaDeviceSynchronize();
  }

  const size_t size =
      input_shape[0] * input_shape[1] * input_shape[2] * input_shape[3];
  const size_t channels = input_shape[3];

  benchmark_state.SetBytesProcessed(
      int64_t(benchmark_state.iterations()) * (channels * 4 + size * 2) *
      int64_t(dataTypeSize(dtype)));
}
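// The Ranges below expand as {N, H, C}: range(0) = N = 8, range(1) = 640 is
// used for both H and W, and range(2) = C sweeps {64, 128}. Each benchmark
// therefore runs on {8, 640, 640, 64} and {8, 640, 640, 128} inputs.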
//------------------------------------------------------------------------------

NVFUSER_BENCHMARK_DEFINE(
    NvFuserScheduler_SBR_fp32,
    setupSBR,
    NvFuserScheduler_SBR,
    DataType::Float);

NVFUSER_BENCHMARK_RUN(NvFuserScheduler_SBR_fp32)
    // ->RangeMultiplier(2)
    ->Ranges({{8, 8}, {640, 640}, {64, 128}})
    ->Unit(benchmark::kMicrosecond)
    ->UseManualTime();

NVFUSER_BENCHMARK_DEFINE(
    NvFuserScheduler_SBR_fp16,
    setupSBR,
    NvFuserScheduler_SBR,
    DataType::Half);

NVFUSER_BENCHMARK_RUN(NvFuserScheduler_SBR_fp16)
    // ->RangeMultiplier(2)
    ->Ranges({{8, 8}, {640, 640}, {64, 128}})
    ->Unit(benchmark::kMicrosecond)
    ->UseManualTime();

//------------------------------------------------------------------------------

NVFUSER_BENCHMARK_DEFINE(
    NvFuserScheduler_SBR_Norm_fp32,
    setupSBRNorm,
    NvFuserScheduler_SBR_Norm,
    DataType::Float);

NVFUSER_BENCHMARK_RUN(NvFuserScheduler_SBR_Norm_fp32)
    // ->RangeMultiplier(2)
    ->Ranges({{8, 8}, {640, 640}, {64, 128}})
    ->Unit(benchmark::kMicrosecond)
    ->UseManualTime();

NVFUSER_BENCHMARK_DEFINE(
    NvFuserScheduler_SBR_Norm_fp16,
    setupSBRNorm,
    NvFuserScheduler_SBR_Norm,
    DataType::Half);

NVFUSER_BENCHMARK_RUN(NvFuserScheduler_SBR_Norm_fp16)
    // ->RangeMultiplier(2)
    ->Ranges({{8, 8}, {640, 640}, {64, 128}})
    ->Unit(benchmark::kMicrosecond)
    ->UseManualTime();

//------------------------------------------------------------------------------

static void Baseline_SBR_fp32(benchmark::State& benchmark_state) {
  Baseline_SBR(benchmark_state, DataType::Float);
}

BENCHMARK(Baseline_SBR_fp32)
    // ->RangeMultiplier(2)
    ->Ranges({{8, 8}, {640, 640}, {64, 128}})
    ->Unit(benchmark::kMicrosecond)
    ->UseManualTime();

static void Baseline_SBR_fp16(benchmark::State& benchmark_state) {
  Baseline_SBR(benchmark_state, DataType::Half);
}

BENCHMARK(Baseline_SBR_fp16)
    // ->RangeMultiplier(2)
    ->Ranges({{8, 8}, {640, 640}, {64, 128}})
    ->Unit(benchmark::kMicrosecond)
    ->UseManualTime();

//------------------------------------------------------------------------------

static void Baseline_SBR_Norm_fp32(benchmark::State& benchmark_state) {
  Baseline_SBR_Norm(benchmark_state, DataType::Float);
}

BENCHMARK(Baseline_SBR_Norm_fp32)
    // ->RangeMultiplier(2)
    ->Ranges({{8, 8}, {640, 640}, {64, 128}})
    ->Unit(benchmark::kMicrosecond)
    ->UseManualTime();

static void Baseline_SBR_Norm_fp16(benchmark::State& benchmark_state) {
  Baseline_SBR_Norm(benchmark_state, DataType::Half);
}

BENCHMARK(Baseline_SBR_Norm_fp16)
    // ->RangeMultiplier(2)
    ->Ranges({{8, 8}, {640, 640}, {64, 128}})
    ->Unit(benchmark::kMicrosecond)
    ->UseManualTime();
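// Assuming these registrations are linked into the usual nvFuser benchmark
// binary (the name below is illustrative), individual cases can be selected
// with the standard Google Benchmark flags, e.g.:
//
//   ./nvfuser_bench --benchmark_filter='NvFuserScheduler_SBR_fp16' \
//       --benchmark_repetitions=3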