#include #include #include #include #include #include #include #include #include #include #include "utils.h" using namespace torch::jit::fuser::cuda; //------------------------------------------------------------------------------ static void setupLayerNorm(Fusion* fusion, DataType dtype) { TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half); FusionGuard fg(fusion); const int kReductionAxis = 1; const float kEps = 1e-5; Double* eps_ptr = IrBuilder::create(kEps); // setup fusion auto input = makeContigTensor(2, dtype); auto weight = makeContigTensor(1, dtype); auto bias = makeContigTensor(1, dtype); fusion->addInput(input); fusion->addInput(weight); fusion->addInput(bias); if (dtype == DataType::Half) { input = castOp(DataType::Float, input); weight = castOp(DataType::Float, weight); bias = castOp(DataType::Float, bias); } auto layer_norm_results = layer_norm(input, 1, weight, bias, eps_ptr); auto output = layer_norm_results.output; if (dtype != DataType::Float) { output = castOp(dtype, output); } fusion->addOutput(output); } static void NvFuserScheduler_LayerNorm( benchmark::State& benchmark_state, FusionExecutorCache* fusion_executor_cache, DataType dtype) { TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half); std::vector input_shape{ benchmark_state.range(0), benchmark_state.range(1)}; const float kEps = 1e-5; // inputs at::manual_seed(0); auto options = at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); at::Tensor input = at::randn(input_shape, options); at::Tensor weight = at::randn({input_shape[1]}, options); at::Tensor bias = at::randn({input_shape[1]}, options); std::vector aten_inputs({input, weight, bias}); runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); benchmark_state.SetBytesProcessed( int64_t(benchmark_state.iterations()) * (2 * input.numel() + weight.numel() + bias.numel()) * int64_t(dataTypeSize(dtype))); } //------------------------------------------------------------------------------ static void Baseline_LayerNorm( benchmark::State& benchmark_state, DataType dtype) { TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half); std::vector input_shape{ benchmark_state.range(0), benchmark_state.range(1)}; const int kReductionAxis = 1; std::vector norm_shape; for (int idx = kReductionAxis; idx < input_shape.size(); ++idx) { norm_shape.push_back(input_shape[idx]); } // inputs at::manual_seed(0); auto options = at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); at::Tensor input = at::randn(input_shape, options); at::Tensor weight = at::randn({input_shape[1]}, options); at::Tensor bias = at::randn({input_shape[1]}, options); clearL2Cache(); cudaDeviceSynchronize(); for (auto _ : benchmark_state) { CudaKernelTimer timer; auto output = at::layer_norm(input, norm_shape, weight, bias); benchmark_state.SetIterationTime(timer.elapsed() / 1000.0); cudaDeviceSynchronize(); clearL2Cache(); cudaDeviceSynchronize(); } benchmark_state.SetBytesProcessed( int64_t(benchmark_state.iterations()) * (2 * input.numel() + weight.numel() + bias.numel()) * int64_t(dataTypeSize(dtype))); } static void Baseline_LayerNorm_fp32(benchmark::State& benchmark_state) { Baseline_LayerNorm(benchmark_state, DataType::Float); } static void Baseline_LayerNorm_fp16(benchmark::State& benchmark_state) { Baseline_LayerNorm(benchmark_state, DataType::Half); } //------------------------------------------------------------------------------ NVFUSER_BENCHMARK_DEFINE( NvFuserScheduler_LayerNorm_fp32, setupLayerNorm, NvFuserScheduler_LayerNorm, DataType::Float); NVFUSER_BENCHMARK_RUN(NvFuserScheduler_LayerNorm_fp32) // ->RangeMultiplier(2) ->Ranges({{160, 320}, {2, 1024 * 1024}}) ->Unit(benchmark::kMicrosecond) ->UseManualTime(); NVFUSER_BENCHMARK_RUN(NvFuserScheduler_LayerNorm_fp32) // ->RangeMultiplier(2) ->Ranges({{2, 16}, {32768, 32 * 1024 * 1024}}) ->Unit(benchmark::kMicrosecond) ->UseManualTime(); NVFUSER_BENCHMARK_RUN(NvFuserScheduler_LayerNorm_fp32) // ->RangeMultiplier(2) ->Ranges({{32768, 32 * 1024 * 1024}, {2, 16}}) ->Unit(benchmark::kMicrosecond) ->UseManualTime(); NVFUSER_BENCHMARK_RUN(NvFuserScheduler_LayerNorm_fp32) // ->RangeMultiplier(2) ->Ranges({{128, 1024 * 16}, {128, 1024 * 16}}) ->Unit(benchmark::kMicrosecond) ->UseManualTime(); NVFUSER_BENCHMARK_DEFINE( NvFuserScheduler_LayerNorm_fp16, setupLayerNorm, NvFuserScheduler_LayerNorm, DataType::Half); NVFUSER_BENCHMARK_RUN(NvFuserScheduler_LayerNorm_fp16) // ->RangeMultiplier(2) ->Ranges({{160, 320}, {2, 1024 * 1024}}) ->Unit(benchmark::kMicrosecond) ->UseManualTime(); NVFUSER_BENCHMARK_RUN(NvFuserScheduler_LayerNorm_fp16) // ->RangeMultiplier(2) ->Ranges({{2, 16}, {32768, 64 * 1024 * 1024}}) ->Unit(benchmark::kMicrosecond) ->UseManualTime(); NVFUSER_BENCHMARK_RUN(NvFuserScheduler_LayerNorm_fp16) // ->RangeMultiplier(2) ->Ranges({{32768, 64 * 1024 * 1024}, {2, 16}}) ->Unit(benchmark::kMicrosecond) ->UseManualTime(); NVFUSER_BENCHMARK_RUN(NvFuserScheduler_LayerNorm_fp16) // ->RangeMultiplier(2) ->Ranges({{128, 1024 * 16}, {128, 1024 * 16}}) ->Unit(benchmark::kMicrosecond) ->UseManualTime(); //------------------------------------------------------------------------------ BENCHMARK(Baseline_LayerNorm_fp32) // ->RangeMultiplier(2) ->Ranges({{160, 320}, {2, 1024 * 1024}}) ->Unit(benchmark::kMicrosecond) ->UseManualTime(); BENCHMARK(Baseline_LayerNorm_fp32) // ->RangeMultiplier(2) ->Ranges({{2, 16}, {32768, 32 * 1024 * 1024}}) ->Unit(benchmark::kMicrosecond) ->UseManualTime(); BENCHMARK(Baseline_LayerNorm_fp32) // ->RangeMultiplier(2) ->Ranges({{32768, 32 * 1024 * 1024}, {2, 16}}) ->Unit(benchmark::kMicrosecond) ->UseManualTime(); BENCHMARK(Baseline_LayerNorm_fp32) // ->RangeMultiplier(2) ->Ranges({{128, 1024 * 16}, {128, 1024 * 16}}) ->Unit(benchmark::kMicrosecond) ->UseManualTime(); BENCHMARK(Baseline_LayerNorm_fp16) // ->RangeMultiplier(2) ->Ranges({{160, 320}, {2, 1024 * 1024}}) ->Unit(benchmark::kMicrosecond) ->UseManualTime(); BENCHMARK(Baseline_LayerNorm_fp16) // ->RangeMultiplier(2) ->Ranges({{2, 16}, {32768, 64 * 1024 * 1024}}) ->Unit(benchmark::kMicrosecond) ->UseManualTime(); BENCHMARK(Baseline_LayerNorm_fp16) // ->RangeMultiplier(2) ->Ranges({{32768, 64 * 1024 * 1024}, {2, 16}}) ->Unit(benchmark::kMicrosecond) ->UseManualTime(); BENCHMARK(Baseline_LayerNorm_fp16) // ->RangeMultiplier(2) ->Ranges({{128, 1024 * 16}, {128, 1024 * 16}}) ->Unit(benchmark::kMicrosecond) ->UseManualTime();