#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>

#include <benchmark/benchmark.h>

#include <cuda_runtime.h>

#include <benchmarks/cpp/nvfuser/utils.h>

using namespace torch::jit::fuser::cuda;

// Builds a scale-bias-relu (SBR) fusion: y = relu(x * scale + bias), where
// scale and bias are per-channel tensors broadcast over N, H, W.
static void setupSBR(Fusion* fusion, DataType dtype) {
  TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);

  FusionGuard fg(fusion);

  const size_t kNumberOfDims = 4;
  // Broadcast shape {1, 1, 1, -1}: only the channel extent is symbolic.
  std::vector<int64_t> bcast_shape(kNumberOfDims, 1);
  bcast_shape[bcast_shape.size() - 1] = -1;

  std::vector<bool> bcast_contig(kNumberOfDims, false);
  bcast_contig[bcast_contig.size() - 1] = true;

  auto x = makeContigTensor(kNumberOfDims, dtype);

  auto scale = TensorViewBuilder()
                   .contiguity(bcast_contig)
                   .shape(bcast_shape)
                   .dtype(dtype)
                   .build();

  auto bias = TensorViewBuilder()
                  .contiguity(bcast_contig)
                  .shape(bcast_shape)
                  .dtype(dtype)
                  .build();

  fusion->addInput(x);
  fusion->addInput(scale);
  fusion->addInput(bias);

  // Compute in fp32 even when the inputs are fp16.
  if (dtype == DataType::Half) {
    x = castOp(DataType::Float, x);
    scale = castOp(DataType::Float, scale);
    bias = castOp(DataType::Float, bias);
  }

  auto scale_bias = add(mul(x, scale), bias);
  auto scale_bias_relu = unaryOp(UnaryOpType::Relu, scale_bias);

  if (dtype == DataType::Half) {
    scale_bias_relu = castOp(DataType::Half, scale_bias_relu);
  }

  fusion->addOutput(scale_bias_relu);
}
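// For reference, the fusion above is numerically equivalent to the following
// eager-mode ATen sketch (a hypothetical helper shown for illustration only;
// it is not part of the benchmark):
//
//   at::Tensor sbr_reference(
//       const at::Tensor& x, // {N, H, W, C}
//       const at::Tensor& scale, // {1, 1, 1, C}
//       const at::Tensor& bias) { // {1, 1, 1, C}
//     return at::relu(at::add(at::mul(x, scale), bias));
//   }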
// Builds the normalization variant: a per-channel scale and bias are first
// derived from batch-norm-style statistics (mean, var) and affine parameters
// (weight, bias), then broadcast and applied as y = relu(x * s + b).
static void setupSBRNorm(Fusion* fusion, DataType dtype) {
  TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);

  FusionGuard fg(fusion);

  const size_t kNumberOfDims = 4;

  auto x = makeContigTensor(kNumberOfDims, dtype);
  auto weight = makeContigTensor(1, dtype);
  auto bias = makeContigTensor(1, dtype);
  auto mean = makeContigTensor(1, dtype);
  auto var = makeContigTensor(1, dtype);

  fusion->addInput(x);
  fusion->addInput(weight);
  fusion->addInput(bias);
  fusion->addInput(mean);
  fusion->addInput(var);

  // Broadcast the 1-D channel tensors over all but the innermost dimension.
  std::vector<bool> broadcast_mask(kNumberOfDims, true);
  broadcast_mask[broadcast_mask.size() - 1] = false;

  if (dtype == DataType::Half) {
    x = castOp(DataType::Float, x);
    weight = castOp(DataType::Float, weight);
    bias = castOp(DataType::Float, bias);
    mean = castOp(DataType::Float, mean);
    var = castOp(DataType::Float, var);
  }

  // Fold the statistics into a single per-channel scale and bias:
  // s = weight / sqrt(var), b = (bias - mean) * s.
  auto rsqrt = unaryOp(UnaryOpType::Rsqrt, var);
  auto this_scale = mul(weight, rsqrt);
  auto this_bias = mul(sub(bias, mean), this_scale);

  auto bcast_scale = broadcast(this_scale, broadcast_mask);
  auto bcast_bias = broadcast(this_bias, broadcast_mask);

  auto scale_bias = add(mul(x, bcast_scale), bcast_bias);
  auto scale_bias_relu = unaryOp(UnaryOpType::Relu, scale_bias);

  if (dtype == DataType::Half) {
    scale_bias_relu = castOp(DataType::Half, scale_bias_relu);
  }

  fusion->addOutput(scale_bias_relu);
}

//------------------------------------------------------------------------------

static void NvFuserScheduler_SBR(
    benchmark::State& benchmark_state,
    FusionExecutorCache* fusion_executor_cache,
    DataType dtype) {
  // N, H, W, C format
  std::vector<int64_t> input_shape{
      benchmark_state.range(0),
      benchmark_state.range(1),
      benchmark_state.range(1),
      benchmark_state.range(2)};
  std::vector<int64_t> bcast_shape{1, 1, 1, -1};

  // inputs
  at::manual_seed(0);
  std::vector<int64_t> static_bcast_shape{1, 1, 1, benchmark_state.range(2)};
  auto options =
      at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
  at::Tensor at_x = at::randn(input_shape, options);
  at::Tensor at_scale = at::ones(static_bcast_shape, options);
  at::Tensor at_bias = at::zeros(static_bcast_shape, options);

  // inputs
  std::vector<c10::IValue> aten_inputs = {at_x, at_scale, at_bias};

  // Warm-up run with profiling enabled so the compiled scheduler and launch
  // parameters can be reported in the benchmark label.
  fusion_executor_cache->profile(true);
  fusion_executor_cache->runFusionWithInputs(aten_inputs);

  auto compile_log = fusion_executor_cache->getMostRecentExecutorInfo();
  auto executor_instance = compile_log.fusion_executor;
  auto params = toString(compile_log.params);
  auto lparams = toString(compile_log.fusion_executor->lastLaunchParams());
  benchmark_state.SetLabel(params + lparams);

  fusion_executor_cache->profile(false);
  executor_instance->setMeasureKernelTimeFlag(true);

  // Sync everything up before we start.
  cudaDeviceSynchronize();

  for (auto _ : benchmark_state) {
    clearL2Cache();
    auto cg_outputs = fusion_executor_cache->runFusionWithInputs(aten_inputs);
    benchmark_state.SetIterationTime(
        executor_instance->kernelTimeMs() / 1000.0);
  }
  // Sync everything up before we're finished; don't want to run ahead on the
  // cpu while benchmarking.
  cudaDeviceSynchronize();

  const size_t size =
      input_shape[0] * input_shape[1] * input_shape[2] * input_shape[3];
  const size_t channels = input_shape[3];

  benchmark_state.SetBytesProcessed(
      int64_t(benchmark_state.iterations()) * (channels * 2 + size * 2) *
      int64_t(dataTypeSize(dtype)));
}
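// A note on the SetBytesProcessed model used above and in the baselines: per
// iteration the kernel reads the {N, H, W, C} input once (size elements) and
// each {1, 1, 1, C} input once (channels elements apiece), and writes one
// {N, H, W, C} output, giving (2 * size + k * channels) elements of traffic,
// with k = 2 inputs here and k = 4 for the Norm variants below. As a worked
// example, for {8, 640, 640, 64} in fp16: size = 8 * 640 * 640 * 64 =
// 209,715,200 elements, so (2 * 209,715,200 + 2 * 64) * 2 bytes, i.e. about
// 0.84 GB of traffic per iteration.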
static void Baseline_SBR(benchmark::State& benchmark_state, DataType dtype) {
  // N, H, W, C format
  std::vector<int64_t> input_shape{
      benchmark_state.range(0),
      benchmark_state.range(1),
      benchmark_state.range(1),
      benchmark_state.range(2)};
  std::vector<int64_t> bcast_shape{benchmark_state.range(2)};

  // inputs
  at::manual_seed(0);
  auto options =
      at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
  at::Tensor at_x = at::randn(input_shape, options);
  at::Tensor at_y = at::randn(input_shape, options);
  at::Tensor at_scale = at::ones(bcast_shape, options);
  at::Tensor at_bias = at::zeros(bcast_shape, options);

  clearL2Cache();
  cudaDeviceSynchronize();

  for (auto _ : benchmark_state) {
    CudaKernelTimer timer;
    // Unfused eager-mode reference: three separate kernels.
    auto scale = at::mul(at_x, at_scale);
    auto bias = at::add(scale, at_bias);
    auto output = at::relu(bias);
    benchmark_state.SetIterationTime(timer.elapsed() / 1000.0);
    cudaDeviceSynchronize();
    clearL2Cache();
    cudaDeviceSynchronize();
  }

  const size_t size =
      input_shape[0] * input_shape[1] * input_shape[2] * input_shape[3];
  const size_t channels = input_shape[3];

  benchmark_state.SetBytesProcessed(
      int64_t(benchmark_state.iterations()) * (channels * 2 + size * 2) *
      int64_t(dataTypeSize(dtype)));
}

//------------------------------------------------------------------------------

static void NvFuserScheduler_SBR_Norm(
    benchmark::State& benchmark_state,
    FusionExecutorCache* fusion_executor_cache,
    DataType dtype) {
  // N, H, W, C format
  std::vector<int64_t> input_shape{
      benchmark_state.range(0),
      benchmark_state.range(1),
      benchmark_state.range(1),
      benchmark_state.range(2)};
  std::vector<int64_t> bcast_shape{benchmark_state.range(2)};

  // inputs
  at::manual_seed(0);
  auto options =
      at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
  at::Tensor at_x = at::randn(input_shape, options);
  at::Tensor at_weight = at::ones(bcast_shape, options);
  at::Tensor at_bias = at::zeros(bcast_shape, options);
  at::Tensor at_mean = at::zeros(bcast_shape, options);
  at::Tensor at_var = at::ones(bcast_shape, options);

  // inputs
  std::vector<c10::IValue> aten_inputs = {
      at_x, at_weight, at_bias, at_mean, at_var};

  fusion_executor_cache->profile(true);
  fusion_executor_cache->runFusionWithInputs(aten_inputs);

  auto compile_log = fusion_executor_cache->getMostRecentExecutorInfo();
  auto executor_instance = compile_log.fusion_executor;
  auto params = toString(compile_log.params);
  auto lparams = toString(compile_log.fusion_executor->lastLaunchParams());
  benchmark_state.SetLabel(params + lparams);

  fusion_executor_cache->profile(false);
  executor_instance->setMeasureKernelTimeFlag(true);

  // Sync everything up before we start.
  cudaDeviceSynchronize();

  for (auto _ : benchmark_state) {
    clearL2Cache();
    auto cg_outputs = fusion_executor_cache->runFusionWithInputs(aten_inputs);
    benchmark_state.SetIterationTime(
        executor_instance->kernelTimeMs() / 1000.0);
  }
  // Sync everything up before we're finished; don't want to run ahead on the
  // cpu while benchmarking.
  cudaDeviceSynchronize();

  const size_t size =
      input_shape[0] * input_shape[1] * input_shape[2] * input_shape[3];
  const size_t channels = input_shape[3];

  benchmark_state.SetBytesProcessed(
      int64_t(benchmark_state.iterations()) * (channels * 4 + size * 2) *
      int64_t(dataTypeSize(dtype)));
}

static void Baseline_SBR_Norm(
    benchmark::State& benchmark_state,
    DataType dtype) {
  // N, H, W, C format
  std::vector<int64_t> input_shape{
      benchmark_state.range(0),
      benchmark_state.range(1),
      benchmark_state.range(1),
      benchmark_state.range(2)};
  std::vector<int64_t> bcast_shape{1, 1, 1, benchmark_state.range(2)};

  // inputs
  at::manual_seed(0);
  auto options =
      at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
  at::Tensor at_x = at::randn(input_shape, options);
  at::Tensor at_weight = at::ones(bcast_shape, options);
  at::Tensor at_bias = at::zeros(bcast_shape, options);
  at::Tensor at_mean = at::zeros(bcast_shape, options);
  at::Tensor at_var = at::ones(bcast_shape, options);

  cudaDeviceSynchronize();

  for (auto _ : benchmark_state) {
    CudaKernelTimer timer;
    // Unfused eager-mode reference for the Norm variant.
    auto this_scale = at::mul(at_weight, at::rsqrt(at_var));
    auto this_bias = at::mul(at::sub(at_bias, at_mean), this_scale);
    auto scale = at::mul(at_x, this_scale);
    auto bias = at::add(scale, this_bias);
    auto output = at::relu(bias);
    benchmark_state.SetIterationTime(timer.elapsed() / 1000.0);
    cudaDeviceSynchronize();
  }

  const size_t size =
      input_shape[0] * input_shape[1] * input_shape[2] * input_shape[3];
  const size_t channels = input_shape[3];

  benchmark_state.SetBytesProcessed(
      int64_t(benchmark_state.iterations()) * (channels * 4 + size * 2) *
      int64_t(dataTypeSize(dtype)));
}
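// The Ranges below expand as {N, H, C}: range(0) = N = 8, range(1) = 640 is
// used for both H and W, and range(2) = C sweeps {64, 128}. Each benchmark
// therefore runs on {8, 640, 640, 64} and {8, 640, 640, 128} inputs.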
//------------------------------------------------------------------------------

NVFUSER_BENCHMARK_DEFINE(
    NvFuserScheduler_SBR_fp32,
    setupSBR,
    NvFuserScheduler_SBR,
    DataType::Float);

NVFUSER_BENCHMARK_RUN(NvFuserScheduler_SBR_fp32)
    // ->RangeMultiplier(2)
    ->Ranges({{8, 8}, {640, 640}, {64, 128}})
    ->Unit(benchmark::kMicrosecond)
    ->UseManualTime();

NVFUSER_BENCHMARK_DEFINE(
    NvFuserScheduler_SBR_fp16,
    setupSBR,
    NvFuserScheduler_SBR,
    DataType::Half);

NVFUSER_BENCHMARK_RUN(NvFuserScheduler_SBR_fp16)
    // ->RangeMultiplier(2)
    ->Ranges({{8, 8}, {640, 640}, {64, 128}})
    ->Unit(benchmark::kMicrosecond)
    ->UseManualTime();

//------------------------------------------------------------------------------

NVFUSER_BENCHMARK_DEFINE(
    NvFuserScheduler_SBR_Norm_fp32,
    setupSBRNorm,
    NvFuserScheduler_SBR_Norm,
    DataType::Float);

NVFUSER_BENCHMARK_RUN(NvFuserScheduler_SBR_Norm_fp32)
    // ->RangeMultiplier(2)
    ->Ranges({{8, 8}, {640, 640}, {64, 128}})
    ->Unit(benchmark::kMicrosecond)
    ->UseManualTime();

NVFUSER_BENCHMARK_DEFINE(
    NvFuserScheduler_SBR_Norm_fp16,
    setupSBRNorm,
    NvFuserScheduler_SBR_Norm,
    DataType::Half);

NVFUSER_BENCHMARK_RUN(NvFuserScheduler_SBR_Norm_fp16)
    // ->RangeMultiplier(2)
    ->Ranges({{8, 8}, {640, 640}, {64, 128}})
    ->Unit(benchmark::kMicrosecond)
    ->UseManualTime();

//------------------------------------------------------------------------------

static void Baseline_SBR_fp32(benchmark::State& benchmark_state) {
  Baseline_SBR(benchmark_state, DataType::Float);
}

BENCHMARK(Baseline_SBR_fp32)
    // ->RangeMultiplier(2)
    ->Ranges({{8, 8}, {640, 640}, {64, 128}})
    ->Unit(benchmark::kMicrosecond)
    ->UseManualTime();

static void Baseline_SBR_fp16(benchmark::State& benchmark_state) {
  Baseline_SBR(benchmark_state, DataType::Half);
}

BENCHMARK(Baseline_SBR_fp16)
    // ->RangeMultiplier(2)
    ->Ranges({{8, 8}, {640, 640}, {64, 128}})
    ->Unit(benchmark::kMicrosecond)
    ->UseManualTime();

//------------------------------------------------------------------------------

static void Baseline_SBR_Norm_fp32(benchmark::State& benchmark_state) {
  Baseline_SBR_Norm(benchmark_state, DataType::Float);
}

BENCHMARK(Baseline_SBR_Norm_fp32)
    // ->RangeMultiplier(2)
    ->Ranges({{8, 8}, {640, 640}, {64, 128}})
    ->Unit(benchmark::kMicrosecond)
    ->UseManualTime();

static void Baseline_SBR_Norm_fp16(benchmark::State& benchmark_state) {
  Baseline_SBR_Norm(benchmark_state, DataType::Half);
}

BENCHMARK(Baseline_SBR_Norm_fp16)
    // ->RangeMultiplier(2)
    ->Ranges({{8, 8}, {640, 640}, {64, 128}})
    ->Unit(benchmark::kMicrosecond)
    ->UseManualTime();
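// Assuming these registrations are linked into the usual nvFuser benchmark
// binary (the name below is illustrative), individual cases can be selected
// with the standard Google Benchmark flags, e.g.:
//
//   ./nvfuser_bench --benchmark_filter='NvFuserScheduler_SBR_fp16' \
//       --benchmark_repetitions=3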