#include #include #include #include #include #include #include #include #include #include #include "utils.h" using namespace torch::jit::fuser::cuda; namespace { // Make a tensor that is known to be non-contiguous of dimensionality=ndims, // but unknown sizes TensorView* makeSymbolicTensor(size_t ndims, DataType dtype = DataType::Float) { return TensorViewBuilder().ndims(ndims).dtype(dtype).build(); } // Make a non-contiguous tensor of compile-time known sizes TensorView* makeConcreteTensor( std::vector shape, DataType dtype = DataType::Float) { return TensorViewBuilder().shape(shape).dtype(dtype).build(); } } // namespace static auto getLayerBackwardNormRuntime( std::unique_ptr fusion_ptr, std::unique_ptr& fec, std::vector& aten_inputs, std::vector& shape, std::vector& norm_shape) { Fusion& fusion = *fusion_ptr.get(); const size_t kM = shape.size(); const size_t kN = norm_shape.size(); const size_t kOuterNumDims = kM - kN; std::vector outer_shape; for (size_t idx = 0; idx < kOuterNumDims; ++idx) { outer_shape.push_back(shape[idx]); } for (size_t idx = kOuterNumDims; idx < kM; ++idx) { outer_shape.push_back(1); } auto grad_out = makeSymbolicTensor(shape.size()); auto input = makeSymbolicTensor(shape.size()); auto mean = makeConcreteTensor(outer_shape); auto rstd = makeConcreteTensor(outer_shape); auto weight = makeSymbolicTensor(norm_shape.size()); auto bias = makeSymbolicTensor(norm_shape.size()); fusion.addInput(grad_out); fusion.addInput(input); fusion.addInput(mean); fusion.addInput(rstd); fusion.addInput(weight); fusion.addInput(bias); auto grads = layer_norm_backward( grad_out, input, norm_shape, mean, rstd, weight, bias, {true, true, true}); fusion.addOutput(grads.grad_input); fusion.addOutput(grads.grad_weight); fusion.addOutput(grads.grad_bias); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor aten_grad_out = at::randn(shape, options); at::Tensor aten_input = at::randn(shape, options); at::Tensor aten_weight = at::randn(norm_shape, options); at::Tensor aten_bias = at::randn(norm_shape, options); auto at_weight = c10::optional(aten_weight); auto at_bias = c10::optional(aten_bias); const float kEps = 1e-5; auto aten_results = at::native_layer_norm(aten_input, norm_shape, at_weight, at_bias, kEps); auto aten_output = std::get<0>(aten_results); auto aten_mean = std::get<1>(aten_results); auto aten_rstd = std::get<2>(aten_results); fec = std::make_unique(std::move(fusion_ptr)); aten_inputs = { aten_grad_out, aten_input, aten_mean, aten_rstd, aten_weight, aten_bias}; auto cg_outputs = fec->runFusionWithInputs(aten_inputs); return fec->getMostRecentKernelRuntime(); } void LayerNormBackward_ShapeInference_Base( benchmark::State& benchmark_state, bool disable_launch_parameter_cache) { std::unique_ptr fusion_ptr = std::make_unique(); FusionGuard fg(fusion_ptr.get()); // PreAllocate std::unique_ptr fec; std::vector aten_inputs; std::vector shape{20, 100, 35, 67}; std::vector norm_shape{67}; auto runtime = getLayerBackwardNormRuntime( std::move(fusion_ptr), fec, aten_inputs, shape, norm_shape); TORCH_INTERNAL_ASSERT( runtime->getMaybeHeuristicsFor(aten_inputs).has_value()); fec->profile(true); fec->disableKernelLaunch(); fec->runFusionWithInputs(aten_inputs); if (disable_launch_parameter_cache) { fec->disableLaunchParamCache(); } for (auto _ : benchmark_state) { // Setup (not included in the measurement) fec->runFusionWithInputs(aten_inputs); } } static void LayerNormBackward_ShapeInference( benchmark::State& benchmark_state) { LayerNormBackward_ShapeInference_Base(benchmark_state, true); } static void LayerNormBackward_NoShapeInferenceCachedBaseline( benchmark::State& benchmark_state) { LayerNormBackward_ShapeInference_Base(benchmark_state, false); } static auto getLayerForwardNormRuntime( std::unique_ptr fusion_ptr, std::unique_ptr& fec, std::vector& aten_inputs, std::vector& shape, std::vector& norm_shape) { Fusion& fusion = *fusion_ptr.get(); const float kEps = 1e-5; Double* eps_ptr = IrBuilder::create(kEps); auto input = makeSymbolicTensor(shape.size()); fusion.addInput(input); auto result = layer_norm(input, norm_shape, nullptr, nullptr, eps_ptr); fusion.addOutput(result.output); fusion.addOutput(result.mean); fusion.addOutput(result.invstd); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor aten_input = at::randn(shape, options); fec = std::make_unique(std::move(fusion_ptr)); aten_inputs = {aten_input}; auto cg_outputs = fec->runFusionWithInputs(aten_inputs); return fec->getMostRecentKernelRuntime(); } void LayerNormForward_ShapeInferenceBase( benchmark::State& benchmark_state, bool disable_launch_param_cache) { std::unique_ptr fusion_ptr = std::make_unique(); FusionGuard fg(fusion_ptr.get()); // PreAllocate std::unique_ptr fec; std::vector aten_inputs; std::vector shape{20, 100, 35, 67}; std::vector norm_shape{67}; auto runtime = getLayerForwardNormRuntime( std::move(fusion_ptr), fec, aten_inputs, shape, norm_shape); TORCH_INTERNAL_ASSERT( runtime->getMaybeHeuristicsFor(aten_inputs).has_value()); fec->profile(true); fec->disableKernelLaunch(); fec->runFusionWithInputs(aten_inputs); if (disable_launch_param_cache) { fec->disableLaunchParamCache(); } for (auto _ : benchmark_state) { // Setup (not included in the measurement) fec->runFusionWithInputs(aten_inputs); } } static void LayerNormForward_NoShapeInferenceCachedBaseline( benchmark::State& benchmark_state) { LayerNormForward_ShapeInferenceBase(benchmark_state, false); } static void LayerNormForward_ShapeInference(benchmark::State& benchmark_state) { LayerNormForward_ShapeInferenceBase(benchmark_state, true); } BENCHMARK(LayerNormBackward_ShapeInference)->Unit(benchmark::kMicrosecond); BENCHMARK(LayerNormForward_ShapeInference)->Unit(benchmark::kMicrosecond); BENCHMARK(LayerNormBackward_NoShapeInferenceCachedBaseline) ->Unit(benchmark::kMicrosecond); BENCHMARK(LayerNormForward_NoShapeInferenceCachedBaseline) ->Unit(benchmark::kMicrosecond);