// NOTE: the angle-bracketed include targets were lost in extraction; the
// header paths below are assumed (typical nvFuser benchmark includes) and may
// need to be adjusted to the local layout.
#include <torch/csrc/jit/codegen/cuda/arith.h>
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>

#include <benchmark/benchmark.h>

#include <cuda_runtime.h>

#include <benchmarks/cpp/nvfuser/utils.h>

#define TRANSPOSE_CONFIG \
  { true, false, false, false }

using namespace torch::jit::fuser::cuda;

struct TransposeConfig {
  bool input1_transpose_axes = false;
  bool input2_transpose_axes = false;
  bool intermediate_transpose_axes = false;
  bool output_transpose_axes = false;
};

std::vector<at::Tensor> generateInputs(
    DataType dtype,
    int num_dims,
    std::pair<int, int> axes,
    int perm_size,
    int innerdim_size,
    bool input1_transpose_axes,
    bool input2_transpose_axes,
    bool non_vectorize_offset = false,
    int iter_size = 32) {
  at::manual_seed(0);
  auto options =
      at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);

  std::vector<int64_t> transpose_shape(num_dims, iter_size);
  transpose_shape[axes.second] = innerdim_size;
  transpose_shape[axes.first] = perm_size;

  std::vector<int64_t> non_transpose_shape(num_dims, iter_size);
  non_transpose_shape[axes.first] = innerdim_size;
  non_transpose_shape[axes.second] = perm_size;

  // TensorType: Concrete, Contig, Symbolic
  // Vectorization | Unroll - Add 1 to sizes
  // Shift axis by 1 to disable vectorize loads
  if (non_vectorize_offset) {
    for (auto idx : c10::irange(transpose_shape.size())) {
      transpose_shape[idx] += 1;
    }
    for (auto idx : c10::irange(non_transpose_shape.size())) {
      non_transpose_shape[idx] += 1;
    }
  }

  auto optionalTransposeSize =
      [&transpose_shape, &non_transpose_shape](bool transpose_tensor) {
        return (transpose_tensor) ? transpose_shape : non_transpose_shape;
      };

  at::Tensor aten_input1 =
      at::randn(optionalTransposeSize(input1_transpose_axes), options);
  at::Tensor aten_input2 =
      at::randn(optionalTransposeSize(input2_transpose_axes), options);
  return {aten_input1, aten_input2};
}
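
// Worked example of the shapes produced above: with num_dims = 3,
// axes = {1, 2}, perm_size = 49, innerdim_size = 512 and the default
// iter_size = 32, transpose_shape is {32, 49, 512} and non_transpose_shape is
// {32, 512, 49}; each input picks one of the two depending on whether it is
// routed through the transpose in the fusion below.
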
//------------------------------------------------------------------------------

static void setupTranspose(
    Fusion* fusion,
    DataType dtype,
    int num_dims,
    std::pair<int, int> axes,
    TransposeConfig tc) {
  FusionGuard fg(fusion);

  auto optionalTranspose = [axes](TensorView* tv, bool is_transpose) {
    return (is_transpose) ? transpose(tv, axes.first, axes.second) : tv;
  };

  auto input1 = makeContigTensor(num_dims, dtype);
  auto input2 = makeContigTensor(num_dims, dtype);
  fusion->addInput(input1);
  fusion->addInput(input2);

  auto ot_input1 = optionalTranspose(input1, tc.input1_transpose_axes);
  auto ot_input2 = optionalTranspose(input2, tc.input2_transpose_axes);
  auto intermediate = add(ot_input1, ot_input2);
  auto ot_intermediate =
      optionalTranspose(intermediate, tc.intermediate_transpose_axes);
  auto output = relu(ot_intermediate);
  auto ot_output = optionalTranspose(output, tc.output_transpose_axes);
  fusion->addOutput(ot_output);
}

static void NvFuserScheduler_Transpose(
    benchmark::State& benchmark_state,
    FusionExecutorCache* fusion_executor_cache,
    DataType dtype,
    int num_dims,
    std::pair<int, int> axes,
    TransposeConfig tc) {
  auto aten_inputs = generateInputs(
      dtype,
      num_dims,
      axes,
      benchmark_state.range(0),
      benchmark_state.range(1),
      tc.input1_transpose_axes,
      tc.input2_transpose_axes);
  auto at_input1 = aten_inputs[0];
  auto at_input2 = aten_inputs[1];

  std::vector<c10::IValue> fuser_inputs = {at_input1, at_input2};
  runBenchmarkIterations(benchmark_state, fusion_executor_cache, fuser_inputs);

  // Two inputs read plus one output written per element.
  benchmark_state.SetBytesProcessed(
      int64_t(benchmark_state.iterations()) *
      ((at_input1.numel() * 3) * int64_t(dataTypeSize(dtype))));
}

//------------------------------------------------------------------------------

#define NVFUSER_TRANSPOSE_SQUARE_RUN(             \
    TITLE, DTYPE, NUM_DIMS, AXIS1, AXIS2, CONFIG) \
  NVFUSER_BENCHMARK_DEFINE(                       \
      TITLE,                                      \
      setupTranspose,                             \
      NvFuserScheduler_Transpose,                 \
      DTYPE,                                      \
      NUM_DIMS,                                   \
      {AXIS1, AXIS2},                             \
      CONFIG);                                    \
                                                  \
  NVFUSER_BENCHMARK_RUN(TITLE)                    \
      ->RangeMultiplier(8)                        \
      ->Args({9, 2408})                           \
      ->Args({16, 512})                           \
      ->Args({18, 96})                            \
      ->Args({24, 96})                            \
      ->Args({24, 256})                           \
      ->Args({24, 512})                           \
      ->Args({32, 27})                            \
      ->Args({32, 96})                            \
      ->Args({32, 288})                           \
      ->Args({32, 864})                           \
      ->Args({40, 120})                           \
      ->Args({48, 128})                           \
      ->Args({48, 256})                           \
      ->Args({49, 512})                           \
      ->Args({49, 1024})                          \
      ->Args({49, 2048})                          \
      ->Args({49, 4608})                          \
      ->Args({64, 64})                            \
      ->Args({64, 96})                            \
      ->Args({64, 128})                           \
      ->Args({64, 147})                           \
      ->Args({64, 192})                           \
      ->Args({64, 256})                           \
      ->Args({64, 288})                           \
      ->Args({64, 512})                           \
      ->Args({80, 64})                            \
      ->Args({81, 1728})                          \
      ->Args({83, 1728})                          \
      ->Args({96, 864})                           \
      ->Args({100, 1280})                         \
      ->Args({100, 4032})                         \
      ->Args({120, 40})                           \
      ->Args({128, 128})                          \
      ->Args({128, 512})                          \
      ->Args({128, 1152})                         \
      ->Args({192, 128})                          \
      ->Args({192, 256})                          \
      ->Args({192, 720})                          \
      ->Args({192, 768})                          \
      ->Args({192, 1120})                         \
      ->Args({192, 1728})                         \
      ->Args({196, 256})                          \
      ->Args({196, 512})                          \
      ->Args({196, 1024})                         \
      ->Args({196, 2304})                         \
      ->Args({256, 256})                          \
      ->Args({256, 1024})                         \
      ->Args({256, 2304})                         \
      ->Args({284, 512})                          \
      ->Args({320, 1280})                         \
      ->Args({320, 1728})                         \
      ->Args({324, 2592})                         \
      ->Args({361, 768})                          \
      ->Args({361, 1120})                         \
      ->Args({384, 2})                            \
      ->Args({384, 32})                           \
      ->Args({384, 128})                          \
      ->Args({384, 256})                          \
      ->Args({384, 512})                          \
      ->Args({384, 1280})                         \
      ->Args({384, 2592})                         \
      ->Args({384, 4032})                         \
      ->Args({448, 1280})                         \
      ->Args({480, 16})                           \
      ->Args({480, 256})                          \
      ->Args({512, 2})                            \
      ->Args({512, 16})                           \
      ->Args({512, 128})                          \
      ->Args({512, 256})                          \
      ->Args({512, 1024})                         \
      ->Args({512, 2048})                         \
      ->Args({512, 3072})                         \
      ->Args({512, 4608})                         \
      ->Args({784, 40})                           \
      ->Args({784, 120})                          \
      ->Args({784, 128})                          \
      ->Args({784, 1152})                         \
      ->Args({1001, 2408})                        \
      ->Args({1024, 16})                          \
      ->Args({1024, 256})                         \
      ->Args({1024, 512})                         \
      ->Args({1024, 1024})                        \
      ->Args({1024, 3072})                        \
      ->Args({1369, 192})                         \
      ->Args({1369, 256})                         \
      ->Args({1369, 288})                         \
      ->Args({2048, 512})                         \
      ->Args({2048, 1024})                        \
      ->Args({2250, 27})                          \
      ->Args({3072, 512})                         \
      ->Args({3072, 1024})                        \
      ->Args({3136, 64})                          \
      ->Args({5329, 720})                         \
      ->Args({5625, 64})                          \
      ->Args({12544, 147})                        \
      ->Args({22201, 288})                        \
      ->Unit(benchmark::kMicrosecond)
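
// Each Args pair feeds benchmark_state.range(0) and range(1), i.e.
// {perm_size, innerdim_size} as consumed by generateInputs, so the fixed list
// above sweeps a spread of 2D problem sizes. A further shape could be
// benchmarked by appending another "->Args({perm_size, innerdim_size})" entry
// (with the usual trailing line-continuation backslash) to the chain inside
// the macro.
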
NVFUSER_TRANSPOSE_SQUARE_RUN(
    NF_Transpose_Random_fp32_Inner_2D_01_Axis,
    DataType::Float,
    2 /* num_dims */,
    0 /* axis1 */,
    1 /* axis2 */,
    TransposeConfig(TRANSPOSE_CONFIG));

NVFUSER_TRANSPOSE_SQUARE_RUN(
    NF_Transpose_Random_fp32_Inner_3D_02_Axis,
    DataType::Float,
    3 /* num_dims */,
    0 /* axis1 */,
    2 /* axis2 */,
    TransposeConfig(TRANSPOSE_CONFIG));

NVFUSER_TRANSPOSE_SQUARE_RUN(
    NF_Transpose_Random_fp32_Inner_3D_12_Axis,
    DataType::Float,
    3 /* num_dims */,
    1 /* axis1 */,
    2 /* axis2 */,
    TransposeConfig(TRANSPOSE_CONFIG));

NVFUSER_TRANSPOSE_SQUARE_RUN(
    NF_Transpose_Random_fp32_Outer_3D_01_Axis,
    DataType::Float,
    3 /* num_dims */,
    0 /* axis1 */,
    1 /* axis2 */,
    TransposeConfig(TRANSPOSE_CONFIG));

//------------------------------------------------------------------------------

NVFUSER_TRANSPOSE_SQUARE_RUN(
    NF_Transpose_Random_fp16_Inner_2D_01_Axis,
    DataType::Half,
    2 /* num_dims */,
    0 /* axis1 */,
    1 /* axis2 */,
    TransposeConfig(TRANSPOSE_CONFIG));

NVFUSER_TRANSPOSE_SQUARE_RUN(
    NF_Transpose_Random_fp16_Inner_3D_02_Axis,
    DataType::Half,
    3 /* num_dims */,
    0 /* axis1 */,
    2 /* axis2 */,
    TransposeConfig(TRANSPOSE_CONFIG));

NVFUSER_TRANSPOSE_SQUARE_RUN(
    NF_Transpose_Random_fp16_Inner_3D_12_Axis,
    DataType::Half,
    3 /* num_dims */,
    1 /* axis1 */,
    2 /* axis2 */,
    TransposeConfig(TRANSPOSE_CONFIG));

NVFUSER_TRANSPOSE_SQUARE_RUN(
    NF_Transpose_Random_fp16_Outer_3D_01_Axis,
    DataType::Half,
    3 /* num_dims */,
    0 /* axis1 */,
    1 /* axis2 */,
    TransposeConfig(TRANSPOSE_CONFIG));

//------------------------------------------------------------------------------

#define NVFUSER_TRANSPOSE_RUN(TITLE, DTYPE, NUM_DIMS, AXIS1, AXIS2, CONFIG) \
  NVFUSER_BENCHMARK_DEFINE(                                                 \
      TITLE,                                                                \
      setupTranspose,                                                       \
      NvFuserScheduler_Transpose,                                           \
      DTYPE,                                                                \
      NUM_DIMS,                                                             \
      {AXIS1, AXIS2},                                                       \
      CONFIG);                                                              \
                                                                            \
  NVFUSER_BENCHMARK_RUN(TITLE)                                              \
      ->RangeMultiplier(8)                                                  \
      ->Ranges({{2, 256 * 256}, {160, 320}})                                \
      ->Unit(benchmark::kMicrosecond)

NVFUSER_TRANSPOSE_RUN(
    NF_Transpose_fp32_Inner_2D_01_Axis,
    DataType::Float,
    2 /* num_dims */,
    0 /* axis1 */,
    1 /* axis2 */,
    TransposeConfig(TRANSPOSE_CONFIG));

NVFUSER_TRANSPOSE_RUN(
    NF_Transpose_fp32_Inner_3D_02_Axis,
    DataType::Float,
    3 /* num_dims */,
    0 /* axis1 */,
    2 /* axis2 */,
    TransposeConfig(TRANSPOSE_CONFIG));

NVFUSER_TRANSPOSE_RUN(
    NF_Transpose_fp32_Inner_3D_12_Axis,
    DataType::Float,
    3 /* num_dims */,
    1 /* axis1 */,
    2 /* axis2 */,
    TransposeConfig(TRANSPOSE_CONFIG));

NVFUSER_TRANSPOSE_RUN(
    NF_Transpose_fp32_Outer_3D_01_Axis,
    DataType::Float,
    3 /* num_dims */,
    0 /* axis1 */,
    1 /* axis2 */,
    TransposeConfig(TRANSPOSE_CONFIG));

//------------------------------------------------------------------------------

NVFUSER_TRANSPOSE_RUN(
    NF_Transpose_fp16_Inner_2D_01_Axis,
    DataType::Half,
    2 /* num_dims */,
    0 /* axis1 */,
    1 /* axis2 */,
    TransposeConfig(TRANSPOSE_CONFIG));

NVFUSER_TRANSPOSE_RUN(
    NF_Transpose_fp16_Inner_3D_02_Axis,
    DataType::Half,
    3 /* num_dims */,
    0 /* axis1 */,
    2 /* axis2 */,
    TransposeConfig(TRANSPOSE_CONFIG));

NVFUSER_TRANSPOSE_RUN(
    NF_Transpose_fp16_Inner_3D_12_Axis,
    DataType::Half,
    3 /* num_dims */,
    1 /* axis1 */,
    2 /* axis2 */,
    TransposeConfig(TRANSPOSE_CONFIG));

NVFUSER_TRANSPOSE_RUN(
    NF_Transpose_fp16_Outer_3D_01_Axis,
    DataType::Half,
    3 /* num_dims */,
    0 /* axis1 */,
    1 /* axis2 */,
    TransposeConfig(TRANSPOSE_CONFIG));

//------------------------------------------------------------------------------
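// Eager-mode reference: the baseline below runs the same optional-transpose /
// add / relu sequence through ATen ops, timed per iteration with
// CudaKernelTimer, so the fused nvFuser runs above can be compared against
// unfused execution.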
static void Baseline_Transpose(
    benchmark::State& benchmark_state,
    DataType dtype,
    int num_dims,
    std::pair<int, int> axes,
    TransposeConfig tc) {
  auto aten_inputs = generateInputs(
      dtype,
      num_dims,
      axes,
      benchmark_state.range(0),
      benchmark_state.range(1),
      tc.input1_transpose_axes,
      tc.input2_transpose_axes);
  auto at_input1 = aten_inputs[0];
  auto at_input2 = aten_inputs[1];

  auto optionalTransposeAten = [&axes](at::Tensor x, bool is_transpose) {
    return (is_transpose) ? at::transpose(x, axes.first, axes.second) : x;
  };

  for (auto _ : benchmark_state) {
    clearL2Cache();
    CudaKernelTimer timer;

    auto at_ot_input1 =
        optionalTransposeAten(at_input1, tc.input1_transpose_axes);
    auto at_ot_input2 =
        optionalTransposeAten(at_input2, tc.input2_transpose_axes);
    auto at_intermediate = add(at_ot_input1, at_ot_input2);
    auto at_ot_intermediate =
        optionalTransposeAten(at_intermediate, tc.intermediate_transpose_axes);
    auto at_output = relu(at_ot_intermediate);
    auto at_ot_output =
        optionalTransposeAten(at_output, tc.output_transpose_axes);

    benchmark_state.SetIterationTime(timer.elapsed() / 1000.0);
  }
  // Sync everything up before we're finished, don't want to run ahead on the
  // cpu while benchmarking.
  cudaDeviceSynchronize();

  benchmark_state.SetBytesProcessed(
      int64_t(benchmark_state.iterations()) *
      (at_input1.numel() * 3 * int64_t(dataTypeSize(dtype))));
}

//------------------------------------------------------------------------------

static void Baseline_Transpose_fp32_Inner_2D_01_Axis(
    benchmark::State& benchmark_state) {
  Baseline_Transpose(
      benchmark_state,
      DataType::Float,
      2 /* num_dims */,
      {0, 1} /* axes */,
      TRANSPOSE_CONFIG);
}

static void Baseline_Transpose_fp16_Inner_2D_01_Axis(
    benchmark::State& benchmark_state) {
  Baseline_Transpose(
      benchmark_state,
      DataType::Half,
      2 /* num_dims */,
      {0, 1} /* axes */,
      TRANSPOSE_CONFIG);
}

//------------------------------------------------------------------------------

BENCHMARK(Baseline_Transpose_fp32_Inner_2D_01_Axis)
    // ->RangeMultiplier(2)
    ->Ranges({{2, 1024 * 1024}, {160, 320}})
    ->Unit(benchmark::kMicrosecond)
    ->UseManualTime();

BENCHMARK(Baseline_Transpose_fp16_Inner_2D_01_Axis)
    // ->RangeMultiplier(2)
    ->Ranges({{2, 1024 * 1024}, {160, 320}})
    ->Unit(benchmark::kMicrosecond)
    ->UseManualTime();

//------------------------------------------------------------------------------
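
// Hedged usage sketch: the benchmark binary name depends on the local build
// setup (assumed here to be "nvfuser_bench"); Google Benchmark's standard
// --benchmark_filter flag selects a subset of the registered runs, e.g.:
//
//   ./nvfuser_bench --benchmark_filter='NF_Transpose_fp32_Inner_2D_01_Axis.*'
//   ./nvfuser_bench --benchmark_filter='Baseline_Transpose_.*'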