[PyTorch][Vulkan] Refactor performance test binary (#114712)

Summary:
We create two files, `vulkan_perf_utils.h` and `vulkan_perf_utils.cpp`, which host several functions shared among the `perf_test` source files:
- `makeStack`
- `callOpByHandle`
- `callOpByName`
- `extractTotalShaderResultsAndSetState`
- `extractTotalOpResultsAndSetState`

so that they can be reused across all the perf tests.
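
A rough sketch of the interface the new header might expose, pieced together from the helper functions consolidated out of the individual test files in the diff below. The header name, `#pragma once`, the exact include list, and the split between header definitions and `vulkan_perf_utils.cpp` are assumptions; only the function signatures and bodies shown are taken from the diff.

```cpp
// vulkan_perf_utils.h -- hypothetical sketch, not the literal file contents.
#pragma once
#ifdef USE_VULKAN_API

#include <cassert>

#include <benchmark/benchmark.h>

#include <ATen/ATen.h>
#include <ATen/core/dispatch/Dispatcher.h>
#include <ATen/native/vulkan/api/api.h>

// Build a boxed-argument stack from arbitrary inputs.
template <class... Inputs>
inline std::vector<c10::IValue> makeStack(Inputs&&... inputs) {
  return {std::forward<Inputs>(inputs)...};
}

// Invoke an operator through an already-resolved handle.
template <class... Args>
inline std::vector<c10::IValue> callOpByHandle(
    const c10::OperatorHandle& op,
    Args... args) {
  auto stack = makeStack(std::forward<Args>(args)...);
  c10::Dispatcher::singleton().callBoxed(op, &stack);
  return stack;
}

// Look up an operator by (name, overload) and invoke it.
template <class... Args>
inline std::vector<c10::IValue> callOpByName(
    const char* func_name,
    const char* overload_name,
    Args... args) {
  const c10::optional<c10::OperatorHandle> op_handle =
      c10::Dispatcher::singleton().findSchema({func_name, overload_name});
  assert(op_handle.has_value());
  return callOpByHandle(op_handle.value(), std::forward<Args>(args)...);
}

#if defined(USE_VULKAN_GPU_DIAGNOSTICS) && defined(__ANDROID__)
// Sum the GPU latency of all shaders except the CPU<->GPU copy shaders
// (vulkan.nchw_to_image / vulkan.image_to_nchw) and report it as the
// benchmark's manual iteration time. Assumed to be defined in
// vulkan_perf_utils.cpp.
void extractTotalShaderResultsAndSetState(benchmark::State& state);

// Report the total GPU time of a single named op as the iteration time.
// Assumed to be defined in vulkan_perf_utils.cpp.
void extractTotalOpResultsAndSetState(
    benchmark::State& state,
    const char* op_name);
#endif

#endif /* USE_VULKAN_API */
```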

Test Plan:
We test `vulkan_conv_arithmetic_perf_test`, `vulkan_layernorm_perf_test`, and `vulkan_mm_perf_test` as follows.
- build the binaries, at `fbsource`
```
buck2 build  -c ndk.debug_info_level=0  -c ndk.static_linking=true -c pt.enable_qpl=0 -c pt.vulkan_use_gpu_diagnostics=1 --target-platforms=ovr_config//platform/android:arm32-fbsource //xplat/caffe2:pt_vulkan_layernorm_perf_test_binAndroid  --show-output  -c pt.vulkan_full_precision=1
buck2 build  -c ndk.debug_info_level=0  -c ndk.static_linking=true -c pt.enable_qpl=0 -c pt.vulkan_use_gpu_diagnostics=1 --target-platforms=ovr_config//platform/android:arm32-fbsource //xplat/caffe2:pt_vulkan_conv_arithmetic_perf_test_binAndroid  --show-output  -c pt.vulkan_full_precision=1
buck2 build  -c ndk.debug_info_level=0  -c ndk.static_linking=true -c pt.enable_qpl=0 -c pt.vulkan_use_gpu_diagnostics=1 --target-platforms=ovr_config//platform/android:arm32-fbsource //xplat/caffe2:pt_vulkan_mm_perf_test_binAndroid  --show-output  -c pt.vulkan_full_precision=1
```
- push the binaries to the device
```
adb push buck-out/v2/gen/fbsource/f1f3f9bed27e143c/xplat/caffe2/__pt_vulkan_conv_arithmetic_perf_test_binAndroid__/pt_vulkan_conv_arithmetic_perf_test_binAndroid /data/local/tmp
adb push buck-out/v2/gen/fbsource/f1f3f9bed27e143c/xplat/caffe2/__pt_vulkan_mm_perf_test_binAndroid__/pt_vulkan_mm_perf_test_binAndroid /data/local/tmp
adb push buck-out/v2/gen/fbsource/f1f3f9bed27e143c/xplat/caffe2/__pt_vulkan_layernorm_perf_test_binAndroid__/pt_vulkan_layernorm_perf_test_binAndroid /data/local/tmp
```
- run the tests on the device
```
adb shell /data/local/tmp/pt_vulkan_mm_perf_test_binAndroid
adb shell /data/local/tmp/pt_vulkan_layernorm_perf_test_binAndroid
adb shell /data/local/tmp/pt_vulkan_conv_arithmetic_perf_test_binAndroid
```
Full results:
- vulkan_mm_perf_test: P887658084
- vulkan_layernorm_perf_test: P887687924
- vulkan_conv_arithmetic_perf_test: P887689880

Reviewed By: yipjustin, liuk22

Differential Revision: D51451751

Pull Request resolved: https://github.com/pytorch/pytorch/pull/114712
Approved by: https://github.com/yipjustin
Author: Wei Lu, 2023-12-04 19:49:50 +00:00 (committed by PyTorch MergeBot)
Parent: 62df4f3428
Commit: 6317a0350e
4 changed files, 77 additions, 236 deletions

@@ -1,133 +0,0 @@
#include <unordered_map>
#ifdef USE_VULKAN_API
#include <benchmark/benchmark.h>
#include <ATen/ATen.h>
#include <ATen/core/dispatch/Dispatcher.h>
#include <ATen/native/vulkan/api/api.h>
#include <ATen/native/vulkan/ops/Common.h>
#include <ATen/native/vulkan/ops/Copy.h>
#include <ATen/native/vulkan/ops/Factory.h>
#include <ATen/native/vulkan/ops/QuantizedFunctions.h>
#include <ATen/native/vulkan/ops/Utils.h>
namespace {
#if defined(USE_VULKAN_GPU_DIAGNOSTICS) && defined(__ANDROID__)
static const float NANOSECONDS_IN_SECOND = 1000000000.0;
#endif
template <class... Inputs>
inline std::vector<c10::IValue> makeStack(Inputs&&... inputs) {
return {std::forward<Inputs>(inputs)...};
}
template <class... Args>
inline std::vector<c10::IValue> callOpByHandle(
const c10::OperatorHandle& op,
Args... args) {
auto stack = makeStack(std::forward<Args>(args)...);
c10::Dispatcher::singleton().callBoxed(op, &stack);
return stack;
}
template <class... Args>
inline std::vector<c10::IValue> callOpByName(
const char* func_name,
const char* overload_name,
Args... args) {
const c10::optional<c10::OperatorHandle> op_handle =
c10::Dispatcher::singleton().findSchema({func_name, overload_name});
assert(op_handle.has_value());
return callOpByHandle(op_handle.value(), std::forward<Args>(args)...);
}
static void CommonMMBenchmarkSettings(benchmark::internal::Benchmark* b) {
b->Unit(benchmark::kMillisecond);
b->ArgNames({"N", "M", "P"});
}
#if defined(USE_VULKAN_GPU_DIAGNOSTICS) && defined(__ANDROID__)
// This function aggregate the latency of all invoked shaders except
// `vulkan.nchw_to_image` and `vulkan.image_to_nchw`, which are moving data
// between CPU and GPU memory.
static void extractTotalShaderResultsAndSetState(benchmark::State& state) {
at::native::vulkan::api::context()->querypool().extract_results();
uint64_t sum_shader_latency_in_nanoseconds = 0;
auto result_aggregator =
[&sum_shader_latency_in_nanoseconds](
const at::native::vulkan::api::ShaderDuration& s) {
if (s.kernel_name != "vulkan.nchw_to_image" &&
s.kernel_name != "vulkan.image_to_nchw") {
sum_shader_latency_in_nanoseconds += s.execution_duration_ns;
}
};
at::native::vulkan::api::context()->querypool().shader_log_for_each(
result_aggregator);
float sum_shader_latency_in_seconds =
sum_shader_latency_in_nanoseconds / NANOSECONDS_IN_SECOND;
state.SetIterationTime(sum_shader_latency_in_seconds);
}
#endif
static void layer_norm_benchmark(benchmark::State& state) {
// Guard
if (!at::is_vulkan_available()) {
return;
}
#if defined(USE_VULKAN_GPU_DIAGNOSTICS) && defined(__ANDROID__)
at::native::vulkan::api::context()->enable_op_profiling();
at::native::vulkan::api::context()->reset_querypool();
#endif
c10::InferenceMode mode;
// Arrange
const auto c = state.range(0);
const auto h = state.range(1);
const auto w = state.range(2);
const auto in_cpu =
at::rand({c, h, w}, at::device(at::kCPU).dtype(at::kFloat));
const auto in_vulkan = in_cpu.vulkan();
const auto weight_cpu =
at::rand({c, h, w}, at::device(at::kCPU).dtype(at::kFloat));
const auto weight_vulkan = weight_cpu.vulkan();
const auto bias_cpu =
at::rand({c, h, w}, at::device(at::kCPU).dtype(at::kFloat));
const auto bias_vulkan = bias_cpu.vulkan();
// Act
for (auto _ : state) {
const auto vulkan_out =
at::layer_norm(
in_vulkan, {c, h, w}, weight_vulkan, bias_vulkan, 1e-05, false)
.cpu();
}
#if defined(USE_VULKAN_GPU_DIAGNOSTICS) && defined(__ANDROID__)
extractTotalShaderResultsAndSetState(state);
at::native::vulkan::api::context()->querypool().print_results();
#endif
}
} // namespace
const uint32_t BENCHMARK_MM_N = 75;
const uint32_t BENCHMARK_MM_M = 75;
const uint32_t BENCHMARK_MM_P = 75;
const uint32_t BENCHMARK_ITERATIONS = 50;
BENCHMARK(layer_norm_benchmark)
->Apply(CommonMMBenchmarkSettings)
->UseManualTime()
->Threads(1)
->Iterations(BENCHMARK_ITERATIONS)
->Args({BENCHMARK_MM_N, BENCHMARK_MM_M, BENCHMARK_MM_P});
BENCHMARK_MAIN();
#endif /* USE_VULKAN_API */

@@ -1,15 +1,7 @@
#include <unordered_map>
#ifdef USE_VULKAN_API
#include <benchmark/benchmark.h>
#include <ATen/ATen.h>
#include <ATen/native/vulkan/api/api.h>
#include <ATen/native/vulkan/ops/Common.h>
#include <ATen/native/vulkan/ops/Copy.h>
#include <ATen/native/vulkan/ops/Factory.h>
#include <ATen/native/vulkan/ops/QuantizedFunctions.h>
#include <ATen/native/vulkan/ops/Utils.h>
#include "vulkan_perf_utils.h"
#if defined(USE_VULKAN_GPU_DIAGNOSTICS) && defined(__ANDROID__)
#include <iostream>
#endif
@@ -18,10 +10,6 @@ namespace {
namespace vulkan_api = at::native::vulkan::api;
#if defined(USE_VULKAN_GPU_DIAGNOSTICS) && defined(__ANDROID__)
static const float NANOSECONDS_IN_SECOND = 1000000000.0;
#endif
#if defined(USE_VULKAN_GPU_DIAGNOSTICS) && defined(__ANDROID__)
void report_pep(const std::string& name, const uint64_t duration) {
std::stringstream buffer;
@@ -61,18 +49,6 @@ void report_aibench_res(vulkan_api::QueryPool& qpool) {
}
#endif
#if defined(USE_VULKAN_GPU_DIAGNOSTICS) && defined(__ANDROID__)
static void extractTotalOpResultsAndSetState(
benchmark::State& state,
const char* op_name) {
at::native::vulkan::api::context()->querypool().extract_results();
float total_op_time =
at::native::vulkan::api::context()->querypool().get_total_op_ns(op_name) /
NANOSECONDS_IN_SECOND;
state.SetIterationTime(total_op_time);
}
#endif
at::Tensor vulkan_to_cpu(const at::Tensor& vulkan, const at::Tensor& in_cpu) {
auto q_options = in_cpu.options();
if (q_options.dtype().toScalarType() == c10::ScalarType::QUInt8) {

@@ -0,0 +1,71 @@
#include <unordered_map>
#ifdef USE_VULKAN_API
#include "vulkan_perf_utils.h"
namespace {
static void CommonMMBenchmarkSettings(benchmark::internal::Benchmark* b) {
b->Unit(benchmark::kMillisecond);
b->ArgNames({"N", "M", "P"});
}
static void layer_norm_benchmark(benchmark::State& state) {
// Guard
if (!at::is_vulkan_available()) {
return;
}
#if defined(USE_VULKAN_GPU_DIAGNOSTICS) && defined(__ANDROID__)
at::native::vulkan::api::context()->enable_op_profiling();
at::native::vulkan::api::context()->reset_querypool();
#endif
c10::InferenceMode mode;
// Arrange
const auto c = state.range(0);
const auto h = state.range(1);
const auto w = state.range(2);
const auto in_cpu =
at::rand({c, h, w}, at::device(at::kCPU).dtype(at::kFloat));
const auto in_vulkan = in_cpu.vulkan();
const auto weight_cpu =
at::rand({c, h, w}, at::device(at::kCPU).dtype(at::kFloat));
const auto weight_vulkan = weight_cpu.vulkan();
const auto bias_cpu =
at::rand({c, h, w}, at::device(at::kCPU).dtype(at::kFloat));
const auto bias_vulkan = bias_cpu.vulkan();
// Act
for (auto _ : state) {
const auto vulkan_out =
at::layer_norm(
in_vulkan, {c, h, w}, weight_vulkan, bias_vulkan, 1e-05, false)
.cpu();
}
#if defined(USE_VULKAN_GPU_DIAGNOSTICS) && defined(__ANDROID__)
extractTotalShaderResultsAndSetState(state);
at::native::vulkan::api::context()->querypool().print_results();
#endif
}
} // namespace
const uint32_t BENCHMARK_MM_N = 75;
const uint32_t BENCHMARK_MM_M = 75;
const uint32_t BENCHMARK_MM_P = 75;
const uint32_t BENCHMARK_ITERATIONS = 50;
BENCHMARK(layer_norm_benchmark)
->Apply(CommonMMBenchmarkSettings)
->UseManualTime()
->Threads(1)
->Iterations(BENCHMARK_ITERATIONS)
->Args({BENCHMARK_MM_N, BENCHMARK_MM_M, BENCHMARK_MM_P});
BENCHMARK_MAIN();
#endif /* USE_VULKAN_API */

@@ -1,88 +1,14 @@
#include <unordered_map>
#ifdef USE_VULKAN_API
#include <benchmark/benchmark.h>
#include <ATen/ATen.h>
#include <ATen/core/dispatch/Dispatcher.h>
#include <ATen/native/vulkan/api/api.h>
#include <ATen/native/vulkan/ops/Common.h>
#include <ATen/native/vulkan/ops/Copy.h>
#include <ATen/native/vulkan/ops/Factory.h>
#include <ATen/native/vulkan/ops/QuantizedFunctions.h>
#include <ATen/native/vulkan/ops/Utils.h>
#include "vulkan_perf_utils.h"
namespace {
#if defined(USE_VULKAN_GPU_DIAGNOSTICS) && defined(__ANDROID__)
static const float NANOSECONDS_IN_SECOND = 1000000000.0;
#endif
template <class... Inputs>
inline std::vector<c10::IValue> makeStack(Inputs&&... inputs) {
return {std::forward<Inputs>(inputs)...};
}
template <class... Args>
inline std::vector<c10::IValue> callOpByHandle(
const c10::OperatorHandle& op,
Args... args) {
auto stack = makeStack(std::forward<Args>(args)...);
c10::Dispatcher::singleton().callBoxed(op, &stack);
return stack;
}
template <class... Args>
inline std::vector<c10::IValue> callOpByName(
const char* func_name,
const char* overload_name,
Args... args) {
const c10::optional<c10::OperatorHandle> op_handle =
c10::Dispatcher::singleton().findSchema({func_name, overload_name});
assert(op_handle.has_value());
return callOpByHandle(op_handle.value(), std::forward<Args>(args)...);
}
static void CommonMMBenchmarkSettings(benchmark::internal::Benchmark* b) {
b->Unit(benchmark::kMillisecond);
b->ArgNames({"N", "M", "P"});
}
#if defined(USE_VULKAN_GPU_DIAGNOSTICS) && defined(__ANDROID__)
static void extractTotalOpResultsAndSetState(
benchmark::State& state,
const char* op_name) {
at::native::vulkan::api::context()->querypool().extract_results();
float total_op_time =
at::native::vulkan::api::context()->querypool().get_total_op_ns(op_name) /
NANOSECONDS_IN_SECOND;
state.SetIterationTime(total_op_time);
}
#endif
#if defined(USE_VULKAN_GPU_DIAGNOSTICS) && defined(__ANDROID__)
// This function aggregate the latency of all invoked shaders except
// `vulkan.nchw_to_image` and `vulkan.image_to_nchw`, which are moving data
// between CPU and GPU memory.
static void extractTotalShaderResultsAndSetState(benchmark::State& state) {
at::native::vulkan::api::context()->querypool().extract_results();
uint64_t sum_shader_latency_in_nanoseconds = 0;
auto result_aggregator =
[&sum_shader_latency_in_nanoseconds](
const at::native::vulkan::api::ShaderDuration& s) {
if (s.kernel_name != "vulkan.nchw_to_image" &&
s.kernel_name != "vulkan.image_to_nchw") {
sum_shader_latency_in_nanoseconds += s.execution_duration_ns;
}
};
at::native::vulkan::api::context()->querypool().shader_log_for_each(
result_aggregator);
float sum_shader_latency_in_seconds =
sum_shader_latency_in_nanoseconds / NANOSECONDS_IN_SECOND;
state.SetIterationTime(sum_shader_latency_in_seconds);
}
#endif
static void mm_benchmark(benchmark::State& state) {
// Guard
@@ -183,9 +109,10 @@ static void create_linear_context_benchmark(benchmark::State& state) {
"vulkan.image_to_nchw") /
NANOSECONDS_IN_SECOND;
total_op_time +=
at::native::vulkan::api::context()->querypool().get_total_op_ns(
"vulkan.convert_channels_to_height_packed") /
NANOSECONDS_IN_SECOND;
at::native::vulkan::api::context()->querypool().get_total_op_ns(
"vulkan.convert_channels_to_height_packed") /
NANOSECONDS_IN_SECOND;
state.SetIterationTime(total_op_time);
at::native::vulkan::api::context()->querypool().print_results();