[PyTorch][Vulkan] Refactor performance test binary (#114712)
Summary:
We create two files, `vulkan_perf_utils.h` and `vulkan_perf_utils.cpp`, which host several functions shared among the `perf_test` source files:
- `makeStack`
- `callOpByHandle`
- `callOpByName`
- `extractTotalShaderResultsAndSetState`
- `extractTotalOpResultsAndSetState`

so that they can be reused by all perf tests.

Test Plan:
We test `vulkan_conv_arithmetic_perf_test`, `vulkan_layernorm_perf_test`, and `vulkan_mm_perf_test` as follows.

- Build the binaries, at `fbsource`:
```
buck2 build -c ndk.debug_info_level=0 -c ndk.static_linking=true -c pt.enable_qpl=0 -c pt.vulkan_use_gpu_diagnostics=1 --target-platforms=ovr_config//platform/android:arm32-fbsource //xplat/caffe2:pt_vulkan_layernorm_perf_test_binAndroid --show-output -c pt.vulkan_full_precision=1
buck2 build -c ndk.debug_info_level=0 -c ndk.static_linking=true -c pt.enable_qpl=0 -c pt.vulkan_use_gpu_diagnostics=1 --target-platforms=ovr_config//platform/android:arm32-fbsource //xplat/caffe2:pt_vulkan_conv_arithmetic_perf_test_binAndroid --show-output -c pt.vulkan_full_precision=1
buck2 build -c ndk.debug_info_level=0 -c ndk.static_linking=true -c pt.enable_qpl=0 -c pt.vulkan_use_gpu_diagnostics=1 --target-platforms=ovr_config//platform/android:arm32-fbsource //xplat/caffe2:pt_vulkan_mm_perf_test_binAndroid --show-output -c pt.vulkan_full_precision=1
```

- Push to device:
```
adb push buck-out/v2/gen/fbsource/f1f3f9bed27e143c/xplat/caffe2/__pt_vulkan_conv_arithmetic_perf_test_binAndroid__/pt_vulkan_conv_arithmetic_perf_test_binAndroid /data/local/tmp
adb push buck-out/v2/gen/fbsource/f1f3f9bed27e143c/xplat/caffe2/__pt_vulkan_mm_perf_test_binAndroid__/pt_vulkan_mm_perf_test_binAndroid /data/local/tmp
adb push buck-out/v2/gen/fbsource/f1f3f9bed27e143c/xplat/caffe2/__pt_vulkan_layernorm_perf_test_binAndroid__/pt_vulkan_layernorm_perf_test_binAndroid /data/local/tmp
```

- Test on device:
```
adb shell /data/local/tmp/pt_vulkan_mm_perf_test_binAndroid
adb shell /data/local/tmp/pt_vulkan_layernorm_perf_test_binAndroid
adb shell /data/local/tmp/pt_vulkan_conv_arithmetic_perf_test_binAndroid
```

Full results:
- vulkan_mm_perf_test: P887658084
- vulkan_layernorm_perf_test: P887687924
- vulkan_conv_arithmetic_perf_test: P887689880

Reviewed By: yipjustin, liuk22

Differential Revision: D51451751

Pull Request resolved: https://github.com/pytorch/pytorch/pull/114712
Approved by: https://github.com/yipjustin
This commit is contained in:
parent 62df4f3428
commit 6317a0350e
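The shared `vulkan_perf_utils.h` / `vulkan_perf_utils.cpp` files themselves are not part of the hunks shown below. Based on the helpers that this diff removes from the individual perf-test sources, a minimal sketch of what the shared header might declare could look like the following; the exact namespaces, layout, and the split between the header and the `.cpp` are assumptions.

```cpp
// Hypothetical sketch of vulkan_perf_utils.h, inferred from the helpers moved
// out of the per-test sources in this commit; the real header may differ.
#pragma once

#ifdef USE_VULKAN_API

#include <cassert>
#include <vector>

#include <benchmark/benchmark.h>

#include <ATen/ATen.h>
#include <ATen/core/dispatch/Dispatcher.h>
#include <ATen/native/vulkan/api/api.h>

// Pack arbitrary arguments into an IValue stack for boxed dispatch.
template <class... Inputs>
inline std::vector<c10::IValue> makeStack(Inputs&&... inputs) {
  return {std::forward<Inputs>(inputs)...};
}

// Invoke an operator through its handle with a boxed stack; returns outputs.
template <class... Args>
inline std::vector<c10::IValue> callOpByHandle(
    const c10::OperatorHandle& op,
    Args... args) {
  auto stack = makeStack(std::forward<Args>(args)...);
  c10::Dispatcher::singleton().callBoxed(op, &stack);
  return stack;
}

// Look up an operator schema by (name, overload) and dispatch it boxed.
template <class... Args>
inline std::vector<c10::IValue> callOpByName(
    const char* func_name,
    const char* overload_name,
    Args... args) {
  const c10::optional<c10::OperatorHandle> op_handle =
      c10::Dispatcher::singleton().findSchema({func_name, overload_name});
  assert(op_handle.has_value());
  return callOpByHandle(op_handle.value(), std::forward<Args>(args)...);
}

#if defined(USE_VULKAN_GPU_DIAGNOSTICS) && defined(__ANDROID__)
// Defined in vulkan_perf_utils.cpp (in this sketch): report aggregated GPU
// shader latency, or the latency of a single named op, as the iteration time.
void extractTotalShaderResultsAndSetState(benchmark::State& state);
void extractTotalOpResultsAndSetState(benchmark::State& state, const char* op_name);
#endif

#endif /* USE_VULKAN_API */
```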
@@ -1,133 +0,0 @@
#include <unordered_map>
#ifdef USE_VULKAN_API

#include <benchmark/benchmark.h>

#include <ATen/ATen.h>
#include <ATen/core/dispatch/Dispatcher.h>
#include <ATen/native/vulkan/api/api.h>
#include <ATen/native/vulkan/ops/Common.h>
#include <ATen/native/vulkan/ops/Copy.h>
#include <ATen/native/vulkan/ops/Factory.h>
#include <ATen/native/vulkan/ops/QuantizedFunctions.h>
#include <ATen/native/vulkan/ops/Utils.h>

namespace {

#if defined(USE_VULKAN_GPU_DIAGNOSTICS) && defined(__ANDROID__)
static const float NANOSECONDS_IN_SECOND = 1000000000.0;
#endif

template <class... Inputs>
inline std::vector<c10::IValue> makeStack(Inputs&&... inputs) {
  return {std::forward<Inputs>(inputs)...};
}

template <class... Args>
inline std::vector<c10::IValue> callOpByHandle(
    const c10::OperatorHandle& op,
    Args... args) {
  auto stack = makeStack(std::forward<Args>(args)...);
  c10::Dispatcher::singleton().callBoxed(op, &stack);
  return stack;
}

template <class... Args>
inline std::vector<c10::IValue> callOpByName(
    const char* func_name,
    const char* overload_name,
    Args... args) {
  const c10::optional<c10::OperatorHandle> op_handle =
      c10::Dispatcher::singleton().findSchema({func_name, overload_name});
  assert(op_handle.has_value());
  return callOpByHandle(op_handle.value(), std::forward<Args>(args)...);
}

static void CommonMMBenchmarkSettings(benchmark::internal::Benchmark* b) {
  b->Unit(benchmark::kMillisecond);
  b->ArgNames({"N", "M", "P"});
}
#if defined(USE_VULKAN_GPU_DIAGNOSTICS) && defined(__ANDROID__)
// This function aggregate the latency of all invoked shaders except
// `vulkan.nchw_to_image` and `vulkan.image_to_nchw`, which are moving data
// between CPU and GPU memory.
static void extractTotalShaderResultsAndSetState(benchmark::State& state) {
  at::native::vulkan::api::context()->querypool().extract_results();

  uint64_t sum_shader_latency_in_nanoseconds = 0;
  auto result_aggregator =
      [&sum_shader_latency_in_nanoseconds](
          const at::native::vulkan::api::ShaderDuration& s) {
        if (s.kernel_name != "vulkan.nchw_to_image" &&
            s.kernel_name != "vulkan.image_to_nchw") {
          sum_shader_latency_in_nanoseconds += s.execution_duration_ns;
        }
      };
  at::native::vulkan::api::context()->querypool().shader_log_for_each(
      result_aggregator);

  float sum_shader_latency_in_seconds =
      sum_shader_latency_in_nanoseconds / NANOSECONDS_IN_SECOND;
  state.SetIterationTime(sum_shader_latency_in_seconds);
}
#endif

static void layer_norm_benchmark(benchmark::State& state) {
  // Guard
  if (!at::is_vulkan_available()) {
    return;
  }
#if defined(USE_VULKAN_GPU_DIAGNOSTICS) && defined(__ANDROID__)
  at::native::vulkan::api::context()->enable_op_profiling();
  at::native::vulkan::api::context()->reset_querypool();
#endif

  c10::InferenceMode mode;

  // Arrange
  const auto c = state.range(0);
  const auto h = state.range(1);
  const auto w = state.range(2);

  const auto in_cpu =
      at::rand({c, h, w}, at::device(at::kCPU).dtype(at::kFloat));
  const auto in_vulkan = in_cpu.vulkan();

  const auto weight_cpu =
      at::rand({c, h, w}, at::device(at::kCPU).dtype(at::kFloat));
  const auto weight_vulkan = weight_cpu.vulkan();

  const auto bias_cpu =
      at::rand({c, h, w}, at::device(at::kCPU).dtype(at::kFloat));
  const auto bias_vulkan = bias_cpu.vulkan();

  // Act
  for (auto _ : state) {
    const auto vulkan_out =
        at::layer_norm(
            in_vulkan, {c, h, w}, weight_vulkan, bias_vulkan, 1e-05, false)
            .cpu();
  }
#if defined(USE_VULKAN_GPU_DIAGNOSTICS) && defined(__ANDROID__)
  extractTotalShaderResultsAndSetState(state);
  at::native::vulkan::api::context()->querypool().print_results();
#endif
}

} // namespace

const uint32_t BENCHMARK_MM_N = 75;
const uint32_t BENCHMARK_MM_M = 75;
const uint32_t BENCHMARK_MM_P = 75;
const uint32_t BENCHMARK_ITERATIONS = 50;

BENCHMARK(layer_norm_benchmark)
    ->Apply(CommonMMBenchmarkSettings)
    ->UseManualTime()
    ->Threads(1)
    ->Iterations(BENCHMARK_ITERATIONS)
    ->Args({BENCHMARK_MM_N, BENCHMARK_MM_M, BENCHMARK_MM_P});

BENCHMARK_MAIN();

#endif /* USE_VULKAN_API */
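For context, the dispatcher helpers deleted above are used by the perf tests to invoke ATen operators through the boxed dispatcher. A hedged usage sketch follows; the operator (`aten::add.Tensor`), the tensor shapes, and the example function name are illustrative choices, not taken from this commit.

```cpp
// Illustrative only: run an op through the boxed dispatcher via callOpByName.
// Schema assumed: add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1).
static void boxed_add_example() {
  const auto a =
      at::rand({3, 3}, at::device(at::kCPU).dtype(at::kFloat)).vulkan();
  const auto b =
      at::rand({3, 3}, at::device(at::kCPU).dtype(at::kFloat)).vulkan();
  // All arguments, including the keyword-only alpha, go onto the boxed stack.
  const auto stack = callOpByName("aten::add", "Tensor", a, b, 1.0);
  const auto out_cpu = stack[0].toTensor().cpu();
}
```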
@@ -1,15 +1,7 @@
#include <unordered_map>
#ifdef USE_VULKAN_API

#include <benchmark/benchmark.h>

#include <ATen/ATen.h>
#include <ATen/native/vulkan/api/api.h>
#include <ATen/native/vulkan/ops/Common.h>
#include <ATen/native/vulkan/ops/Copy.h>
#include <ATen/native/vulkan/ops/Factory.h>
#include <ATen/native/vulkan/ops/QuantizedFunctions.h>
#include <ATen/native/vulkan/ops/Utils.h>
#include "vulkan_perf_utils.h"
#if defined(USE_VULKAN_GPU_DIAGNOSTICS) && defined(__ANDROID__)
#include <iostream>
#endif
@@ -18,10 +10,6 @@ namespace {

namespace vulkan_api = at::native::vulkan::api;

#if defined(USE_VULKAN_GPU_DIAGNOSTICS) && defined(__ANDROID__)
static const float NANOSECONDS_IN_SECOND = 1000000000.0;
#endif

#if defined(USE_VULKAN_GPU_DIAGNOSTICS) && defined(__ANDROID__)
void report_pep(const std::string& name, const uint64_t duration) {
  std::stringstream buffer;
@@ -61,18 +49,6 @@ void report_aibench_res(vulkan_api::QueryPool& qpool) {
}
#endif

#if defined(USE_VULKAN_GPU_DIAGNOSTICS) && defined(__ANDROID__)
static void extractTotalOpResultsAndSetState(
    benchmark::State& state,
    const char* op_name) {
  at::native::vulkan::api::context()->querypool().extract_results();
  float total_op_time =
      at::native::vulkan::api::context()->querypool().get_total_op_ns(op_name) /
      NANOSECONDS_IN_SECOND;
  state.SetIterationTime(total_op_time);
}
#endif

at::Tensor vulkan_to_cpu(const at::Tensor& vulkan, const at::Tensor& in_cpu) {
  auto q_options = in_cpu.options();
  if (q_options.dtype().toScalarType() == c10::ScalarType::QUInt8) {
@@ -0,0 +1,71 @@
#include <unordered_map>
#ifdef USE_VULKAN_API

#include "vulkan_perf_utils.h"

namespace {

static void CommonMMBenchmarkSettings(benchmark::internal::Benchmark* b) {
  b->Unit(benchmark::kMillisecond);
  b->ArgNames({"N", "M", "P"});
}

static void layer_norm_benchmark(benchmark::State& state) {
  // Guard
  if (!at::is_vulkan_available()) {
    return;
  }
#if defined(USE_VULKAN_GPU_DIAGNOSTICS) && defined(__ANDROID__)
  at::native::vulkan::api::context()->enable_op_profiling();
  at::native::vulkan::api::context()->reset_querypool();
#endif

  c10::InferenceMode mode;

  // Arrange
  const auto c = state.range(0);
  const auto h = state.range(1);
  const auto w = state.range(2);

  const auto in_cpu =
      at::rand({c, h, w}, at::device(at::kCPU).dtype(at::kFloat));
  const auto in_vulkan = in_cpu.vulkan();

  const auto weight_cpu =
      at::rand({c, h, w}, at::device(at::kCPU).dtype(at::kFloat));
  const auto weight_vulkan = weight_cpu.vulkan();

  const auto bias_cpu =
      at::rand({c, h, w}, at::device(at::kCPU).dtype(at::kFloat));
  const auto bias_vulkan = bias_cpu.vulkan();

  // Act
  for (auto _ : state) {
    const auto vulkan_out =
        at::layer_norm(
            in_vulkan, {c, h, w}, weight_vulkan, bias_vulkan, 1e-05, false)
            .cpu();
  }
#if defined(USE_VULKAN_GPU_DIAGNOSTICS) && defined(__ANDROID__)
  extractTotalShaderResultsAndSetState(state);
  at::native::vulkan::api::context()->querypool().print_results();
#endif
}

} // namespace

const uint32_t BENCHMARK_MM_N = 75;
const uint32_t BENCHMARK_MM_M = 75;
const uint32_t BENCHMARK_MM_P = 75;
const uint32_t BENCHMARK_ITERATIONS = 50;

BENCHMARK(layer_norm_benchmark)
    ->Apply(CommonMMBenchmarkSettings)
    ->UseManualTime()
    ->Threads(1)
    ->Iterations(BENCHMARK_ITERATIONS)
    ->Args({BENCHMARK_MM_N, BENCHMARK_MM_M, BENCHMARK_MM_P});

BENCHMARK_MAIN();

#endif /* USE_VULKAN_API */
@@ -1,88 +1,14 @@
#include <unordered_map>
#ifdef USE_VULKAN_API

#include <benchmark/benchmark.h>

#include <ATen/ATen.h>
#include <ATen/core/dispatch/Dispatcher.h>
#include <ATen/native/vulkan/api/api.h>
#include <ATen/native/vulkan/ops/Common.h>
#include <ATen/native/vulkan/ops/Copy.h>
#include <ATen/native/vulkan/ops/Factory.h>
#include <ATen/native/vulkan/ops/QuantizedFunctions.h>
#include <ATen/native/vulkan/ops/Utils.h>
#include "vulkan_perf_utils.h"

namespace {

#if defined(USE_VULKAN_GPU_DIAGNOSTICS) && defined(__ANDROID__)
static const float NANOSECONDS_IN_SECOND = 1000000000.0;
#endif

template <class... Inputs>
inline std::vector<c10::IValue> makeStack(Inputs&&... inputs) {
  return {std::forward<Inputs>(inputs)...};
}

template <class... Args>
inline std::vector<c10::IValue> callOpByHandle(
    const c10::OperatorHandle& op,
    Args... args) {
  auto stack = makeStack(std::forward<Args>(args)...);
  c10::Dispatcher::singleton().callBoxed(op, &stack);
  return stack;
}

template <class... Args>
inline std::vector<c10::IValue> callOpByName(
    const char* func_name,
    const char* overload_name,
    Args... args) {
  const c10::optional<c10::OperatorHandle> op_handle =
      c10::Dispatcher::singleton().findSchema({func_name, overload_name});
  assert(op_handle.has_value());
  return callOpByHandle(op_handle.value(), std::forward<Args>(args)...);
}

static void CommonMMBenchmarkSettings(benchmark::internal::Benchmark* b) {
  b->Unit(benchmark::kMillisecond);
  b->ArgNames({"N", "M", "P"});
}
#if defined(USE_VULKAN_GPU_DIAGNOSTICS) && defined(__ANDROID__)
static void extractTotalOpResultsAndSetState(
    benchmark::State& state,
    const char* op_name) {
  at::native::vulkan::api::context()->querypool().extract_results();
  float total_op_time =
      at::native::vulkan::api::context()->querypool().get_total_op_ns(op_name) /
      NANOSECONDS_IN_SECOND;
  state.SetIterationTime(total_op_time);
}
#endif

#if defined(USE_VULKAN_GPU_DIAGNOSTICS) && defined(__ANDROID__)
// This function aggregate the latency of all invoked shaders except
// `vulkan.nchw_to_image` and `vulkan.image_to_nchw`, which are moving data
// between CPU and GPU memory.
static void extractTotalShaderResultsAndSetState(benchmark::State& state) {
  at::native::vulkan::api::context()->querypool().extract_results();

  uint64_t sum_shader_latency_in_nanoseconds = 0;
  auto result_aggregator =
      [&sum_shader_latency_in_nanoseconds](
          const at::native::vulkan::api::ShaderDuration& s) {
        if (s.kernel_name != "vulkan.nchw_to_image" &&
            s.kernel_name != "vulkan.image_to_nchw") {
          sum_shader_latency_in_nanoseconds += s.execution_duration_ns;
        }
      };
  at::native::vulkan::api::context()->querypool().shader_log_for_each(
      result_aggregator);

  float sum_shader_latency_in_seconds =
      sum_shader_latency_in_nanoseconds / NANOSECONDS_IN_SECOND;
  state.SetIterationTime(sum_shader_latency_in_seconds);
}
#endif

static void mm_benchmark(benchmark::State& state) {
  // Guard
@@ -183,9 +109,10 @@ static void create_linear_context_benchmark(benchmark::State& state) {
          "vulkan.image_to_nchw") /
      NANOSECONDS_IN_SECOND;
  total_op_time +=
      at::native::vulkan::api::context()->querypool().get_total_op_ns(
          "vulkan.convert_channels_to_height_packed") /
      NANOSECONDS_IN_SECOND;
      at::native::vulkan::api::context()->querypool().get_total_op_ns(
          "vulkan.convert_channels_to_height_packed") /
      NANOSECONDS_IN_SECOND;

  state.SetIterationTime(total_op_time);

  at::native::vulkan::api::context()->querypool().print_results();