mirror of
https://github.com/zebrajr/pytorch.git
synced 2025-12-07 12:21:27 +01:00
Prerequisite change for enabling `-Werror=sign-compare` across PyTorch repo Pull Request resolved: https://github.com/pytorch/pytorch/pull/75082 Approved by: https://github.com/ngimel
331 lines
9.3 KiB
C++
331 lines
9.3 KiB
C++
#include "observers/perf_observer.h"
|
|
#include "observers/observer_config.h"
|
|
#ifndef C10_MOBILE
|
|
#include "caffe2/core/flags.h"
|
|
#include "observers/net_observer_reporter_print.h"
|
|
#endif
|
|
|
|
#include <random>
|
|
// NOLINTNEXTLINE(modernize-deprecated-headers)
|
|
#include <time.h>
|
|
#include "caffe2/core/common.h"
|
|
#include "caffe2/core/init.h"
|
|
#include "caffe2/core/operator.h"
|
|
|
|
#if defined(TARGET_OS_MAC) || \
|
|
defined(TARGET_OS_IPHONE) || \
|
|
defined(TARGET_IPHONE_SIMULATOR)
|
|
#define _APPLE 1
|
|
#endif
|
|
|
|
#ifdef _WIN32
|
|
#ifndef WIN32_LEAN_AND_MEAN
|
|
#define WIN32_LEAN_AND_MEAN
|
|
#endif
|
|
#include <windows.h>
|
|
#endif
|
|
|
|
#ifdef _APPLE
|
|
#include <mach/mach_time.h>
|
|
#include <sys/time.h>
|
|
#include <sys/resource.h>
|
|
#endif
|
|
|
|
#ifndef C10_MOBILE
|
|
C10_DEFINE_int64(
|
|
aiBench_netInitSampleRate,
|
|
0,
|
|
"One in N sampling rate for net delay");
|
|
|
|
C10_DEFINE_int64(
|
|
aiBench_netFollowupSampleRate,
|
|
0,
|
|
"One in N sampling rate for net delay");
|
|
|
|
C10_DEFINE_int64(
|
|
aiBench_netFollowupSampleCount,
|
|
0,
|
|
"control the following c logs");
|
|
|
|
C10_DEFINE_int64(
|
|
aiBench_operatorNetSampleRatio,
|
|
0,
|
|
"One in N sampling rate for operator delay");
|
|
|
|
C10_DEFINE_int64(
|
|
aiBench_skipIters,
|
|
0,
|
|
"skip the first N iterations of the net run");
|
|
#endif
|
|
|
|
namespace caffe2 {
|
|
namespace {
|
|
|
|
bool registerGlobalPerfNetObserverCreator(int* /*pargc*/, char*** /*pargv*/) {
|
|
AddGlobalNetObserverCreator([](NetBase* subject) {
|
|
return std::make_unique<PerfNetObserver>(subject);
|
|
});
|
|
|
|
#if !defined(C10_MOBILE)
|
|
// for aibench usage
|
|
caffe2::ObserverConfig::setReporter(
|
|
std::make_unique<caffe2::NetObserverReporterPrint>());
|
|
|
|
caffe2::ObserverConfig::initSampleRate(
|
|
FLAGS_aiBench_netInitSampleRate,
|
|
FLAGS_aiBench_netFollowupSampleRate,
|
|
FLAGS_aiBench_netFollowupSampleCount,
|
|
FLAGS_aiBench_operatorNetSampleRatio,
|
|
FLAGS_aiBench_skipIters);
|
|
#endif
|
|
|
|
return true;
|
|
}
|
|
} // namespace
|
|
|
|
#ifdef _WIN32
|
|
double getTicksPerMillisecond() {
|
|
static LARGE_INTEGER ticks_per_sec;
|
|
if (!ticks_per_sec.QuadPart) {
|
|
QueryPerformanceFrequency(&ticks_per_sec);
|
|
if (!ticks_per_sec.QuadPart) {
|
|
return 0.0;
|
|
}
|
|
}
|
|
|
|
return static_cast<double>(ticks_per_sec.QuadPart) / 1000.0;
|
|
}
|
|
#elif !defined _APPLE
|
|
double getClockTimeMilliseconds(clockid_t clk_id) {
|
|
int result;
|
|
struct timespec tp;
|
|
result = clock_gettime(clk_id, &tp);
|
|
if (result == -1) {
|
|
return 0.0;
|
|
} else {
|
|
return tp.tv_sec * 1000.0 + tp.tv_nsec / 1000000.0;
|
|
}
|
|
}
|
|
#endif
|
|
|
|
double getWallClockTimeMilliseconds() {
|
|
#ifdef _WIN32
|
|
double ticks_per_ms = getTicksPerMillisecond();
|
|
if (ticks_per_ms) {
|
|
LARGE_INTEGER ticks;
|
|
if (QueryPerformanceCounter(&ticks)) {
|
|
return static_cast<double>(ticks.QuadPart) / ticks_per_ms;
|
|
}
|
|
}
|
|
|
|
return 0.0;
|
|
#elif defined _APPLE
|
|
static mach_timebase_info_data_t info;
|
|
if (info.denom == 0) {
|
|
mach_timebase_info(&info);
|
|
}
|
|
|
|
uint64_t now = mach_absolute_time();
|
|
now = now * info.numer / info.denom; // convert to nanoseconds
|
|
return now / 1000000.0;
|
|
#else
|
|
return getClockTimeMilliseconds(CLOCK_MONOTONIC);
|
|
#endif
|
|
}
|
|
|
|
double getCpuTimeMilliseconds() {
|
|
#ifdef _WIN32
|
|
FILETIME creation_time;
|
|
FILETIME exit_time;
|
|
FILETIME kernel_time;
|
|
FILETIME user_time;
|
|
if (GetProcessTimes(
|
|
GetCurrentProcess(),
|
|
&creation_time,
|
|
&exit_time,
|
|
&kernel_time,
|
|
&user_time)) {
|
|
ULARGE_INTEGER kernel;
|
|
ULARGE_INTEGER user;
|
|
kernel.HighPart = kernel_time.dwHighDateTime;
|
|
kernel.LowPart = kernel_time.dwLowDateTime;
|
|
user.HighPart = user_time.dwHighDateTime;
|
|
user.LowPart = user_time.dwLowDateTime;
|
|
return (static_cast<double>(kernel.QuadPart) +
|
|
static_cast<double>(user.QuadPart)) / 10000.0;
|
|
}
|
|
|
|
return 0.0;
|
|
#elif defined _APPLE
|
|
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
|
|
struct rusage ru;
|
|
if (getrusage(RUSAGE_SELF, &ru)) {
|
|
return 0.0;
|
|
}
|
|
|
|
return ru.ru_utime.tv_sec * 1000.0
|
|
+ ru.ru_utime.tv_usec / 1000.0
|
|
+ ru.ru_stime.tv_sec * 1000.0
|
|
+ ru.ru_stime.tv_usec / 1000.0;
|
|
#else
|
|
return getClockTimeMilliseconds(CLOCK_PROCESS_CPUTIME_ID);
|
|
#endif
|
|
}
|
|
|
|
REGISTER_CAFFE2_EARLY_INIT_FUNCTION(
|
|
registerGlobalPerfNetObserverCreator,
|
|
®isterGlobalPerfNetObserverCreator,
|
|
"Caffe2 net global observer creator");
|
|
|
|
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
|
|
PerfNetObserver::PerfNetObserver(NetBase* subject_)
|
|
: NetObserver(subject_), numRuns_(0) {}
|
|
|
|
// NOLINTNEXTLINE(modernize-use-equals-default)
|
|
PerfNetObserver::~PerfNetObserver() {}
|
|
|
|
void PerfNetObserver::Start() {
|
|
static int visitCount = 0;
|
|
// Select whether to log the operator or the net.
|
|
// We have one sample rate for the entire app.
|
|
int netInitSampleRate = ObserverConfig::getNetInitSampleRate();
|
|
int netFollowupSampleRate = ObserverConfig::getNetFollowupSampleRate();
|
|
int netFollowupSampleCount = ObserverConfig::getNetFollowupSampleCount();
|
|
int operatorNetSampleRatio = ObserverConfig::getOpoeratorNetSampleRatio();
|
|
int skipIters = ObserverConfig::getSkipIters();
|
|
int sampleRate = visitCount > 0 ? netFollowupSampleRate : netInitSampleRate;
|
|
// NOLINTNEXTLINE(clang-analyzer-security.insecureAPI.rand)
|
|
if (skipIters <= static_cast<int>(numRuns_) && sampleRate > 0 && rand() % sampleRate == 0) {
|
|
visitCount++;
|
|
if (visitCount == netFollowupSampleCount) {
|
|
visitCount = 0;
|
|
}
|
|
// NOLINTNEXTLINE(clang-analyzer-security.insecureAPI.rand)
|
|
if (operatorNetSampleRatio > 0 && rand() % operatorNetSampleRatio == 0) {
|
|
logType_ = PerfNetObserver::OPERATOR_DELAY;
|
|
} else {
|
|
logType_ = PerfNetObserver::NET_DELAY;
|
|
}
|
|
} else {
|
|
logType_ = PerfNetObserver::NONE;
|
|
}
|
|
numRuns_++;
|
|
|
|
if (logType_ == PerfNetObserver::OPERATOR_DELAY) {
|
|
/* Always recreate new operator observers
|
|
whenever we measure operator delay */
|
|
const auto& operators = subject_->GetOperators();
|
|
for (auto* op : operators) {
|
|
observerMap_[op] = op->AttachObserver(
|
|
std::make_unique<PerfOperatorObserver>(op, this));
|
|
}
|
|
}
|
|
|
|
wallMilliseconds_ = getWallClockTimeMilliseconds();
|
|
cpuMilliseconds_ = getCpuTimeMilliseconds();
|
|
}
|
|
|
|
void PerfNetObserver::Stop() {
|
|
if (logType_ == PerfNetObserver::NONE) {
|
|
return;
|
|
}
|
|
std::map<std::string, PerformanceInformation> info;
|
|
PerformanceInformation net_perf;
|
|
net_perf.cpuMilliseconds =
|
|
getCpuTimeMilliseconds() - cpuMilliseconds_;
|
|
net_perf.latency =
|
|
getWallClockTimeMilliseconds() - wallMilliseconds_;
|
|
|
|
if (logType_ == PerfNetObserver::OPERATOR_DELAY) {
|
|
const auto& operators = subject_->GetOperators();
|
|
for (unsigned idx = 0; idx < operators.size(); ++idx) {
|
|
const auto* op = operators[idx];
|
|
auto name = getObserverName(op, static_cast<int>(idx));
|
|
PerformanceInformation p;
|
|
const PerfOperatorObserver* opObserver =
|
|
static_cast<const PerfOperatorObserver*>(observerMap_[op]);
|
|
p.latency = opObserver->getWallMilliseconds();
|
|
p.cpuMilliseconds = opObserver->getCpuMilliseconds();
|
|
p.engine = op->engine();
|
|
p.type = op->type();
|
|
p.tensor_shapes =
|
|
static_cast<const PerfOperatorObserver*>(observerMap_[op])
|
|
->getTensorShapes();
|
|
|
|
if (op->has_debug_def()) {
|
|
// NOLINTNEXTLINE(performance-for-range-copy)
|
|
for (auto arg : op->debug_def().arg()) {
|
|
p.args.emplace_back(arg);
|
|
}
|
|
}
|
|
|
|
info.insert({name, p});
|
|
}
|
|
|
|
/* clear all operator delay after use so that we don't spent time
|
|
collecting the operator delay info in later runs */
|
|
for (auto* op : operators) {
|
|
op->DetachObserver(observerMap_[op]);
|
|
}
|
|
observerMap_.clear();
|
|
}
|
|
info.insert({"NET_DELAY", net_perf});
|
|
ObserverConfig::getReporter()->report(subject_, info);
|
|
}
|
|
|
|
caffe2::string PerfNetObserver::getObserverName(const OperatorBase* op, int idx)
|
|
const {
|
|
string opType = op->has_debug_def() ? op->debug_def().type() : "NO_TYPE";
|
|
string displayName =
|
|
(op->has_debug_def() ? op->debug_def().name().size()
|
|
? op->debug_def().name()
|
|
: (op->debug_def().output_size() ? op->debug_def().output(0)
|
|
: "NO_OUTPUT")
|
|
: "NO_DEF");
|
|
caffe2::string name =
|
|
"ID_" + c10::to_string(idx) + "_" + opType + "_" + displayName;
|
|
return name;
|
|
}
|
|
|
|
PerfOperatorObserver::PerfOperatorObserver(
|
|
OperatorBase* op,
|
|
PerfNetObserver* netObserver)
|
|
: ObserverBase<OperatorBase>(op),
|
|
netObserver_(netObserver),
|
|
wallMilliseconds_(0),
|
|
cpuMilliseconds_(0) {
|
|
CAFFE_ENFORCE(netObserver_, "Observers can't operate outside of the net");
|
|
}
|
|
|
|
// NOLINTNEXTLINE(modernize-use-equals-default)
|
|
PerfOperatorObserver::~PerfOperatorObserver() {}
|
|
|
|
void PerfOperatorObserver::Start() {
|
|
wallMilliseconds_ = getWallClockTimeMilliseconds();
|
|
cpuMilliseconds_ = getCpuTimeMilliseconds();
|
|
}
|
|
|
|
void PerfOperatorObserver::Stop() {
|
|
/* Time from the start of the net minus the time spent on all other
|
|
operators is the time spent on this operator */
|
|
cpuMilliseconds_ =
|
|
getCpuTimeMilliseconds() - cpuMilliseconds_;
|
|
wallMilliseconds_ =
|
|
getWallClockTimeMilliseconds() - wallMilliseconds_;
|
|
tensor_shapes_ = subject_->InputTensorShapes();
|
|
}
|
|
|
|
double PerfOperatorObserver::getWallMilliseconds() const {
|
|
return wallMilliseconds_;
|
|
}
|
|
|
|
double PerfOperatorObserver::getCpuMilliseconds() const {
|
|
return cpuMilliseconds_;
|
|
}
|
|
|
|
std::vector<TensorShape> PerfOperatorObserver::getTensorShapes() const {
|
|
return tensor_shapes_;
|
|
}
|
|
|
|
} // namespace caffe2
|