mirror of
https://github.com/zebrajr/pytorch.git
synced 2025-12-08 07:39:33 +01:00
Enables the clang-tidy rule [`misc-use-internal-linkage`](https://clang.llvm.org/extra/clang-tidy/checks/misc/use-internal-linkage.html). This check was introduced in Clang-Tidy 18 and became available with the recent update to Clang-Tidy 19. It flags functions and variables that are used only within a single translation unit so that they can be marked `static`. As a result, undesired symbols are not leaked into other translation units, more link-time optimisations are possible, and the resulting binaries may be smaller. Most of the detected violations were fixed by adding `static`. In other cases the symbols were in fact consumed by other files, so their declaring headers were included instead. Some remaining declarations were wrong and have been fixed. Pull Request resolved: https://github.com/pytorch/pytorch/pull/148948 Approved by: https://github.com/Skylion007
195 lines
4.9 KiB
C++
195 lines
4.9 KiB
C++
#include <unordered_map>
|
|
#include <unordered_set>
|
|
|
|
#include <c10/util/error.h>
|
|
#include <torch/csrc/profiler/perf-inl.h>
|
|
#include <torch/csrc/profiler/perf.h>
|
|
|
|
namespace torch::profiler::impl::linux_perf {
|
|
|
|
#if defined(__ANDROID__) || defined(__linux__)
|
|
|
|
/*
|
|
* PerfEvent
|
|
* ---------
|
|
*/
|
|
|
|
/*
|
|
* Syscall wrapper for perf_event_open(2)
|
|
*/
|
|
inline static long perf_event_open(
|
|
struct perf_event_attr* hw_event,
|
|
pid_t pid,
|
|
int cpu,
|
|
int group_fd,
|
|
unsigned long flags) {
|
|
return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags);
|
|
}
|
|
|
|
// TODO sync with Kineto level abstract events in profiler/events.h
|
|
static const std::unordered_map<
|
|
std::string,
|
|
std::pair<perf_type_id, /* perf event type */ uint32_t>>
|
|
EventTable{
|
|
{"cycles",
|
|
std::make_pair(PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES)},
|
|
{"instructions",
|
|
std::make_pair(PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS)},
|
|
|
|
// Non Standard events for testing
|
|
{"pagefaults",
|
|
std::make_pair(PERF_TYPE_SOFTWARE, PERF_COUNT_SW_PAGE_FAULTS)},
|
|
{"backend-stall-cycles",
|
|
std::make_pair(
|
|
PERF_TYPE_HARDWARE,
|
|
PERF_COUNT_HW_STALLED_CYCLES_BACKEND)},
|
|
{"frontend-stall-cycles",
|
|
std::make_pair(
|
|
PERF_TYPE_HARDWARE,
|
|
PERF_COUNT_HW_STALLED_CYCLES_FRONTEND)}};
|
|
|
|
// Closes the event's file descriptor if one is held, then marks it invalid.
PerfEvent::~PerfEvent() {
  const bool has_open_fd = fd_ >= 0;
  if (has_open_fd) {
    close(fd_);
  }
  // Poison the descriptor so a stale value is never mistaken for a live fd.
  fd_ = -1;
}
|
|
|
|
void PerfEvent::Init() {
|
|
TORCH_CHECK(!name_.empty(), "Invalid profiler event name");
|
|
|
|
auto const it = EventTable.find(name_);
|
|
if (it == EventTable.end()) {
|
|
TORCH_CHECK(false, "Unsupported profiler event name: ", name_);
|
|
}
|
|
|
|
struct perf_event_attr attr {};
|
|
|
|
attr.size = sizeof(perf_event_attr);
|
|
attr.type = it->second.first;
|
|
attr.config = it->second.second;
|
|
attr.disabled = 1;
|
|
attr.inherit = 1;
|
|
attr.exclude_kernel = 1; // TBD
|
|
attr.exclude_hv = 1;
|
|
/*
|
|
* These can be used to calculate estimated totals if the PMU is overcommitted
|
|
* and multiplexing is happening
|
|
*/
|
|
attr.read_format =
|
|
PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING;
|
|
|
|
pid_t pid = getpid(); // this pid
|
|
int cpu = -1; // all cpus
|
|
int group_fd = -1;
|
|
unsigned long flags = 0;
|
|
|
|
fd_ = static_cast<int>(perf_event_open(&attr, pid, cpu, group_fd, flags));
|
|
if (fd_ == -1) {
|
|
TORCH_CHECK(
|
|
false,
|
|
"perf_event_open() failed, error: ",
|
|
c10::utils::str_error(errno));
|
|
}
|
|
Reset();
|
|
}
|
|
|
|
uint64_t PerfEvent::ReadCounter() const {
|
|
PerfCounter counter{};
|
|
long n = read(fd_, &counter, sizeof(PerfCounter));
|
|
TORCH_CHECK(
|
|
n == sizeof(counter),
|
|
"Read failed for Perf event fd, event : ",
|
|
name_,
|
|
", error: ",
|
|
c10::utils::str_error(errno));
|
|
TORCH_CHECK(
|
|
counter.time_enabled == counter.time_running,
|
|
"Hardware performance counter time multiplexing is not handled yet",
|
|
", name: ",
|
|
name_,
|
|
", enabled: ",
|
|
counter.time_enabled,
|
|
", running: ",
|
|
counter.time_running);
|
|
return counter.value;
|
|
}
|
|
|
|
#else /* __ANDROID__ || __linux__ */
|
|
/*
|
|
* Shim class for unsupported platforms - this will always return 0 counter
|
|
* value
|
|
*/
|
|
|
|
// Shim: there is no descriptor to release on unsupported platforms.
PerfEvent::~PerfEvent() = default;
|
|
|
|
// Shim: nothing to set up on platforms without perf event support.
void PerfEvent::Init() {}
|
|
|
|
// Shim: unsupported platforms have no counter to read, so always report 0.
uint64_t PerfEvent::ReadCounter() const {
  return uint64_t{0};
}
|
|
|
|
#endif /* __ANDROID__ || __linux__ */
|
|
|
|
/*
|
|
* PerfProfiler
|
|
* ------------
|
|
*/
|
|
|
|
void PerfProfiler::Configure(std::vector<std::string>& event_names) {
|
|
TORCH_CHECK(
|
|
event_names.size() <= MAX_EVENTS,
|
|
"Too many events to configure, configured: ",
|
|
event_names.size(),
|
|
", max allowed:",
|
|
MAX_EVENTS);
|
|
std::unordered_set<std::string> s(event_names.begin(), event_names.end());
|
|
TORCH_CHECK(
|
|
s.size() == event_names.size(), "Duplicate event names are not allowed!")
|
|
for (auto name : event_names) {
|
|
events_.emplace_back(name);
|
|
events_.back().Init();
|
|
}
|
|
|
|
// TODO
|
|
// Reset pthreadpool here to make sure we can attach to new children
|
|
// threads
|
|
}
|
|
|
|
void PerfProfiler::Enable() {
|
|
if (!start_values_.empty()) {
|
|
StopCounting();
|
|
}
|
|
|
|
start_values_.emplace(events_.size(), 0);
|
|
|
|
auto& sv = start_values_.top();
|
|
for (unsigned i = 0; i < events_.size(); ++i) {
|
|
sv[i] = events_[i].ReadCounter();
|
|
}
|
|
StartCounting();
|
|
}
|
|
|
|
/*
 * Ends the most recently started measurement and reports its counter deltas.
 *
 * @param vals output container; must already hold exactly one element per
 *        configured event. Element i receives CalcDelta(start, now) for
 *        event i.
 *
 * Counting is stopped before the arguments are validated. Fails a
 * TORCH_CHECK when vals has the wrong size or when no measurement is active.
 * If an enclosing measurement remains after popping this one, counting is
 * resumed for it.
 */
void PerfProfiler::Disable(perf_counters_t& vals) {
  StopCounting();

  TORCH_CHECK(
      vals.size() == events_.size(),
      "Can not fit all perf counters in the supplied container");
  TORCH_CHECK(
      !start_values_.empty(), "PerfProfiler must be enabled before disabling");

  /* Always connecting this disable event to the last enable event i.e. using
   * whatever is on the top of the start counter value stack. */
  auto& baseline = start_values_.top();
  for (size_t idx = 0; idx < events_.size(); ++idx) {
    const auto current = events_[idx].ReadCounter();
    vals[idx] = CalcDelta(baseline[idx], current);
  }
  start_values_.pop();

  // Restore it for a parent
  const bool parent_active = !start_values_.empty();
  if (parent_active) {
    StartCounting();
  }
}
|
|
} // namespace torch::profiler::impl::linux_perf
|