mirror of
https://github.com/zebrajr/pytorch.git
synced 2025-12-07 12:21:27 +01:00
Summary: https://github.com/pytorch/pytorch/issues/109263 Show the shape of tensorlist when the length is < 30. Test Plan: {F1097707985} and unit tests Reviewed By: davidberard98 Differential Revision: D49351902 Pull Request resolved: https://github.com/pytorch/pytorch/pull/109751 Approved by: https://github.com/davidberard98
272 lines
8.1 KiB
C++
272 lines
8.1 KiB
C++
#pragma once
|
|
|
|
#include <array>
#include <chrono>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <list>
#include <memory>
#include <string>
#include <tuple>
#include <type_traits>
#include <unordered_map>
#include <utility>
#include <variant>
#include <vector>

#include <ATen/record_function.h>
#include <c10/macros/Macros.h>
#include <c10/util/Optional.h>
#include <c10/util/hash.h>
#include <torch/csrc/Export.h>
#include <torch/csrc/jit/frontend/source_range.h>

#ifndef _WIN32
#include <ctime>
#endif
#if defined(C10_IOS) && defined(C10_MOBILE)
#include <sys/time.h> // for gettimeofday()
#endif
|
|
|
|
// Decide whether the `__rdtsc` intrinsic may be used. `C10_RDTSC` is defined
// tentatively for every x86 target and then withdrawn again when the compiler
// is a CUDA/HIP compiler (and `_MSC_VER` is not set) or is not recognised.
#if defined(__i386__) || defined(__x86_64__) || defined(__amd64__)
#define C10_RDTSC
#if defined(_MSC_VER)
#include <intrin.h>
#elif defined(__CUDACC__) || defined(__HIPCC__)
#undef C10_RDTSC
#elif defined(__clang__)
// NB: The Clang check has to come before the GCC check, because Clang will
// also define `__GNUC__`. Nothing to include here: `__rdtsc` is available by
// default.
#elif defined(__GNUC__)
#include <x86intrin.h>
#else
#undef C10_RDTSC
#endif
#endif
|
|
|
|
// TODO: replace with pytorch/rfcs#43 when it is ready.
//
// SOFT_ASSERT(cond, ...) is an expression of type `bool`: `true` when `cond`
// holds, `false` otherwise. On failure it first reports through
// `logSoftAssert` (function name, file, line, the stringified condition and
// the message built by `::c10::str`). It then either hands over to
// TORCH_INTERNAL_ASSERT, when `softAssertRaises()` is true, or emits a
// TORCH_WARN. The body is an immediately invoked lambda that captures by
// reference.
//
// NOTE: `cond` appears twice in the expansion (in the check itself and again
// as the TORCH_INTERNAL_ASSERT argument), so it should be free of side
// effects.
#define SOFT_ASSERT(cond, ...)                          \
  [&]() -> bool {                                       \
    if (C10_UNLIKELY(!(cond))) {                        \
      torch::profiler::impl::logSoftAssert(             \
          __func__,                                     \
          __FILE__,                                     \
          static_cast<uint32_t>(__LINE__),              \
          #cond,                                        \
          ::c10::str(__VA_ARGS__));                     \
      if (!torch::profiler::impl::softAssertRaises()) { \
        TORCH_WARN(__VA_ARGS__);                        \
      } else {                                          \
        TORCH_INTERNAL_ASSERT(cond, __VA_ARGS__);       \
      }                                                 \
      return false;                                     \
    }                                                   \
    return true;                                        \
  }()
|
|
|
|
namespace torch {
|
|
namespace profiler {
|
|
namespace impl {
|
|
// Queried by SOFT_ASSERT after a failed check: when it returns true the macro
// goes on to TORCH_INTERNAL_ASSERT, otherwise it only emits a TORCH_WARN.
TORCH_API bool softAssertRaises();

// Presumably configures the value reported by `softAssertRaises()`.
// NOTE(review): the meaning of an empty optional is decided by the
// implementation, which is not visible from this header -- confirm there.
TORCH_API void setSoftAssertRaises(c10::optional<bool> value);

// Reporting hook invoked by SOFT_ASSERT on failure.
//   func / file / line  location of the failing SOFT_ASSERT
//   cond                the asserted expression, stringified
//   args                the user message
// Overloads exist for `const char*`, `CompileTimeEmptyString` and
// `std::string` messages (presumably the possible results of `::c10::str`).
TORCH_API void logSoftAssert(
    const char* func,
    const char* file,
    uint32_t line,
    const char* cond,
    const char* args);

// Empty-message overload: converts the placeholder to a C string and forwards
// to the `const char*` overload above.
TORCH_API inline void logSoftAssert(
    const char* func,
    const char* file,
    uint32_t line,
    const char* cond,
    ::c10::detail::CompileTimeEmptyString args) {
  // Named cast instead of a C-style cast; it selects the same user-defined
  // conversion to `const char*`.
  logSoftAssert(func, file, line, cond, static_cast<const char*>(args));
}

TORCH_API void logSoftAssert(
    const char* func,
    const char* file,
    uint32_t line,
    const char* cond,
    const std::string& args);
|
|
|
|
// Integer type of the timestamps returned by `getTime()` and
// `getTimeSinceEpoch()`. NOTE: within this namespace the alias is found
// before the C library's `::time_t`.
using time_t = int64_t;

// A clock that is guaranteed to be steady: `high_resolution_clock` when the
// standard library marks it steady, `steady_clock` otherwise.
using steady_clock_t = std::conditional_t<
    std::chrono::high_resolution_clock::is_steady,
    std::chrono::high_resolution_clock,
    std::chrono::steady_clock>;

// Either a single list of dimensions or a list of such lists.
using shape =
    std::variant<std::vector<int64_t>, std::vector<std::vector<int64_t>>>;

// Length limit applied when displaying tensor-list shapes; how it is compared
// is decided by the implementation. `inline` makes this one entity across all
// translation units that include this header instead of one internal-linkage
// copy per translation unit.
inline constexpr int TENSOR_LIST_DISPLAY_LENGTH_LIMIT = 30;
|
|
|
|
// Current `std::chrono::system_clock` reading, expressed as the number of
// nanoseconds since that clock's epoch.
inline time_t getTimeSinceEpoch() {
  using std::chrono::duration_cast;
  using std::chrono::nanoseconds;

  const auto since_epoch = std::chrono::system_clock::now().time_since_epoch();
  const nanoseconds as_ns = duration_cast<nanoseconds>(since_epoch);
  return as_ns.count();
}
|
|
|
|
// Returns a timestamp in nanoseconds.
//
// The clock used depends on the platform:
//   * iOS mobile builds: `gettimeofday` (microsecond readings scaled to ns).
//   * Windows / Mach:    `steady_clock_t`.
//   * everything else:   `clock_gettime` with CLOCK_REALTIME, or with
//                        CLOCK_MONOTONIC when `allow_monotonic` is true.
//
// `allow_monotonic` is only consulted on the `clock_gettime` path; the other
// two branches ignore it, hence `[[maybe_unused]]`.
inline time_t getTime([[maybe_unused]] bool allow_monotonic = false) {
#if defined(C10_IOS) && defined(C10_MOBILE)
  // clock_gettime is only available on iOS 10.0 or newer. Unlike OS X, iOS
  // can't rely on CLOCK_REALTIME, as it is defined no matter if clock_gettime
  // is implemented or not
  constexpr time_t kNanosPerSecond = 1000000000;
  constexpr time_t kNanosPerMicro = 1000;
  struct timeval now {};
  gettimeofday(&now, nullptr);
  return static_cast<time_t>(now.tv_sec) * kNanosPerSecond +
      static_cast<time_t>(now.tv_usec) * kNanosPerMicro;
#elif defined(_WIN32) || defined(__MACH__)
  return std::chrono::duration_cast<std::chrono::nanoseconds>(
             steady_clock_t::now().time_since_epoch())
      .count();
#else
  // clock_gettime is *much* faster than std::chrono implementation on Linux
  constexpr time_t kNanosPerSecond = 1000000000;
  struct timespec t {};
  const auto mode = allow_monotonic ? CLOCK_MONOTONIC : CLOCK_REALTIME;
  clock_gettime(mode, &t);
  return static_cast<time_t>(t.tv_sec) * kNanosPerSecond +
      static_cast<time_t>(t.tv_nsec);
#endif
}
|
|
|
|
// We often do not need to capture true wall times. If a fast mechanism such
|
|
// as TSC is available we can use that instead and convert back to epoch time
|
|
// during post processing. This greatly reduce the clock's contribution to
|
|
// profiling.
|
|
// http://btorpey.github.io/blog/2014/02/18/clock-sources-in-linux/
|
|
// https://quick-bench.com/q/r8opkkGZSJMu9wM_XTbDouq-0Io
|
|
// TODO: We should use
|
|
// `https://github.com/google/benchmark/blob/main/src/cycleclock.h`
|
|
inline auto getApproximateTime() {
|
|
#if defined(C10_RDTSC)
|
|
return static_cast<uint64_t>(__rdtsc());
|
|
#else
|
|
return getTime();
|
|
#endif
|
|
}
|
|
|
|
using approx_time_t = decltype(getApproximateTime());
|
|
static_assert(
|
|
std::is_same<approx_time_t, int64_t>::value ||
|
|
std::is_same<approx_time_t, uint64_t>::value,
|
|
"Expected either int64_t (`getTime`) or uint64_t (some TSC reads).");
|
|
|
|
// Convert `getCount` results to Nanoseconds since unix epoch.
|
|
class ApproximateClockToUnixTimeConverter final {
|
|
public:
|
|
ApproximateClockToUnixTimeConverter();
|
|
std::function<time_t(approx_time_t)> makeConverter();
|
|
|
|
struct UnixAndApproximateTimePair {
|
|
time_t t_;
|
|
approx_time_t approx_t_;
|
|
};
|
|
static UnixAndApproximateTimePair measurePair();
|
|
|
|
private:
|
|
static constexpr size_t replicates = 1001;
|
|
using time_pairs = std::array<UnixAndApproximateTimePair, replicates>;
|
|
time_pairs measurePairs();
|
|
|
|
time_pairs start_times_;
|
|
};
|
|
|
|
std::string getNvtxStr(
|
|
const char* name,
|
|
int64_t sequence_nr,
|
|
const std::vector<std::vector<int64_t>>& shapes,
|
|
at::RecordFunctionHandle op_id = 0,
|
|
const std::list<std::pair<at::RecordFunctionHandle, int>>& input_op_ids =
|
|
{});
|
|
|
|
// One call-stack frame described by file name, line number and function
// name; `prepareCallstack` returns a vector of these.
struct TORCH_API FileLineFunc {
  std::string filename;
  // Zero-initialised so that a default-constructed frame never carries an
  // indeterminate line number. Aggregate initialisation still works.
  size_t line{0};
  std::string funcname;
};
|
|
|
|
// Call-stack helpers (declarations only; behaviour lives in the
// implementation file).

// Turns JIT stack entries into `FileLineFunc` frames.
TORCH_API std::vector<FileLineFunc> prepareCallstack(
    const std::vector<jit::StackEntry>& cs);

// Produces one string per frame.
TORCH_API std::vector<std::string> callstackStr(
    const std::vector<FileLineFunc>& cs);

// Produces a single string from `stacks` and the delimiter `delim`
// (presumably a join -- confirm in the implementation).
TORCH_API std::string stacksToStr(
    const std::vector<std::string>& stacks,
    const char* delim);
|
|
// Input-description helpers (declarations only; exact output formats are
// defined by the implementation file).

// Returns one dimension list per recorded input of `fn`.
// NOTE(review): `flatten_list_enabled` presumably controls whether list
// inputs are expanded into their elements -- confirm in the implementation.
TORCH_API std::vector<std::vector<int64_t>> inputSizes(
    const at::RecordFunction& fn,
    bool flatten_list_enabled = false);

// String renderings of shapes, string lists, input-op ids and IValue lists.
TORCH_API std::string variantShapesToStr(const std::vector<shape>& shapes);
TORCH_API std::string shapesToStr(
    const std::vector<std::vector<int64_t>>& shapes);
TORCH_API std::string strListToStr(const std::vector<std::string>& types);
TORCH_API std::string inputOpIdsToStr(
    const std::list<std::pair<at::RecordFunctionHandle, int>>& input_op_ids);
TORCH_API std::string ivalueListToStr(const std::vector<c10::IValue>& list);

// Returns one string per recorded input of `fn`.
TORCH_API std::vector<std::string> inputTypes(const at::RecordFunction& fn);
|
|
|
|
std::unordered_map<std::string, c10::IValue> TORCH_API
|
|
saveExtraArgs(const at::RecordFunction& fn);
|
|
|
|
uint64_t TORCH_API computeFlops(
|
|
const std::string& op_name,
|
|
const std::unordered_map<std::string, c10::IValue>& extra_args);
|
|
|
|
std::string shapeToStr(const std::vector<int64_t>& shape);
|
|
|
|
template <typename T>
|
|
class TORCH_API GlobalStateManager {
|
|
public:
|
|
static GlobalStateManager& singleton() {
|
|
static GlobalStateManager singleton_;
|
|
return singleton_;
|
|
}
|
|
|
|
static void push(std::shared_ptr<T>&& state) {
|
|
if (singleton().state_) {
|
|
LOG(WARNING) << "GlobalStatePtr already exists!";
|
|
} else {
|
|
singleton().state_ = std::move(state);
|
|
}
|
|
}
|
|
|
|
static auto* get() {
|
|
return singleton().state_.get();
|
|
}
|
|
|
|
static std::shared_ptr<T> pop() {
|
|
auto out = singleton().state_;
|
|
singleton().state_.reset();
|
|
return out;
|
|
}
|
|
|
|
private:
|
|
GlobalStateManager() = default;
|
|
|
|
std::shared_ptr<T> state_;
|
|
};
|
|
|
|
struct HashCombine {
|
|
template <typename T0, typename T1>
|
|
size_t operator()(const std::pair<T0, T1>& i) {
|
|
return c10::get_hash((*this)(i.first), (*this)(i.second));
|
|
}
|
|
|
|
template <typename... Args>
|
|
size_t operator()(const std::tuple<Args...>& i) {
|
|
return c10::get_hash(i);
|
|
}
|
|
|
|
template <typename T>
|
|
size_t operator()(const T& i) {
|
|
return c10::get_hash(i);
|
|
}
|
|
};
|
|
|
|
} // namespace impl
|
|
} // namespace profiler
|
|
} // namespace torch
|
|
|
|
namespace torch {
|
|
namespace autograd {
|
|
namespace profiler {
|
|
using torch::profiler::impl::computeFlops;
|
|
using torch::profiler::impl::getTime;
|
|
} // namespace profiler
|
|
} // namespace autograd
|
|
} // namespace torch
|