mirror of
https://github.com/zebrajr/pytorch.git
synced 2025-12-07 00:21:07 +01:00
torch.profiler.record_function is relatively slow; for example, in some benchmarks I was running, x.view_as(x) was ~2us, and ~16-17us when wrapped in a record_function context. The reasons for this are: dispatcher overhead from going through an op (the main source of overhead), python binding / python conversion overhead, and some overhead from the context manager.
This new implementation is faster, but it won't work with torchscript. Based on the benchmarks I was running, it adds 0.5-0.7us overhead per call when the profiler is turned off. To use it, you can just:
```python
with torch._C._profiler_manual._RecordFunctionFast("title"):
    torch.add(x, y)
```
It implements a context manager in python which directly calls the record_function utilities, instead of calling through an op.
* The context manager is implemented directly in python because the overhead from calling a python function seems non-negligible
* All the record_function calls, python object conversions are guarded on checks for whether the profiler is enabled or not. It seems like this saves a few hundred nanoseconds.
For more details about the experiments I ran to choose this implementation, see [my record_functions experiments branch](https://github.com/pytorch/pytorch/compare/main...davidberard98:pytorch:record-function-fast-experiments?expand=1).
This also adds a `torch.autograd.profiler._is_profiler_enabled` global variable that can be used to check whether a profiler is currently enabled. It's useful for further reducing the overhead, like this:
```python
if torch.autograd.profiler._is_profiler_enabled:
    with torch._C._profiler_manual._RecordFunctionFast("title"):
        torch.add(x, y)
else:
    torch.add(x, y)
```
On BERT_pytorch (CPU-bound model), if we add a record_function inside CachedAutotuning.run:
* Naive torch.profiler.record_function() is a ~30% slowdown
* Always wrapping with RecordFunctionFast causes a regression of ~2-4%.
* Guarding with an if statement - any regression is within noise
**Selected benchmark results**: these come from a 2.20GHz machine, GPU build but only running CPU ops; running `x.view_as(x)`, with various record_functions applied (with profiling turned off). For more detailed results see "record_functions experiments branch" linked above (those results are on a different machine, but show the same patterns). Note that the results are somewhat noisy, assume 0.05-0.1us variations
```
Baseline:: 1.7825262546539307 us # Just running x.view_as(x)
profiled_basic:: 13.600390434265137 us # torch.profiler.record_function(x) + view_as
precompute_manual_cm_rf:: 2.317216396331787 us # torch._C._profiler_manual._RecordFunctionFast(), if the context is pre-constructed + view_as
guard_manual_cm_rf:: 1.7994389533996582 us # guard with _is_profiler_enabled + view_as
```
Differential Revision: [D48421198](https://our.internmc.facebook.com/intern/diff/D48421198)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/107195
Approved by: https://github.com/albanD, https://github.com/aaronenyeshi
240 lines
6.1 KiB
Python
240 lines
6.1 KiB
Python
from enum import Enum
|
|
from typing import Any, Dict, List, Literal, Optional, Tuple, Union
|
|
|
|
from torch._C import device, dtype, layout
|
|
from typing_extensions import TypeAlias
|
|
|
|
# defined in torch/csrc/profiler/python/init.cpp
|
|
|
|
class RecordScope(Enum):
    """Scope tag attached to a profiler record-function event.

    Values are opaque stubs (``...``); the real members are bound in
    torch/csrc/profiler/python/init.cpp (see header comment above).
    """

    FUNCTION = ...
    BACKWARD_FUNCTION = ...
    TORCHSCRIPT_FUNCTION = ...
    KERNEL_FUNCTION_DTYPE = ...
    CUSTOM_CLASS = ...
    BUILD_FEATURE = ...
    LITE_INTERPRETER = ...
    USER_SCOPE = ...
    STATIC_RUNTIME_OP = ...
    STATIC_RUNTIME_MODEL = ...
|
|
|
|
class ProfilerState(Enum):
    """Profiler backend/state selector passed via ``ProfilerConfig``.

    Values are opaque stubs (``...``); bound in C++ (see header comment).
    """

    Disable = ...
    CPU = ...
    CUDA = ...
    NVTX = ...
    ITT = ...
    KINETO = ...
    KINETO_GPU_FALLBACK = ...
    KINETO_PRIVATEUSE1_FALLBACK = ...
    KINETO_PRIVATEUSE1 = ...
|
|
|
|
class ActiveProfilerType(Enum):
    """Which profiler implementation is currently active, if any.

    Values are opaque stubs (``...``); bound in C++ (see header comment).
    """

    NONE = ...
    LEGACY = ...
    KINETO = ...
    NVTX = ...
    ITT = ...
|
|
|
|
class ProfilerActivity(Enum):
    """Device activity types the profiler can record.

    Values are opaque stubs (``...``); bound in C++ (see header comment).
    """

    CPU = ...
    CUDA = ...
    MTIA = ...
    PrivateUse1 = ...
|
|
|
|
class _EventType(Enum):
|
|
TorchOp = ...
|
|
Backend = ...
|
|
Allocation = ...
|
|
OutOfMemory = ...
|
|
PyCall = ...
|
|
PyCCall = ...
|
|
Kineto = ...
|
|
|
|
class _ExperimentalConfig:
|
|
def __init__(
|
|
self,
|
|
profiler_metrics: List[str] = ...,
|
|
profiler_measure_per_kernel: bool = ...,
|
|
verbose: bool = ...,
|
|
performance_events: List[str] = ...,
|
|
privateuse1_config: Dict = ...,
|
|
enable_cuda_sync_events: bool = ...,
|
|
) -> None: ...
|
|
|
|
class ProfilerConfig:
    """Bundle of options that configures a profiler run.

    All arguments are positional and required (no C++ defaults exposed here);
    see ``_ExperimentalConfig`` for the unstable knobs.
    """

    def __init__(
        self,
        state: ProfilerState,
        report_input_shapes: bool,
        profile_memory: bool,
        with_stack: bool,
        with_flops: bool,
        with_modules: bool,
        experimental_config: _ExperimentalConfig,
    ) -> None: ...
|
|
|
|
class _ProfilerEvent:
    """One node in the profiler's event tree.

    Carries thread id, start/end timing, child events, and a type-specific
    payload exposed both untyped (``extra_fields``) and as a tagged pair
    (``typed``).
    """

    start_tid: int
    start_time_ns: int
    children: List[_ProfilerEvent]

    # TODO(robieta): remove in favor of `self.typed`
    extra_fields: Union[
        _ExtraFields_TorchOp,
        _ExtraFields_Backend,
        _ExtraFields_Allocation,
        _ExtraFields_OutOfMemory,
        _ExtraFields_PyCall,
        _ExtraFields_PyCCall,
        _ExtraFields_Kineto,
    ]

    @property
    def typed(
        self,
    ) -> Union[
        Tuple[Literal[_EventType.TorchOp], _ExtraFields_TorchOp],
        Tuple[Literal[_EventType.Backend], _ExtraFields_Backend],
        Tuple[Literal[_EventType.Allocation], _ExtraFields_Allocation],
        Tuple[Literal[_EventType.OutOfMemory], _ExtraFields_OutOfMemory],
        Tuple[Literal[_EventType.PyCall], _ExtraFields_PyCall],
        Tuple[Literal[_EventType.PyCCall], _ExtraFields_PyCCall],
        Tuple[Literal[_EventType.Kineto], _ExtraFields_Kineto],
    ]: ...
    @property
    def name(self) -> str: ...
    @property
    def tag(self) -> _EventType: ...
    @property
    def id(self) -> int: ...
    @property
    def parent(self) -> Optional[_ProfilerEvent]: ...
    @property
    def correlation_id(self) -> int: ...
    @property
    def end_time_ns(self) -> int: ...
    @property
    def duration_time_ns(self) -> int: ...
|
|
|
|
class _TensorMetadata:
|
|
impl_ptr: Optional[int]
|
|
storage_data_ptr: Optional[int]
|
|
id: Optional[int]
|
|
|
|
@property
|
|
def allocation_id(self) -> Optional[int]: ...
|
|
@property
|
|
def layout(self) -> layout: ...
|
|
@property
|
|
def device(self) -> device: ...
|
|
@property
|
|
def dtype(self) -> dtype: ...
|
|
@property
|
|
def sizes(self) -> List[int]: ...
|
|
@property
|
|
def strides(self) -> List[int]: ...
|
|
|
|
# Aliases for the value kinds a recorded op input can take.
Scalar: TypeAlias = Union[int, float, bool, complex]
Input: TypeAlias = Optional[Union[_TensorMetadata, List[_TensorMetadata], Scalar]]
|
|
|
|
class _ExtraFields_TorchOp:
    """Payload for ``_EventType.TorchOp`` events."""

    name: str
    sequence_number: int
    allow_tf32_cublas: bool

    @property
    def inputs(self) -> List[Input]: ...
    @property
    def scope(self) -> RecordScope: ...
|
|
|
|
class _ExtraFields_Backend: ...
|
|
|
|
class _ExtraFields_Allocation:
|
|
ptr: int
|
|
id: Optional[int]
|
|
alloc_size: int
|
|
total_allocated: int
|
|
total_reserved: int
|
|
|
|
@property
|
|
def allocation_id(self) -> Optional[int]: ...
|
|
@property
|
|
def device(self) -> device: ...
|
|
|
|
class _ExtraFields_OutOfMemory: ...
|
|
|
|
class _PyFrameState:
|
|
line_number: int
|
|
function_name: str
|
|
|
|
@property
|
|
def file_name(self) -> str: ...
|
|
|
|
class _NNModuleInfo:
    """Identity (pointers, class name) and parameters of an observed module."""

    @property
    def self_ptr(self) -> int: ...
    @property
    def cls_ptr(self) -> int: ...
    @property
    def cls_name(self) -> str: ...
    @property
    def parameters(
        self,
    ) -> List[Tuple[str, _TensorMetadata, Optional[_TensorMetadata]]]: ...
|
|
|
|
class _OptimizerInfo:
    """Per-parameter optimizer information captured by the profiler."""

    @property
    def parameters(
        self,
    ) -> List[
        Tuple[
            # Parameter
            _TensorMetadata,
            #
            # Gradient (if present during optimizer.step())
            Optional[_TensorMetadata],
            #
            # Optimizer state for Parameter as (name, tensor) pairs
            List[Tuple[str, _TensorMetadata]],
        ]
    ]: ...
|
|
|
|
class _ExtraFields_PyCCall:
    """Payload for ``_EventType.PyCCall`` events."""

    @property
    def caller(self) -> _PyFrameState: ...
|
|
|
|
class _ExtraFields_PyCall:
    """Payload for ``_EventType.PyCall`` events.

    ``module`` / ``optimizer`` are ``Optional`` — populated only for calls
    that correspond to those objects (presumably; not verifiable from this stub).
    """

    @property
    def callsite(self) -> _PyFrameState: ...
    @property
    def caller(self) -> _PyFrameState: ...
    @property
    def module(self) -> Optional[_NNModuleInfo]: ...
    @property
    def optimizer(self) -> Optional[_OptimizerInfo]: ...
|
|
|
|
class _ExtraFields_Kineto: ...
|
|
|
|
def _add_execution_trace_observer(output_file_path: str) -> bool: ...
|
|
def _remove_execution_trace_observer() -> None: ...
|
|
def _enable_execution_trace_observer() -> None: ...
|
|
def _disable_execution_trace_observer() -> None: ...
|
|
def _set_record_concrete_inputs_enabled_val(val: bool) -> None: ...
|
|
def _set_fwd_bwd_enabled_val(val: bool) -> None: ...
|
|
def _set_cuda_sync_enabled_val(val: bool) -> None: ...
|
|
|
|
class CapturedTraceback:
    """Opaque handle to a traceback captured by ``gather_traceback``."""
|
|
|
|
def gather_traceback(python: bool, script: bool, cpp: bool) -> CapturedTraceback:
    """Capture the current traceback; flags select Python / TorchScript / C++ frames."""
    ...
|
|
|
|
# The Dict has name, filename, line
def symbolize_tracebacks(
    to_symbolize: List[CapturedTraceback],
) -> List[List[Dict[str, str]]]:
    """Resolve captured tracebacks into per-frame dicts (keys: name, filename, line)."""
    ...
|
|
|
|
class _RecordFunctionFast:
|
|
def __init__(self, name: str) -> None: ...
|
|
def __enter__(self) -> None: ...
|
|
def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None: ...
|