Mirror of https://github.com/zebrajr/pytorch.git, synced 2025-12-06 12:20:52 +01:00

[BE] Remove bottleneck (#163210)

Some cleanup related to this RFC: https://github.com/pytorch/pytorch/issues/68742

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163210
Approved by: https://github.com/ezyang

This commit is contained in:
parent cfb8aec1a4
commit c43ccfbc2d

@@ -964,7 +964,6 @@ exclude_patterns = [
    'test/jit/**', # should be run through test/test_jit.py
    'test/ao/sparsity/**', # should be run through test/test_ao_sparsity.py
    'test/fx/**', # should be run through test/test_fx.py
    'test/bottleneck_test/**', # excluded by test/run_test.py
    'test/package/**', # excluded by test/run_test.py
    'test/distributed/argparse_util_test.py',
    'test/distributed/bin/test_script.py',

@@ -1410,8 +1409,6 @@ exclude_patterns = [
    'torch/utils/benchmark/utils/timer.py',
    'torch/utils/benchmark/utils/valgrind_wrapper/__init__.py',
    'torch/utils/benchmark/utils/valgrind_wrapper/timer_interface.py',
    'torch/utils/bottleneck/__init__.py',
    'torch/utils/bottleneck/__main__.py',
    'torch/utils/bundled_inputs.py',
    'torch/utils/checkpoint.py',
    'torch/utils/collect_env.py',

@@ -1,62 +0,0 @@
torch.utils.bottleneck
======================

.. automodule:: torch.utils.bottleneck
.. currentmodule:: torch.utils.bottleneck

`torch.utils.bottleneck` is a tool that can be used as an initial step for
debugging bottlenecks in your program. It summarizes runs of your script with
the Python profiler and PyTorch's autograd profiler.

Run it on the command line with

::

    python -m torch.utils.bottleneck /path/to/source/script.py [args]

where [args] are any number of arguments to `script.py`, or run
``python -m torch.utils.bottleneck -h`` for more usage instructions.
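
For example, a hypothetical invocation that forwards two flags to the profiled
script (``train.py``, ``--epochs`` and ``--batch-size`` are placeholder names,
not part of the tool) could look like::

    python -m torch.utils.bottleneck train.py --epochs 1 --batch-size 32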

.. warning::
    Because your script will be profiled, please ensure that it exits in a
    finite amount of time.

.. warning::
    Due to the asynchronous nature of CUDA kernels, when running against
    CUDA code, the cProfile output and CPU-mode autograd profilers may
    not show correct timings: the reported CPU time is the amount of time
    used to launch the kernels but does not include the time the kernel
    spent executing on a GPU unless the operation does a synchronize.
    Ops that do synchronize appear to be extremely expensive under regular
    CPU-mode profilers.
    In these cases where timings are incorrect, the CUDA-mode autograd profiler
    may be helpful.

.. note::
    To decide which (CPU-only-mode or CUDA-mode) autograd profiler output to
    look at, you should first check whether your script is CPU-bound
    ("CPU total time is much greater than CUDA total time").
    If it is CPU-bound, looking at the results of the CPU-mode autograd
    profiler will help. If, on the other hand, your script spends most of its
    time executing on the GPU, then it makes sense to start
    looking for responsible CUDA operators in the output of the CUDA-mode
    autograd profiler.

    Of course the reality is much more complicated, and your script might not be
    in one of those two extremes depending on the part of the model you're
    evaluating. If the profiler outputs don't help, you could try looking at
    the result of :func:`torch.autograd.profiler.emit_nvtx()` with ``nvprof``.
    However, please take into account that the NVTX overhead is very high and
    often gives a heavily skewed timeline. Similarly, ``Intel® VTune™ Profiler``
    helps to analyze performance on Intel platforms further with
    :func:`torch.autograd.profiler.emit_itt()`.
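
As a rough sketch of the ``emit_nvtx`` approach mentioned above (``model`` and
``inp`` are placeholders for your own module and input; the whole script would
then be run under ``nvprof``)::

    import torch

    # Placeholder workload; substitute your own model and data.
    model = torch.nn.Linear(10, 10).cuda()
    inp = torch.randn(8, 10, device="cuda")

    # Wraps each autograd op in an NVTX range that nvprof can display.
    with torch.autograd.profiler.emit_nvtx():
        model(inp).sum().backward()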

.. warning::
    If you are profiling CUDA code, the first profiler that ``bottleneck`` runs
    (cProfile) will include the CUDA startup time (CUDA buffer allocation cost)
    in its time reporting. This should not matter if your bottlenecks result
    in code much slower than the CUDA startup time.

For more complicated uses of the profilers (like in a multi-GPU case),
please see https://docs.python.org/3/library/profile.html
or :func:`torch.autograd.profiler.profile()` for more information.
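
A minimal sketch of calling the autograd profiler directly (the workload and
the ``sort_by`` key below are arbitrary illustrative choices)::

    import torch
    from torch.autograd import profiler

    x = torch.randn(128, 128, requires_grad=True)

    # Profile a small workload and print the hottest ops by total CPU time.
    with profiler.profile() as prof:
        (x @ x).sum().backward()
    print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))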

@@ -76,7 +76,6 @@ storage
   torch.testing <testing>
   torch.utils <utils>
   torch.utils.benchmark <benchmark_utils>
   torch.utils.bottleneck <bottleneck>
   torch.utils.checkpoint <checkpoint>
   torch.utils.cpp_extension <cpp_extension>
   torch.utils.data <data>

@@ -1,7 +0,0 @@
# Owner(s): ["module: unknown"]

import torch


x = torch.ones((3, 3), requires_grad=True)
(3 * x).sum().backward()

@@ -1,17 +0,0 @@
# Owner(s): ["module: unknown"]

import argparse

import torch


if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    # Required args. Raises error if they aren't passed.
    parser.add_argument("--foo", help="foo", required=True)
    parser.add_argument("--bar", help="bar", required=True)
    _ = parser.parse_args()

    x = torch.ones((3, 3), requires_grad=True)
    (3 * x).sum().backward()

@@ -1,29 +0,0 @@
# Owner(s): ["module: unknown"]

import torch
import torch.nn as nn


class Model(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.linear = nn.Linear(20, 20)

    def forward(self, input):
        out = self.linear(input[:, 10:30])
        return out.sum()


def main():
    data = torch.randn(10, 50).cuda()
    model = Model().cuda()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.0001)
    for _ in range(10):
        optimizer.zero_grad()
        loss = model(data)
        loss.backward()
        optimizer.step()


if __name__ == "__main__":
    main()

@@ -3,7 +3,6 @@

import os
import random
import re
import shutil
import subprocess
import sys

@@ -633,151 +632,6 @@ class TestDataLoaderUtils(TestCase):
test_dir = os.path.abspath(os.path.dirname(str(__file__)))


@unittest.skipIf(
    "SKIP_TEST_BOTTLENECK" in os.environ.keys(), "SKIP_TEST_BOTTLENECK is set"
)
class TestBottleneck(TestCase):
    def _run(self, command, timeout=30):
        """Returns (return-code, stdout, stderr)"""
        import subprocess

        p = subprocess.Popen(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            shell=True,
        )
        try:
            output, err = p.communicate(timeout=timeout)
        except subprocess.TimeoutExpired:
            p.kill()
            output, err = p.communicate()
        rc = p.returncode
        output_str = output.decode("ascii")
        err_str = err.decode("ascii")
        return (rc, output_str, err_str)

    def _run_bottleneck(self, test_file, scriptargs=""):
        curdir = os.path.dirname(os.path.abspath(__file__))
        filepath = f"{curdir}/{test_file}"
        if scriptargs != "":
            scriptargs = f" {scriptargs}"
        rc, out, err = self._run(
            f"{sys.executable} -m torch.utils.bottleneck {filepath}{scriptargs}"
        )
        return rc, out, err

    def _check_run_args(self):
        # Check that this fails due to missing args
        rc, out, err = self._run_bottleneck("bottleneck_test/test_args.py")
        self.assertEqual(
            rc,
            2,
            atol=0,
            rtol=0,
            msg=self._fail_msg("Missing args should error", out + err),
        )

        # This should succeed
        rc, out, err = self._run_bottleneck(
            "bottleneck_test/test_args.py", "--foo foo --bar bar"
        )
        self.assertEqual(
            rc,
            0,
            atol=0,
            rtol=0,
            msg=self._fail_msg("Should pass args to script", out + err),
        )

    def _fail_msg(self, msg, output):
        return f"{msg}, output was:\n{output}"

    def _check_environment_summary(self, output):
        results = re.search("Environment Summary", output)
        self.assertIsNotNone(
            results, self._fail_msg("Should have Environment Summary", output)
        )

        # Up to five lines away from the heading, there should be the version number
        results = re.search(
            r"Environment Summary.*(\n.*){,5}\nPyTorch \d+\.\d+", output
        )
        self.assertIsNotNone(
            results, self._fail_msg("Should have PyTorch version", output)
        )

    def _check_cprof_summary(self, output):
        results = re.search("cProfile output", output)
        self.assertIsNotNone(
            results, self._fail_msg("Should have cProfile output", output)
        )

        # This assumes that after the cProfile output section we have
        # the autograd profiler output
        results = re.search(
            r"cProfile output.*(\n.*){6,50}\n.*autograd profiler output", output
        )
        self.assertIsNotNone(
            results,
            self._fail_msg(
                "Distance between cProfile and autograd prof out not in [6, 50] lines",
                output,
            ),
        )

    def _check_autograd_summary(self, output):
        results = re.search("autograd profiler output", output)
        self.assertIsNotNone(
            results, self._fail_msg("Should have autograd profiler output", output)
        )

        # This assumes that after the autograd profiler output is the end of the
        # output.
        results = re.search(r"autograd profiler output.*(\n.*){6,100}", output)
        self.assertIsNotNone(
            results,
            self._fail_msg(
                "Distance between autograd prof output and end of output not in [6, 100] lines",
                output,
            ),
        )

    def _check_cuda(self, output):
        if HAS_CUDA:
            results = re.search("CUDA mode", output)
            self.assertIsNotNone(
                results, self._fail_msg("Should tell users CUDA", output)
            )
        else:
            results = re.search("CUDA mode", output)
            self.assertIsNone(
                results, self._fail_msg("Should not tell users about CUDA", output)
            )

    @unittest.skipIf(HAS_CUDA, "CPU-only test")
    def test_bottleneck_cpu_only(self):
        rc, out, err = self._run_bottleneck("bottleneck_test/test.py")
        self.assertEqual(rc, 0, msg=f"Run failed with\n{err}")

        self._check_run_args()
        self._check_environment_summary(out)
        self._check_autograd_summary(out)
        self._check_cprof_summary(out)
        self._check_cuda(out)

    @unittest.skipIf(not HAS_CUDA, "No CUDA")
    def test_bottleneck_cuda(self):
        rc, out, err = self._run_bottleneck("bottleneck_test/test_cuda.py")
        self.assertEqual(rc, 0, msg=f"Run failed with\n{err}")

        self._check_run_args()
        self._check_environment_summary(out)
        self._check_autograd_summary(out)
        self._check_cprof_summary(out)
        self._check_cuda(out)


from torch.utils.collect_env import get_pretty_env_info

@@ -73,7 +73,6 @@ TESTS = discover_tests(
    cpp_tests_dir=CPP_TESTS_DIR,
    blocklisted_patterns=[
        "ao",
        "bottleneck_test",
        "custom_backend",
        "custom_operator",
        "fx",  # executed by test_fx.py

@@ -1,229 +0,0 @@
# mypy: allow-untyped-defs
import argparse
import cProfile
import pstats
import sys
import os

import torch
from torch.autograd import profiler
from torch.utils.collect_env import get_env_info


def redirect_argv(new_argv):
    sys.argv[:] = new_argv[:]


def compiled_with_cuda(sysinfo):
    if sysinfo.cuda_compiled_version:
        return f'compiled w/ CUDA {sysinfo.cuda_compiled_version}'
    return 'not compiled w/ CUDA'


env_summary = """
--------------------------------------------------------------------------------
Environment Summary
--------------------------------------------------------------------------------
PyTorch {pytorch_version}{debug_str} {cuda_compiled}
Running with Python {py_version} and {cuda_runtime}

`{pip_version} list` truncated output:
{pip_list_output}
""".strip()


def run_env_analysis():
    print('Running environment analysis...')
    info = get_env_info()

    result: dict[str, str] = {}

    debug_str = ''
    if info.is_debug_build:
        debug_str = ' DEBUG'

    cuda_avail = ''
    if info.is_cuda_available:
        cuda = info.cuda_runtime_version
        if cuda is not None:
            cuda_avail = 'CUDA ' + cuda
    else:
        cuda = 'CUDA unavailable'

    pip_version = info.pip_version
    pip_list_output = info.pip_packages
    if pip_list_output is None:
        pip_list_output = 'Unable to fetch'

    result = {
        'debug_str': debug_str,
        'pytorch_version': info.torch_version,
        'cuda_compiled': compiled_with_cuda(info),
        'py_version': f'{sys.version_info[0]}.{sys.version_info[1]}',
        'cuda_runtime': cuda_avail,
        'pip_version': pip_version,
        'pip_list_output': pip_list_output,
    }

    return env_summary.format(**result)


def run_cprofile(code, globs, launch_blocking=False):
    print('Running your script with cProfile')
    prof = cProfile.Profile()
    prof.enable()
    exec(code, globs, None)
    prof.disable()
    return prof


cprof_summary = """
--------------------------------------------------------------------------------
cProfile output
--------------------------------------------------------------------------------
""".strip()


def print_cprofile_summary(prof, sortby='tottime', topk=15):
    print(cprof_summary)
    cprofile_stats = pstats.Stats(prof).sort_stats(sortby)
    cprofile_stats.print_stats(topk)


def run_autograd_prof(code, globs):
    def run_prof(use_cuda=False):
        with profiler.profile(use_cuda=use_cuda) as prof:
            exec(code, globs, None)
        return prof

    print('Running your script with the autograd profiler...')
    result = [run_prof(use_cuda=False)]
    if torch.cuda.is_available():
        result.append(run_prof(use_cuda=True))
    else:
        result.append(None)

    return result


autograd_prof_summary = """
--------------------------------------------------------------------------------
autograd profiler output ({mode} mode)
--------------------------------------------------------------------------------
{description}
{cuda_warning}
{output}
""".strip()


def print_autograd_prof_summary(prof, mode, sortby='cpu_time', topk=15):
    valid_sortby = ['cpu_time', 'cuda_time', 'cpu_time_total', 'cuda_time_total', 'count']
    if sortby not in valid_sortby:
        warn = ('WARNING: invalid sorting option for autograd profiler results: {}\n'
                'Expected `cpu_time`, `cpu_time_total`, or `count`. '
                'Defaulting to `cpu_time`.')
        print(warn.format(sortby))
        sortby = 'cpu_time'

    if mode == 'CUDA':
        cuda_warning = ('\n\tBecause the autograd profiler uses the CUDA event API,\n'
                        '\tthe CUDA time column reports approximately max(cuda_time, cpu_time).\n'
                        '\tPlease ignore this output if your code does not use CUDA.\n')
    else:
        cuda_warning = ''

    sorted_events = sorted(prof.function_events,
                           key=lambda x: getattr(x, sortby), reverse=True)
    topk_events = sorted_events[:topk]

    result = {
        'mode': mode,
        'description': f'top {topk} events sorted by {sortby}',
        'output': torch.autograd.profiler_util._build_table(topk_events),
        'cuda_warning': cuda_warning
    }

    print(autograd_prof_summary.format(**result))


descript = """
`bottleneck` is a tool that can be used as an initial step for debugging
bottlenecks in your program.

It summarizes runs of your script with the Python profiler and PyTorch\'s
autograd profiler. Because your script will be profiled, please ensure that it
exits in a finite amount of time.

For more complicated uses of the profilers, please see
https://docs.python.org/3/library/profile.html and
https://pytorch.org/docs/main/autograd.html#profiler for more information.
""".strip()


def parse_args():
    parser = argparse.ArgumentParser(description=descript)
    parser.add_argument('scriptfile', type=str,
                        help='Path to the script to be run. '
                             'Usually run with `python path/to/script`.')
    parser.add_argument('args', type=str, nargs=argparse.REMAINDER,
                        help='Command-line arguments to be passed to the script.')
    return parser.parse_args()


def cpu_time_total(autograd_prof):
    return sum(event.cpu_time_total for event in autograd_prof.function_events)


def main():
    args = parse_args()

    # Customizable constants.
    scriptfile = args.scriptfile
    scriptargs = [] if args.args is None else args.args
    scriptargs.insert(0, scriptfile)
    cprofile_sortby = 'tottime'
    cprofile_topk = 15
    autograd_prof_sortby = 'cpu_time_total'
    autograd_prof_topk = 15

    redirect_argv(scriptargs)

    sys.path.insert(0, os.path.dirname(scriptfile))
    with open(scriptfile, 'rb') as stream:
        code = compile(stream.read(), scriptfile, 'exec')
    globs = {
        '__file__': scriptfile,
        '__name__': '__main__',
        '__package__': None,
        '__cached__': None,
    }

    print(descript)

    env_summary = run_env_analysis()

    if torch.cuda.is_available():
        torch.cuda.init()
    cprofile_prof = run_cprofile(code, globs)
    autograd_prof_cpu, autograd_prof_cuda = run_autograd_prof(code, globs)

    print(env_summary)
    print_cprofile_summary(cprofile_prof, cprofile_sortby, cprofile_topk)

    if not torch.cuda.is_available():
        print_autograd_prof_summary(autograd_prof_cpu, 'CPU', autograd_prof_sortby, autograd_prof_topk)
        return

    # Print both the result of the CPU-mode and CUDA-mode autograd profilers
    # if their execution times are very different.
    cuda_prof_exec_time = cpu_time_total(autograd_prof_cuda)
    if len(autograd_prof_cpu.function_events) > 0:
        cpu_prof_exec_time = cpu_time_total(autograd_prof_cpu)
        pct_diff = (cuda_prof_exec_time - cpu_prof_exec_time) / cuda_prof_exec_time
        if abs(pct_diff) > 0.05:
            print_autograd_prof_summary(autograd_prof_cpu, 'CPU', autograd_prof_sortby, autograd_prof_topk)

    print_autograd_prof_summary(autograd_prof_cuda, 'CUDA', autograd_prof_sortby, autograd_prof_topk)


if __name__ == '__main__':
    main()