Mirror of https://github.com/zebrajr/pytorch.git, synced 2025-12-06 12:20:52 +01:00

[BE] Remove bottleneck (#163210)

Some cleanup related to this RFC: https://github.com/pytorch/pytorch/issues/68742

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163210
Approved by: https://github.com/ezyang

This commit is contained in:
parent cfb8aec1a4
commit c43ccfbc2d

@@ -964,7 +964,6 @@ exclude_patterns = [
    'test/jit/**', # should be run through test/test_jit.py
    'test/ao/sparsity/**', # should be run through test/test_ao_sparsity.py
    'test/fx/**', # should be run through test/test_fx.py
    'test/bottleneck_test/**', # excluded by test/run_test.py
    'test/package/**', # excluded by test/run_test.py
    'test/distributed/argparse_util_test.py',
    'test/distributed/bin/test_script.py',

@@ -1410,8 +1409,6 @@ exclude_patterns = [
    'torch/utils/benchmark/utils/timer.py',
    'torch/utils/benchmark/utils/valgrind_wrapper/__init__.py',
    'torch/utils/benchmark/utils/valgrind_wrapper/timer_interface.py',
    'torch/utils/bottleneck/__init__.py',
    'torch/utils/bottleneck/__main__.py',
    'torch/utils/bundled_inputs.py',
    'torch/utils/checkpoint.py',
    'torch/utils/collect_env.py',

@@ -1,62 +0,0 @@
torch.utils.bottleneck
======================

.. automodule:: torch.utils.bottleneck
.. currentmodule:: torch.utils.bottleneck

`torch.utils.bottleneck` is a tool that can be used as an initial step for
debugging bottlenecks in your program. It summarizes runs of your script with
the Python profiler and PyTorch's autograd profiler.

Run it on the command line with

::

    python -m torch.utils.bottleneck /path/to/source/script.py [args]

where [args] are any number of arguments to `script.py`, or run
``python -m torch.utils.bottleneck -h`` for more usage instructions.
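
For example, a hypothetical invocation that forwards two flags to the profiled
script (``train.py``, ``--epochs`` and ``--batch-size`` are placeholder names,
not part of the tool) could look like::

    python -m torch.utils.bottleneck train.py --epochs 1 --batch-size 32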

.. warning::
    Because your script will be profiled, please ensure that it exits in a
    finite amount of time.

.. warning::
    Due to the asynchronous nature of CUDA kernels, when running against
    CUDA code, the cProfile output and CPU-mode autograd profilers may
    not show correct timings: the reported CPU time is the amount of time
    used to launch the kernels but does not include the time the kernel
    spent executing on a GPU unless the operation does a synchronize.
    Ops that do synchronize appear to be extremely expensive under regular
    CPU-mode profilers.
    In these cases where timings are incorrect, the CUDA-mode autograd profiler
    may be helpful.

.. note::
    To decide which (CPU-only-mode or CUDA-mode) autograd profiler output to
    look at, you should first check whether your script is CPU-bound
    ("CPU total time is much greater than CUDA total time").
    If it is CPU-bound, looking at the results of the CPU-mode autograd
    profiler will help. If, on the other hand, your script spends most of its
    time executing on the GPU, then it makes sense to start
    looking for responsible CUDA operators in the output of the CUDA-mode
    autograd profiler.

    Of course the reality is much more complicated, and your script might not be
    in one of those two extremes depending on the part of the model you're
    evaluating. If the profiler outputs don't help, you could try looking at
    the result of :func:`torch.autograd.profiler.emit_nvtx()` with ``nvprof``.
    However, please take into account that the NVTX overhead is very high and
    often gives a heavily skewed timeline. Similarly, ``Intel® VTune™ Profiler``
    helps to analyze performance on Intel platforms further with
    :func:`torch.autograd.profiler.emit_itt()`.
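
As a rough sketch of the ``emit_nvtx`` approach mentioned above (``model`` and
``inp`` are placeholders for your own module and input; the whole script would
then be run under ``nvprof``)::

    import torch

    # Placeholder workload; substitute your own model and data.
    model = torch.nn.Linear(10, 10).cuda()
    inp = torch.randn(8, 10, device="cuda")

    # Wraps each autograd op in an NVTX range that nvprof can display.
    with torch.autograd.profiler.emit_nvtx():
        model(inp).sum().backward()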

.. warning::
    If you are profiling CUDA code, the first profiler that ``bottleneck`` runs
    (cProfile) will include the CUDA startup time (CUDA buffer allocation cost)
    in its time reporting. This should not matter if your bottlenecks result
    in code much slower than the CUDA startup time.

For more complicated uses of the profilers (like in a multi-GPU case),
please see https://docs.python.org/3/library/profile.html
or :func:`torch.autograd.profiler.profile()` for more information.
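
A minimal sketch of calling the autograd profiler directly (the workload and
the ``sort_by`` key below are arbitrary illustrative choices)::

    import torch
    from torch.autograd import profiler

    x = torch.randn(128, 128, requires_grad=True)

    # Profile a small workload and print the hottest ops by total CPU time.
    with profiler.profile() as prof:
        (x @ x).sum().backward()
    print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))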

@@ -76,7 +76,6 @@ storage
   torch.testing <testing>
   torch.utils <utils>
   torch.utils.benchmark <benchmark_utils>
   torch.utils.bottleneck <bottleneck>
   torch.utils.checkpoint <checkpoint>
   torch.utils.cpp_extension <cpp_extension>
   torch.utils.data <data>

@@ -1,7 +0,0 @@
# Owner(s): ["module: unknown"]

import torch


x = torch.ones((3, 3), requires_grad=True)
(3 * x).sum().backward()

@@ -1,17 +0,0 @@
# Owner(s): ["module: unknown"]

import argparse

import torch


if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    # Required args. Raises error if they aren't passed.
    parser.add_argument("--foo", help="foo", required=True)
    parser.add_argument("--bar", help="bar", required=True)
    _ = parser.parse_args()

    x = torch.ones((3, 3), requires_grad=True)
    (3 * x).sum().backward()

@@ -1,29 +0,0 @@
# Owner(s): ["module: unknown"]

import torch
import torch.nn as nn


class Model(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.linear = nn.Linear(20, 20)

    def forward(self, input):
        out = self.linear(input[:, 10:30])
        return out.sum()


def main():
    data = torch.randn(10, 50).cuda()
    model = Model().cuda()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.0001)
    for _ in range(10):
        optimizer.zero_grad()
        loss = model(data)
        loss.backward()
        optimizer.step()


if __name__ == "__main__":
    main()

@@ -3,7 +3,6 @@

import os
import random
import re
import shutil
import subprocess
import sys

@@ -633,151 +632,6 @@ class TestDataLoaderUtils(TestCase):
test_dir = os.path.abspath(os.path.dirname(str(__file__)))


@unittest.skipIf(
    "SKIP_TEST_BOTTLENECK" in os.environ.keys(), "SKIP_TEST_BOTTLENECK is set"
)
class TestBottleneck(TestCase):
    def _run(self, command, timeout=30):
        """Returns (return-code, stdout, stderr)"""
        import subprocess

        p = subprocess.Popen(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            shell=True,
        )
        try:
            output, err = p.communicate(timeout=timeout)
        except subprocess.TimeoutExpired:
            p.kill()
            output, err = p.communicate()
        rc = p.returncode
        output_str = output.decode("ascii")
        err_str = err.decode("ascii")
        return (rc, output_str, err_str)

    def _run_bottleneck(self, test_file, scriptargs=""):
        curdir = os.path.dirname(os.path.abspath(__file__))
        filepath = f"{curdir}/{test_file}"
        if scriptargs != "":
            scriptargs = f" {scriptargs}"
        rc, out, err = self._run(
            f"{sys.executable} -m torch.utils.bottleneck {filepath}{scriptargs}"
        )
        return rc, out, err

    def _check_run_args(self):
        # Check that this fails due to missing args
        rc, out, err = self._run_bottleneck("bottleneck_test/test_args.py")
        self.assertEqual(
            rc,
            2,
            atol=0,
            rtol=0,
            msg=self._fail_msg("Missing args should error", out + err),
        )

        # This should succeed
        rc, out, err = self._run_bottleneck(
            "bottleneck_test/test_args.py", "--foo foo --bar bar"
        )
        self.assertEqual(
            rc,
            0,
            atol=0,
            rtol=0,
            msg=self._fail_msg("Should pass args to script", out + err),
        )

    def _fail_msg(self, msg, output):
        return f"{msg}, output was:\n{output}"

    def _check_environment_summary(self, output):
        results = re.search("Environment Summary", output)
        self.assertIsNotNone(
            results, self._fail_msg("Should have Environment Summary", output)
        )

        # Up to five lines away from the heading, there should be the version number
        results = re.search(
            r"Environment Summary.*(\n.*){,5}\nPyTorch \d+\.\d+", output
        )
        self.assertIsNotNone(
            results, self._fail_msg("Should have PyTorch version", output)
        )

    def _check_cprof_summary(self, output):
        results = re.search("cProfile output", output)
        self.assertIsNotNone(
            results, self._fail_msg("Should have cProfile output", output)
        )

        # This assumes that after the cProfile output section we have
        # the autograd profiler output
        results = re.search(
            r"cProfile output.*(\n.*){6,50}\n.*autograd profiler output", output
        )
        self.assertIsNotNone(
            results,
            self._fail_msg(
                "Distance between cProfile and autograd prof out not in [6, 50] lines",
                output,
            ),
        )

    def _check_autograd_summary(self, output):
        results = re.search("autograd profiler output", output)
        self.assertIsNotNone(
            results, self._fail_msg("Should have autograd profiler output", output)
        )

        # This assumes that after the autograd profiler output is the end of the
        # output.
        results = re.search(r"autograd profiler output.*(\n.*){6,100}", output)
        self.assertIsNotNone(
            results,
            self._fail_msg(
                "Distance between autograd prof output and end of output not in [6, 100] lines",
                output,
            ),
        )

    def _check_cuda(self, output):
        if HAS_CUDA:
            results = re.search("CUDA mode", output)
            self.assertIsNotNone(
                results, self._fail_msg("Should tell users CUDA", output)
            )
        else:
            results = re.search("CUDA mode", output)
            self.assertIsNone(
                results, self._fail_msg("Should not tell users about CUDA", output)
            )

    @unittest.skipIf(HAS_CUDA, "CPU-only test")
    def test_bottleneck_cpu_only(self):
        rc, out, err = self._run_bottleneck("bottleneck_test/test.py")
        self.assertEqual(rc, 0, msg=f"Run failed with\n{err}")

        self._check_run_args()
        self._check_environment_summary(out)
        self._check_autograd_summary(out)
        self._check_cprof_summary(out)
        self._check_cuda(out)

    @unittest.skipIf(not HAS_CUDA, "No CUDA")
    def test_bottleneck_cuda(self):
        rc, out, err = self._run_bottleneck("bottleneck_test/test_cuda.py")
        self.assertEqual(rc, 0, msg=f"Run failed with\n{err}")

        self._check_run_args()
        self._check_environment_summary(out)
        self._check_autograd_summary(out)
        self._check_cprof_summary(out)
        self._check_cuda(out)


from torch.utils.collect_env import get_pretty_env_info

@@ -73,7 +73,6 @@ TESTS = discover_tests(
    cpp_tests_dir=CPP_TESTS_DIR,
    blocklisted_patterns=[
        "ao",
        "bottleneck_test",
        "custom_backend",
        "custom_operator",
        "fx",  # executed by test_fx.py

@@ -1,229 +0,0 @@
# mypy: allow-untyped-defs
import argparse
import cProfile
import pstats
import sys
import os

import torch
from torch.autograd import profiler
from torch.utils.collect_env import get_env_info


def redirect_argv(new_argv):
    sys.argv[:] = new_argv[:]


def compiled_with_cuda(sysinfo):
    if sysinfo.cuda_compiled_version:
        return f'compiled w/ CUDA {sysinfo.cuda_compiled_version}'
    return 'not compiled w/ CUDA'


env_summary = """
--------------------------------------------------------------------------------
Environment Summary
--------------------------------------------------------------------------------
PyTorch {pytorch_version}{debug_str} {cuda_compiled}
Running with Python {py_version} and {cuda_runtime}

`{pip_version} list` truncated output:
{pip_list_output}
""".strip()


def run_env_analysis():
    print('Running environment analysis...')
    info = get_env_info()

    result: dict[str, str] = {}

    debug_str = ''
    if info.is_debug_build:
        debug_str = ' DEBUG'

    cuda_avail = ''
    if info.is_cuda_available:
        cuda = info.cuda_runtime_version
        if cuda is not None:
            cuda_avail = 'CUDA ' + cuda
    else:
        cuda = 'CUDA unavailable'

    pip_version = info.pip_version
    pip_list_output = info.pip_packages
    if pip_list_output is None:
        pip_list_output = 'Unable to fetch'

    result = {
        'debug_str': debug_str,
        'pytorch_version': info.torch_version,
        'cuda_compiled': compiled_with_cuda(info),
        'py_version': f'{sys.version_info[0]}.{sys.version_info[1]}',
        'cuda_runtime': cuda_avail,
        'pip_version': pip_version,
        'pip_list_output': pip_list_output,
    }

    return env_summary.format(**result)


def run_cprofile(code, globs, launch_blocking=False):
    print('Running your script with cProfile')
    prof = cProfile.Profile()
    prof.enable()
    exec(code, globs, None)
    prof.disable()
    return prof


cprof_summary = """
--------------------------------------------------------------------------------
cProfile output
--------------------------------------------------------------------------------
""".strip()


def print_cprofile_summary(prof, sortby='tottime', topk=15):
    print(cprof_summary)
    cprofile_stats = pstats.Stats(prof).sort_stats(sortby)
    cprofile_stats.print_stats(topk)


def run_autograd_prof(code, globs):
    def run_prof(use_cuda=False):
        with profiler.profile(use_cuda=use_cuda) as prof:
            exec(code, globs, None)
        return prof

    print('Running your script with the autograd profiler...')
    result = [run_prof(use_cuda=False)]
    if torch.cuda.is_available():
        result.append(run_prof(use_cuda=True))
    else:
        result.append(None)

    return result


autograd_prof_summary = """
--------------------------------------------------------------------------------
autograd profiler output ({mode} mode)
--------------------------------------------------------------------------------
{description}
{cuda_warning}
{output}
""".strip()


def print_autograd_prof_summary(prof, mode, sortby='cpu_time', topk=15):
    valid_sortby = ['cpu_time', 'cuda_time', 'cpu_time_total', 'cuda_time_total', 'count']
    if sortby not in valid_sortby:
        warn = ('WARNING: invalid sorting option for autograd profiler results: {}\n'
                'Expected `cpu_time`, `cpu_time_total`, or `count`. '
                'Defaulting to `cpu_time`.')
        print(warn.format(sortby))
        sortby = 'cpu_time'

    if mode == 'CUDA':
        cuda_warning = ('\n\tBecause the autograd profiler uses the CUDA event API,\n'
                        '\tthe CUDA time column reports approximately max(cuda_time, cpu_time).\n'
                        '\tPlease ignore this output if your code does not use CUDA.\n')
    else:
        cuda_warning = ''

    sorted_events = sorted(prof.function_events,
                           key=lambda x: getattr(x, sortby), reverse=True)
    topk_events = sorted_events[:topk]

    result = {
        'mode': mode,
        'description': f'top {topk} events sorted by {sortby}',
        'output': torch.autograd.profiler_util._build_table(topk_events),
        'cuda_warning': cuda_warning
    }

    print(autograd_prof_summary.format(**result))


descript = """
`bottleneck` is a tool that can be used as an initial step for debugging
bottlenecks in your program.

It summarizes runs of your script with the Python profiler and PyTorch\'s
autograd profiler. Because your script will be profiled, please ensure that it
exits in a finite amount of time.

For more complicated uses of the profilers, please see
https://docs.python.org/3/library/profile.html and
https://pytorch.org/docs/main/autograd.html#profiler for more information.
""".strip()


def parse_args():
    parser = argparse.ArgumentParser(description=descript)
    parser.add_argument('scriptfile', type=str,
                        help='Path to the script to be run. '
                             'Usually run with `python path/to/script`.')
    parser.add_argument('args', type=str, nargs=argparse.REMAINDER,
                        help='Command-line arguments to be passed to the script.')
    return parser.parse_args()


def cpu_time_total(autograd_prof):
    return sum(event.cpu_time_total for event in autograd_prof.function_events)


def main():
    args = parse_args()

    # Customizable constants.
    scriptfile = args.scriptfile
    scriptargs = [] if args.args is None else args.args
    scriptargs.insert(0, scriptfile)
    cprofile_sortby = 'tottime'
    cprofile_topk = 15
    autograd_prof_sortby = 'cpu_time_total'
    autograd_prof_topk = 15

    redirect_argv(scriptargs)

    sys.path.insert(0, os.path.dirname(scriptfile))
    with open(scriptfile, 'rb') as stream:
        code = compile(stream.read(), scriptfile, 'exec')
    globs = {
        '__file__': scriptfile,
        '__name__': '__main__',
        '__package__': None,
        '__cached__': None,
    }

    print(descript)

    env_summary = run_env_analysis()

    if torch.cuda.is_available():
        torch.cuda.init()
    cprofile_prof = run_cprofile(code, globs)
    autograd_prof_cpu, autograd_prof_cuda = run_autograd_prof(code, globs)

    print(env_summary)
    print_cprofile_summary(cprofile_prof, cprofile_sortby, cprofile_topk)

    if not torch.cuda.is_available():
        print_autograd_prof_summary(autograd_prof_cpu, 'CPU', autograd_prof_sortby, autograd_prof_topk)
        return

    # Print both the result of the CPU-mode and CUDA-mode autograd profilers
    # if their execution times are very different.
    cuda_prof_exec_time = cpu_time_total(autograd_prof_cuda)
    if len(autograd_prof_cpu.function_events) > 0:
        cpu_prof_exec_time = cpu_time_total(autograd_prof_cpu)
        pct_diff = (cuda_prof_exec_time - cpu_prof_exec_time) / cuda_prof_exec_time
        if abs(pct_diff) > 0.05:
            print_autograd_prof_summary(autograd_prof_cpu, 'CPU', autograd_prof_sortby, autograd_prof_topk)

    print_autograd_prof_summary(autograd_prof_cuda, 'CUDA', autograd_prof_sortby, autograd_prof_topk)


if __name__ == '__main__':
    main()