pytorch/torch/_functorch/benchmark_utils.py
Richard Zou 4068c5467d [Reland] Move functorch/_src to torch/_functorch (#88756) (#90091)
This will be the last disruptive functorch internals change.

Why are we moving these files?
- As a part of rationalizing functorch we are moving the code in
functorch/_src to torch/_functorch
- This is so that we can offer the functorch APIs as native PyTorch APIs
(coming soon) and resolve some internal build issues.

Why are we moving all of these files at once?
- It's better to break developers all at once rather than many times

Test Plan:
- wait for tests

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90091
Approved by: https://github.com/anijain2305, https://github.com/ezyang
2022-12-03 14:17:15 +00:00

201 lines
6.0 KiB
Python

import time
import os
import json
import torch
from torch.profiler import profile, ProfilerActivity
def synchronize():
    """No-op synchronization placeholder; swapped for torch.cuda.synchronize when timing CUDA."""
    return None
class NullContext:
    """A do-nothing context manager, used when no optimize_ctx is supplied."""

    def __enter__(self):
        return None

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Returning None means exceptions propagate, same as the original `pass`.
        return None
def dump_chrome_trace(f, input, trace_filename, optimize_ctx, activities, num_runs=1,
                      devices=None, kwargs_for_f=None, kwargs_for_profiler=None):
    """
    Output the chrome trace of running f(input, **kwargs_for_f) with [optimize_ctx]
    [num_runs] times to [trace_filename].

    [activities] are the activities that the profiler will record, e.g. ProfilerActivity.CUDA.

    Args:
        f: callable to benchmark, invoked as f(input, **kwargs_for_f)
        input: first positional argument passed to f
        trace_filename: path the chrome trace is exported to
        optimize_ctx: context manager under which f runs
        activities: profiler activities to record
        num_runs: number of timed runs (there are always 5 warmup runs first)
        devices: list of device names; CUDA synchronization is skipped only
            when this is exactly ["cpu"] (defaults to ["cuda"])
        kwargs_for_f: extra keyword arguments for f
        kwargs_for_profiler: extra keyword arguments for torch.profiler.profile

    Returns:
        Total wall-clock runtime in seconds of the [num_runs] runs WITHOUT the
        profiler attached; the profiled runs are used only to produce the trace.
    """
    if devices is None:
        devices = ["cuda"]

    # FIX: the original rebound the module-level `synchronize` via `global`,
    # leaking state across calls. Choose the sync primitive locally instead.
    def sync():
        pass
    if devices != ["cpu"] and torch.cuda.is_available():
        # Needed so async CUDA kernels are included in the measured wall time.
        sync = torch.cuda.synchronize

    if kwargs_for_f is None:
        kwargs_for_f = {}
    if kwargs_for_profiler is None:
        kwargs_for_profiler = {}

    with optimize_ctx:
        torch.manual_seed(1337)
        for _ in range(5):  # warmup runs
            f(input, **kwargs_for_f)
        sync()
        torch.manual_seed(1337)
        t0 = time.perf_counter()
        for _ in range(num_runs):
            f(input, **kwargs_for_f)
        sync()
        t1 = time.perf_counter()
    timing = t1 - t0

    # Second, profiled pass; its timing is not reported.
    with profile(activities=activities, **kwargs_for_profiler) as prof:
        with optimize_ctx:
            sync()
            torch.manual_seed(1337)
            for _ in range(num_runs):
                f(input, **kwargs_for_f)
            sync()
    prof.export_chrome_trace(trace_filename)

    return timing
def get_chrome_trace_events(filename):
    """Load a chrome trace JSON file and return its "traceEvents" list.

    Args:
        filename: path to a chrome trace produced by the pytorch profiler

    Returns:
        The list stored under the trace's "traceEvents" key.
    """
    # FIX: use a context manager so the file handle is closed even if
    # json.load raises (the original opened the file and never closed it).
    with open(filename) as f:
        data = json.load(f)
    return data["traceEvents"]
def is_gpu_compute_event(event):
    """Return True if `event` is a complete ("X" phase) event from a GPU pid.

    Relies on the module-level `gpu_pids` list populated by compute_utilization().
    """
    if "pid" not in event or "ph" not in event:
        return False
    return event["ph"] == "X" and event["pid"] in gpu_pids
def get_sorted_gpu_events(events):
    """Filter `events` down to GPU compute events, sorted by start timestamp "ts"."""
    gpu_events = [ev for ev in events if is_gpu_compute_event(ev)]
    gpu_events.sort(key=lambda ev: ev["ts"])
    return gpu_events
def get_duration(sorted_gpu_events):
    """Return the total busy time covered by `sorted_gpu_events`.

    Events must already be sorted by "ts". Each event spans
    [ts, ts + dur]; overlapping spans are counted once, so the result is the
    length of the union of all intervals (in trace time units).
    """
    if not sorted_gpu_events:
        return 0
    first = sorted_gpu_events[0]
    covered_until = first["ts"] + first["dur"]
    total = first["dur"]
    for ev in sorted_gpu_events[1:]:
        begin = ev["ts"]
        end = begin + ev["dur"]
        # Only the portion extending past already-covered time contributes.
        total += max(end - max(begin, covered_until), 0)
        covered_until = max(covered_until, end)
    return total
def get_sorted_gpu_mm_conv_events(events):
    """Return GPU compute events whose kernel name suggests matmul/conv work, sorted by "ts"."""
    keywords = ("gemm", "conv", "cutlass", "wgrad")

    def is_mm_conv_event(event):
        # Matches the same substring heuristics as the original predicate.
        return "name" in event and any(k in event["name"] for k in keywords)

    return [ev for ev in get_sorted_gpu_events(events) if is_mm_conv_event(ev)]
# Module-level registry of trace process ids labeled as GPU rows; populated by
# compute_utilization() and read by is_gpu_compute_event().
gpu_pids = []
def compute_utilization(filename: str, total_length: float):
    """
    Process the chrome traces outputs by the pytorch profiler to compute GPU Utilization
    and percent of times spent on matmul and convolution.

    Args:
        filename(str): Name of chrome traces file produced by pytorch profiler

        total_length(float): total length of the process without profiler in second

    Return:
        tuple: (GPU Utilization, percent of time spent on matmul and convolution)
    """
    events = get_chrome_trace_events(filename)

    # Collect the pids that the trace labels as GPU rows; is_gpu_compute_event
    # reads this module-level list.
    global gpu_pids
    gpu_pids = []
    for event in events:
        if event.get("name") != 'process_labels':
            continue
        if "GPU" in event["args"]["labels"]:
            gpu_pids.append(event["pid"])

    # Trace timestamps are in microseconds; convert the wall time to match.
    total_length_us = total_length * 1e6

    gpu_events = get_sorted_gpu_events(events)
    utilization = get_duration(gpu_events) / total_length_us

    mm_conv_events = get_sorted_gpu_mm_conv_events(events)
    mm_conv_utilization = get_duration(mm_conv_events) / total_length_us

    return utilization, mm_conv_utilization
def benchmark_utilization(f, input, trace_folder, optimize_ctx=None, trace_file_name="tmp_chrome_trace", num_runs=1):
    """
    Benchmark the GPU Utilization and percent of time spent on matmul and convolution operations of
    running f(input, **kwargs_for_f) with [optimize_ctx] [num_runs] times.
    It will produce a chrome trace file in trace_folder/trace_file_name.json

    Example:

    ```
    def f(a):
        return a.sum()

    a = torch.rand(2**20, device="cuda")

    utilization, mm_conv_utilization = benchmark_utilization(f, a, "tmp", trace_file_name = "tmp_chrome_trace")
    ```

    Args:
        f: function to benchmark

        input: input to :attr:`f`

        trace_folder: name of the folder to store the chrome trace

        optimize_ctx: the context in which f will run

        trace_file_name: name of the dumped chrome trace file, default to "tmp_chrome_trace"

        num_runs: number of times to run f, excluding the warm-up runs, default to 1.

    Return:
        tuple: (GPU Utilization, percent of time spent on matmul and convolution)
    """
    # exist_ok=True avoids the racy exists()-then-makedirs() pair the original used.
    if not os.path.exists(trace_folder):
        os.makedirs(trace_folder, exist_ok=True)
        print("create folder " + trace_folder)

    if optimize_ctx is None:
        optimize_ctx = NullContext()

    chrome_trace_file_name = os.path.join(trace_folder, trace_file_name + ".json")
    # FIX: `devices` expects a list of device names; the original passed the
    # bare string "cuda", which only behaved correctly because
    # '"cuda" != ["cpu"]' happens to be True.
    total_length = dump_chrome_trace(f, input, chrome_trace_file_name, optimize_ctx,
                                     [ProfilerActivity.CUDA], num_runs=num_runs, devices=["cuda"])

    utilization, mm_conv_utilization = compute_utilization(chrome_trace_file_name, total_length)

    return utilization, mm_conv_utilization