Replaces 78 assert statements across 10 files in torch.autograd with explicit if-checks raising AssertionError to prevent assertions from being disabled with Python -O flag. This ensures error checking remains active in optimized builds.
fix partially #164878
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165627
Approved by: https://github.com/albanD
This is useful for splitting grad to run in two parts while preserving intermediates:
<details>
<summary>
Click to see code
</summary>
```python
import collections
import weakref
from torch.autograd.graph import GradientEdge
def _get_grad_fn_or_grad_acc(t):
if t.requires_grad and t.grad_fn is None:
return t.view_as(t).grad_fn.next_functions[0][0]
else:
return t.grad_fn
def reverse_closure(roots, target_nodes):
# Recurse until we reach a target node
closure = set()
actual_target_nodes = set()
q: Deque = collections.deque()
for node in roots:
if node is not None and node not in closure:
closure.add(node)
q.append(node)
while q:
node = q.popleft()
reverse_edges = node.metadata.get("reverse_edges", [])
for holder_ref, idx in reverse_edges:
ref = holder_ref()
if ref is not None:
raise RuntimeError("Reverse graph is no longer alive")
fn = ref.node
if fn in closure or fn is None:
continue
if fn in target_nodes:
actual_target_nodes.add(fn)
continue
closure.add(fn)
q.append(fn)
return closure, actual_target_nodes
# Enable weak pointer
class Holder():
def __init__(self, node):
self.node = node
# TODO: use weak references to avoid reference cycle
def construct_reverse_graph(roots):
q: Deque = collections.deque()
root_seen = set()
reverse_graph_refs = []
for node in roots:
if node is not None and node not in root_seen:
q.append(node)
root_seen.add(node)
while q:
node = q.popleft()
for fn, idx in node.next_functions:
if fn is not None:
# Don't necessarily need to store on the graph
reverse_edges = fn.metadata.get("reverse_edges", [])
if len(reverse_edges) == 0:
q.append(fn)
holder = Holder(node)
holder_ref = weakref.ref(holder)
reverse_graph_refs.append(holder)
reverse_edges.append((holder_ref, idx))
fn.metadata["reverse_edges"] = reverse_edges
return reverse_graph_refs
def get_param_groups(inputs, params):
inputs_closure, _ = reverse_closure(inputs, set())
param_groups = dict() # keyed on intermediates
for i, param in enumerate(params):
closure, intersected = reverse_closure([param], inputs_closure)
param_group = {
"params": set([param]),
"intermediates": set(intersected),
}
for input_node in intersected:
existing = param_groups.get(input_node, None)
if existing is not None:
existing["params"] = existing["params"].union(param_group["params"])
existing["intermediates"] = existing["intermediates"].union(param_group["intermediates"])
param_group = existing
else:
param_groups[input_node] = param_group
# Sanity check: union of all param_groups params should be equal to all params
union_params = set()
seen_ids = set()
unique_param_groups = []
for param_group in param_groups.values():
if id(param_group) not in seen_ids:
seen_ids.add(id(param_group))
unique_param_groups.append(param_group)
union_params = union_params.union(param_group["params"])
assert union_params == set(params)
return unique_param_groups
def compute_grads_only_inputs2(roots, inps, weights):
root_grad_fns = list(map(_get_grad_fn_or_grad_acc, roots))
inp_grad_fns = list(map(_get_grad_fn_or_grad_acc, inps))
weight_grad_fns = list(map(_get_grad_fn_or_grad_acc, weights))
reverse_graph_refs = construct_reverse_graph(root_grad_fns)
param_groups = get_param_groups(inp_grad_fns, weight_grad_fns)
del reverse_graph_refs
for param_group in param_groups:
for i, intermediate in enumerate(param_group["intermediates"]):
def get_hook(param_group, i):
def hook(grad_inputs):
if param_group.get("grads", None) is None:
param_group["grads"] = [None] * len(param_group["intermediates"])
param_group["grads"][i] = grad_inputs
return hook
# These are always "split" nodes that we need to recompute, so
# save their inputs.
intermediate.register_prehook(get_hook(param_group, i))
dinputs = torch.autograd.grad((out,), inputs=tuple(inps), grad_outputs=(torch.ones_like(out),), retain_graph=True)
return dinputs, param_groups
def compute_grads_only_weights2(user_weights, param_groups):
all_dweights = dict()
for param_group in param_groups:
# TODO: Handle case where intermediate can have multiple outputs
intermediate_edges = tuple(GradientEdge(i, 0) for i in param_group["intermediates"])
weights_edges = tuple(GradientEdge(w, 0) for w in param_group["params"])
assert all(len(g) == 1 for g in param_group["grads"])
# [NEW!] Able to pass a GradientEdge to autograd.grad as output
# We do not need to retain_graph because... guarantee no overlap?
print("trying to execute: ", intermediate_edges, weights_edges)
dweights = torch.autograd.grad(intermediate_edges, weights_edges, grad_outputs=sum(param_group["grads"], tuple()))
for w, dw in zip(param_group["params"], dweights):
all_dweights[w] = dw
# return grads in the original order weights were provided in
out = []
for w in user_weights:
grad_acc = _get_grad_fn_or_grad_acc(w)
out.append(all_dweights[grad_acc])
return tuple(out)
```
</details>
```python
import torch.nn as nn
# Setup
mod1 = nn.Linear(10, 10)
mod2 = nn.Linear(10, 10)
a = torch.rand(10, requires_grad=True)
weights = tuple(mod1.parameters()) + tuple(mod2.parameters())
inps = (a,)
out = mod2(mod1(a))
class LoggingTensorMode(torch.utils._python_dispatch.TorchDispatchMode):
def __torch_dispatch__(self, func, types, args=(), kwargs=None):
if kwargs is None:
kwargs = {}
rs = func(*args, **kwargs)
print(f"{func.__module__}.{func.__name__}")
return rs
print(" -- SPLIT -- ")
# Compute gradients in two parts
with LoggingTensorMode():
print("PART 1")
dinputs, state = compute_grads_only_inputs2((out,), inps, weights)
print("PART 2")
dweights = compute_grads_only_weights2(weights, state)
out = mod2(mod1(a))
print(" -- REF -- ")
# Compare with reference
with LoggingTensorMode():
ref_all_gradients = torch.autograd.grad(out, inputs=tuple(inps) + weights, grad_outputs=(torch.ones_like(out),))
for actual, ref in zip(dinputs + dweights, ref_all_gradients):
print(torch.allclose(actual, ref))
```
<img width="598" alt="image" src="https://github.com/pytorch/pytorch/assets/13428986/3681b8a7-3ab4-4d1d-a836-abef6913e671">
```
PART 1
torch._ops.aten.view.default
torch._ops.aten.view.default
torch._ops.aten.view.default
torch._ops.aten.view.default
torch._ops.aten.view.default
torch._ops.aten.ones_like.default
V0603 10:17:21.590878 8300067520 torch/autograd/graph.py:751] Executing: <ViewBackward0 object at 0x12a1ee160> with grad_outputs: [f32[10]]
torch._ops.aten.view.default
V0603 10:17:21.591204 8300067520 torch/autograd/graph.py:751] Executing: <AddmmBackward0 object at 0x12a1ee0d0> with grad_outputs: [f32[1, 10]]
torch._ops.aten.t.default
torch._ops.aten.mm.default
V0603 10:17:21.591578 8300067520 torch/autograd/graph.py:751] Executing: <ViewBackward0 object at 0x100d7ae50> with grad_outputs: [f32[1, 10]]
torch._ops.aten.view.default
V0603 10:17:21.591747 8300067520 torch/autograd/graph.py:751] Executing: <ViewBackward0 object at 0x12a1e4a60> with grad_outputs: [f32[10]]
torch._ops.aten.view.default
V0603 10:17:21.591834 8300067520 torch/autograd/graph.py:751] Executing: <AddmmBackward0 object at 0x12a1e4bb0> with grad_outputs: [f32[1, 10]]
torch._ops.aten.t.default
torch._ops.aten.mm.default
V0603 10:17:21.591922 8300067520 torch/autograd/graph.py:751] Executing: <ViewBackward0 object at 0x12a1e4a90> with grad_outputs: [f32[1, 10]]
torch._ops.aten.view.default
PART 2
trying to execute: (GradientEdge(node=<AddmmBackward0 object at 0x12a1e4bb0>, output_nr=0),) (GradientEdge(node=<AccumulateGrad object at 0x12a21b130>, output_nr=0), GradientEdge(node=<AccumulateGrad object at 0x12a21b7c0>, output_nr=0))
V0603 10:17:21.592223 8300067520 torch/autograd/graph.py:751] Executing: <AddmmBackward0 object at 0x12a1e4bb0> with grad_outputs: [f32[1, 10]]
torch._ops.aten.t.default
torch._ops.aten.mm.default
torch._ops.aten.t.default
torch._ops.aten.sum.dim_IntList
torch._ops.aten.view.default
V0603 10:17:21.592421 8300067520 torch/autograd/graph.py:751] Executing: <TBackward0 object at 0x12a1cad60> with grad_outputs: [f32[10, 10]]
torch._ops.aten.t.default
trying to execute: (GradientEdge(node=<AddmmBackward0 object at 0x12a1ee0d0>, output_nr=0),) (GradientEdge(node=<AccumulateGrad object at 0x12a1e41c0>, output_nr=0), GradientEdge(node=<AccumulateGrad object at 0x12a21b670>, output_nr=0))
V0603 10:17:21.593481 8300067520 torch/autograd/graph.py:751] Executing: <AddmmBackward0 object at 0x12a1ee0d0> with grad_outputs: [f32[1, 10]]
torch._ops.aten.t.default
torch._ops.aten.mm.default
torch._ops.aten.t.default
torch._ops.aten.sum.dim_IntList
torch._ops.aten.view.default
V0603 10:17:21.593750 8300067520 torch/autograd/graph.py:751] Executing: <TBackward0 object at 0x12a21b2b0> with grad_outputs: [f32[10, 10]]
torch._ops.aten.t.default
torch._ops.aten.view.default
torch._ops.aten.view.default
torch._ops.aten.view.default
torch._ops.aten.view.default
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/127766
Approved by: https://github.com/albanD
Use `typing_extensions.deprecated` for deprecation annotation if possible. Otherwise, add `category=FutureWarning` to `warnings.warn("message")` if the category is missing.
Note that only warnings that their messages contain `[Dd]eprecat(ed|ion)` are updated in this PR.
Resolves#126888
- #126888
This PR is split from PR #126898.
- #126898
------
Pull Request resolved: https://github.com/pytorch/pytorch/pull/127689
Approved by: https://github.com/Skylion007
Use `typing_extensions.deprecated` for deprecation annotation if possible. Otherwise, add `category=FutureWarning` to `warnings.warn("message")` if the category is missing.
Note that only warnings that their messages contain `[Dd]eprecat(ed|ion)` are updated in this PR.
UPDATE: Use `FutureWarning` instead of `DeprecationWarning`.
Resolves#126888
- #126888
Pull Request resolved: https://github.com/pytorch/pytorch/pull/126898
Approved by: https://github.com/albanD
The current call passes in `['/actual/path']` to os.walk which is a string pointing to no path and thus silently leads to and empty traversal.
There is an unused function just above that handles that, so I guess this is what was supposed to be called.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/126103
Approved by: https://github.com/suo
Thanks aakhundov for constructing the test case. This PR was constructed by running the failing test case, and then fixing problems until we got all the way to the end. There are a few distinct fixes:
* AOTAutograd performs equality tests on tensor metadata to determine if a metadata mutation had occurred. If we test i0 vs i1, we should report these are NOT equal, since obviously we have somehow resized the tensor from i0 to i1 (even if, on a particular run, it is possible i0 == i1).
* There's a sketchy fix for `test_aot_autograd_exhaustive_matmul_cpu_float32` where we check if the output shape equals the tangent shape. Unfortunately, the same `definitely_true` treatment does not work here, it still fails on the example. I piled an extra sketchy fix on top of it, where I just try my best to avoid doing the view. Maybe we should have some sort of logging here.
* Partitioner needs to get out a size for unbacked SymInt when partitioning. I just feed it a random heuristic value in this case, similar to how we've been dealing with this in Inductor.
Signed-off-by: Edward Z. Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/113159
Approved by: https://github.com/aakhundov, https://github.com/bdhirsh
This PR has a number of changes that improve subclass support for AOTAutograd/Inductor in general:
- previously if a subclass does extra aliasing between graph outputs/inputs in a way, the partitioner would complain because grad_outputs are the outputs reused as-is. Now we do a view_as(self) to workaround this.
- Use dense -> dense metadata when working with fwd_output_strides during backward. This is important since the stride information comes from inductor which sees the dense to dense graph.
- Inductor requires that the inputs to the compiled backward to match some expected strides computed during compilation. We make sure to make the inner tensors of the subclass contiguous (previously, we only made the subclass itself contiguous)
Changes specific to NestedTensor relevant to compilation:
- Properly handle the case where `__tensor_unflatten__` is passed non-symbolic dense tensors and with meta extracted from fake subclasses.
- Skip var_to_range logic for singleton int
- Skip size hint logic in inductor for singleton int
Pull Request resolved: https://github.com/pytorch/pytorch/pull/110529
Approved by: https://github.com/bdhirsh
This can be useful for advanced users (like AOTAutograd) who don't want to keep the corresponding Tensor alive (for memory reasons for example) or when inplace op will change the Tensor's grad_fn (but gradients wrt to the original value is needed).
I went minimal API change but open to suggestions.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/110867
Approved by: https://github.com/soulitzer
Fixes#102375
Sequence_nr increments in the forward pass and decrements in the backward pass. Backward ops with the same sequence_nr as a forward op represent the backward implementation for the op. The long term goal is to make this information available to the profiler so users can observe which ops are fused by the inductor openai triton kernels.
Added a test for this feature **test/dynamo/test_aot_autograd.py::AotAutogradFallbackTests::test_aot_sequence_nr**. The test case uses **aot_export_module()** to create a joint fwd/bwd fx graph. Then it walks all the nodes in fx graph using fx_graph.graph.nodes. The seq_nr of each node is recorded in node.meta. During the fwd pass the seq_nr increments and it decrements during the bwd pass. This allows the user to map forward ops to their corresponding bwd ops which is useful for performance analysis.
Expected output from the test case.
SeqNr|OrigAten|SrcFn
0|aten.convolution.default|l__self___conv1
0|aten.add.Tensor|l__self___bn1
1|aten._native_batch_norm_legit_functional.default|l__self___bn1
2|aten.relu.default|l__self___relu1
3|aten.add.Tensor|add
4|aten.view.default|flatten
5|aten.t.default|l__self___fc1
6|aten.unsqueeze.default|l__self___fc1
7|aten.mm.default|l__self___fc1
8|aten.squeeze.dim|l__self___fc1
9|aten.add.Tensor|l__self___fc1
10|aten.sub.Tensor|l__self___loss_fn
11|aten.abs.default|l__self___loss_fn
12|aten.mean.default|l__self___loss_fn
12|aten.ones_like.default|
12|aten.expand.default|
12|aten.div.Scalar|
11|aten.sgn.default|
11|aten.mul.Tensor|
8|aten.unsqueeze.default|
7|aten.t.default|
7|aten.mm.default|
7|aten.t.default|
7|aten.t.default|
7|aten.mm.default|
6|aten.squeeze.dim|
5|aten.t.default|
4|aten.view.default|
2|aten.threshold_backward.default|
1|aten.native_batch_norm_backward.default|
0|aten.convolution_backward.default|
0|aten.add.Tensor|
Pull Request resolved: https://github.com/pytorch/pytorch/pull/103129
Approved by: https://github.com/soulitzer
Fixes#104985
Implemented `set_multithreading_enabled` C++ function to directly alter state rather than using `MultithreadingEnabled` class, which was automatically resetting the state when the object was destroyed. This behavior more closely aligns with set_grad_enabled which does work as expected. This allows us to change python class `set_multithreading_enabled` to act as both a function and context manager.
I also added a getter: `torch._C.is_multithreading_enabled`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/105291
Approved by: https://github.com/albanD
Fixes#44189
Adds a new parameter, zero_grad_unused, to the torch.autograd.grad() function. This parameter allows for the gradient to be set to 0 instead of None when a variable is unused, which can be helpful for higher-order partial differentials.
Here is an example of using this new parameter to solve d^3y/dx^3 given y = a * x:
```python
x = torch.tensor(0.5, dtype=torch.float32, requires_grad=True)
a = torch.tensor(1, dtype=torch.float32, requires_grad=True)
y = x * a
dydx = torch.autograd.grad(y, x, create_graph=True, allow_unused=True)
d2ydx2 = torch.autograd.grad(dydx, x, allow_unused=True, zero_grad_unused=True)
try:
d3ydx3 = torch.autograd.grad(d2ydx2, x, allow_unused=True, zero_grad_unused=True)
except RuntimeError as e:
assert False, "Should not raise error"
```
With `zero_grad_unused`, d2ydx2 could be 0 instead of None, enabling d3ydx3 to be calculated as defined in math without throwing an error.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/97015
Approved by: https://github.com/soulitzer
tldr; this should fix some minor perf regressions that were caused by adding more as_strided() calls in aot autograd.
This PR adds a new context manager, `torch.autograd._set_view_replay_enabled()`.
Context: AOT Autograd has special handling for "outputs that alias graph intermediates". E.g. given this function:
```
def f(x):
y = torch.mul(x, 2)
out = y.view(-1)
return out
```
AOT Autograd will do the following:
```
def fn_to_compile(x):
y = torch.mul(x, 2)
out = y.view(-1)
# return the graph intermediate
return y, out
compiled_fn = compile(fn_to_compile)
def wrapper(x):
y, out = compiled_fn(x)
# regenerate the alias of the graph intermediate
return out._view_func(y)
```
What's annoying is that `out._view_func()` will result in a `.as_strided` call, because `out` is an ordinary runtime tensor. This (likely?) caused a perf regression, because when running the backward, out `as_strided_backward()` is slower than our `view_backward()`.
In this PR, I added some TLS for instructing autograd to do view replay instead of as_strided, even when given a normal tensor. I'm definitely interested in thoughts from autograd folks (cc @albanD @soulitzer). A few points that I want to bring up:
(1) One reason that this API seems generally useful to me is because of the case where you `torch.compile()` a function, and you pass in two inputs that alias each other, and mutate one of the inputs. Autograd is forced to add a bunch of as_strided() calls into the graph when this happens, but this would give users an escape hatch for better compiled perf in this situation
(2) To be fair, AOT Autograd probably won't need this TLS in the long term. There's a better (more complicated) solution, where AOT Autograd manually precomputes the view chain off of graph intermediates during tracing, and re-applies them at runtime. This is kind of complicated though and feels lower priority to implement immediately.
(3) Given all of that I made the API private, but lmk what you all think.
This is a followup of https://github.com/pytorch/pytorch/pull/92255.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/92588
Approved by: https://github.com/ezyang, https://github.com/albanD
Monkeypatching is bad, we should never be doing it. This PR removes
functorch's monkeypatching on Tensor.backward() by adding it directly to
the implementation of Tensor.backward().
As an alternative, we could have done an `import functorch` and used
`functorch._C.are_transforms_active` directly in
`torch/autograd/__init__.py`. The problem with that is that it runs into a
bunch of circular imports.
NB: https://github.com/pytorch/pytorch/issues/72179 is still on my mind.
I didn't choose to do it right now because:
- This PR doesn't make the situation worse than it already is (no
monkeypatching is better than having the monkeypatch)
- We don't have a design for #72179 yet.
Test Plan:
- tests
Pull Request resolved: https://github.com/pytorch/pytorch/pull/85152
Approved by: https://github.com/soulitzer
crossref is a new strategy for performing tests when you want
to run a normal PyTorch API call, separately run some variation of
the API call (e.g., same thing but all the arguments are meta tensors)
and then cross-reference the results to see that they are consistent.
Any logic you add to CrossRefMode will get run on *every* PyTorch API
call that is called in the course of PyTorch's test suite. This can
be a good choice for correctness testing if OpInfo testing is not
exhaustive enough.
For now, the crossref test doesn't do anything except verify that
we can validly push a mode onto the torch function mode stack for all
functions.
Signed-off-by: Edward Z. Yang <ezyangfb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/75988
Approved by: https://github.com/seemethere
I figured these out by unconditionally turning on a no-op torch function
mode on the test suite and then fixing errors as they showed up. Here's
what I found:
- _parse_to failed internal assert when __torch_function__'ed because it
claims its name is "to" to the argument parser; added a name override
so we know how to find the correct name
- Infix operator magic methods on Tensor did not uniformly handle
__torch_function__ and TypeError to NotImplemented. Now, we always
do the __torch_function__ handling in
_wrap_type_error_to_not_implemented and your implementation of
__torch_function__ gets its TypeErrors converted to NotImplemented
(for better or for worse; see
https://github.com/pytorch/pytorch/issues/75462 )
- A few cases where code was incorrectly testing if a Tensor was
Tensor-like in the wrong way, now use is_tensor_like (in grad
and in distributions). Also update docs for has_torch_function to
push people to use is_tensor_like.
- is_grads_batched was dropped from grad in handle_torch_function, now
fixed
- Report that you have a torch function even if torch function is
disabled if a mode is enabled. This makes it possible for a mode
to return NotImplemented, pass to a subclass which does some
processing and then pass back to the mode even after the subclass
disables __torch_function__ (so the tensors are treated "as if"
they are regular Tensors). This brings the C++ handling behavior
in line with the Python behavior.
- Make the Python implementation of overloaded types computation match
the C++ version: when torch function is disabled, there are no
overloaded types (because they all report they are not overloaded).
Signed-off-by: Edward Z. Yang <ezyangfb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/75484
Approved by: https://github.com/zou3519
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/72292
Integrates the libkineto step() method into pytorch profiler step() invocation.
This enables Kineto to track the iteration count and trigger trace collection on iteration boundaries from outside the process.
Test Plan:
## Test using pytorch profiler step() method
Modified the resnet integration test to use pytorch profiler.
Configure it to capture 3 iterations :
```
ACTIVITIES_COMPRESSION_ALGORITHM=GZIP
ACTIVITIES_MANIFOLD_PATH=gpu_traces/tree/traces/dynocli/0/1643063194/127.0.0.1/
PROFILE_START_ITERATION=200
ACTIVITIES_WARMUP_ITERATIONS=1
ACTIVITIES_ITERATIONS=3
```
Run
dyno gputrace -gpuconf /tmp/kineto_pytorch.conf
The output trace has iterations 202, 203, 204 :) One iteration is skipped due to warmup. (Also its one off due 0 vs 1 indexing)
[Trace link](https://www.internalfb.com/intern/perfdoctor/trace_view?filepath=tree%2Ftraces%2Fdynocli%2F0%2F1643063194%2F127.0.0.1%2Flibkineto_activities_501743.json.gz&bucket=gpu_traces)
{F695716262}
Reviewed By: robieta
Differential Revision: D33825241
fbshipit-source-id: 70983420cf47ebbac7b44bfb6494d314506302c5
(cherry picked from commit 96c06ecc9a80512d85b8941f195360f41d74103f)
Summary:
When default hooks are set, they are pushed onto a stack.
When nesting context-manager, only the inner-most hooks will
be applied.
There is special care needed to update the TLS code. See also https://github.com/pytorch/pytorch/issues/70940 (i.e. do we need to be storing the enabled flag as well?)
Fixes https://github.com/pytorch/pytorch/issues/70134
Pull Request resolved: https://github.com/pytorch/pytorch/pull/70932
Reviewed By: mruberry
Differential Revision: D33530370
Pulled By: albanD
fbshipit-source-id: 3197d585d77563f36c175d3949115a0776b309f4
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/65736
We ran into some limitations to extract PyTorch operator parameters through hooks or the execution graph. Some of these limitations are not due to the operator not exposing them, rather the inputs for these operators are already fused/processed in some cases (like embedding table). We want to be able to attach some metadata to the user scope record functions allowing the profilers to later extract these information.
The record function C++ API already supports taking inputs and outputs information. The corresponding Python interface does not support them and only allows a string name as record function parameter.
This diff adds support for user to optionally to add additional arguments to the record function in two ways.
1. to remain backward compatible with `record_function_op`, we have added an optional string arg to the interface: `with record_function(name, arg_str)`.
2. to support data dependency graph, we also have the new `torch.autograd._record_function_with_args_enter` and `torch.autograd._record_function_with_args_exit` functions to provide an interface where we can give additional tensor arguments. For now we imagine this can be used for debugging or analysis purpose. In this form, we currently support some basic data types as inputs: scalars, string, list, and tensor.
Example usage:
```
# record_function operator with a name and optionally, a string for arguments.
with record_function("## TEST 1 ##", "[1, 2, 3]"):
<actual module or operator>
# more general form of record_function
a = _record_function_with_args_enter("## TEST 2 ##", 1, False, 2.5, [u, u], "hello", u)
<actual module or operator>
_record_function_with_args_exit(a)
```
Corresponding outputs in execution graph:
```
{
"name": "## TEST 2 ##", "id": 7, "parent": 3, "fw_parent": 0, "scope": 5, "tid": 1, "fw_tid": 0,
"inputs": [1,false,2.5,[6,6],"hello",6], "input_shapes": [[],[],[],[[3,4,5],[3,4,5]],[],[3,4,5]], "input_types": ["Int","Bool","Double","GenericList[Tensor(float),Tensor(float)]","String","Tensor(float)"],
"outputs": [], "output_shapes": [], "output_types": []
},
{
"name": "## TEST 1 ##", "id": 3, "parent": 2, "fw_parent": 0, "scope": 5, "tid": 1, "fw_tid": 0,
"inputs": ["1, 2, 3"], "input_shapes": [[]], "input_types": ["String"],
"outputs": [], "output_shapes": [], "output_types": []
},
```
Test Plan:
```
=> buck build caffe2/test:profiler --show-output
=> buck-out/gen/caffe2/test/profiler#binary.par test_profiler.TestRecordFunction
test_record_function (test_profiler.TestRecordFunction) ... Log file: /tmp/libkineto_activities_1651304.json
Net filter:
Target net for iteration count:
Net Iterations: 3
INFO:2021-09-27 01:10:15 1651304:1651304 Config.cpp:424] Trace start time: 2021-09-27 01:10:30
Trace duration: 500ms
Warmup duration: 5s
Net size threshold: 0
GPU op count threshold: 0
Max GPU buffer size: 128MB
Enabled activities: cpu_op,user_annotation,external_correlation,cuda_runtime,cpu_instant_event
Manifold bucket: gpu_traces
Manifold object: tree/traces/clientAPI/0/1632730215/devvm2060.ftw0/libkineto_activities_1651304.json
Trace compression enabled: 1
INFO:2021-09-27 01:10:15 1651304:1651304 ActivityProfiler.cpp:536] Tracing starting in 14s
INFO:2021-09-27 01:10:15 1651304:1651304 ActivityProfiler.cpp:48] Target net for iterations not specified - picking first encountered that passes net filter
INFO:2021-09-27 01:10:15 1651304:1651304 ActivityProfiler.cpp:57] Tracking net PyTorch Profiler for 3 iterations
INFO:2021-09-27 01:10:15 1651304:1651304 ActivityProfiler.cpp:126] Processing 1 CPU buffers
INFO:2021-09-27 01:10:15 1651304:1651304 ActivityProfiler.cpp:686] Recorded nets:
INFO:2021-09-27 01:10:15 1651304:1651304 ActivityProfiler.cpp:689] PyTorch Profiler: 1 iterations
ok
----------------------------------------------------------------------
Ran 1 test in 0.021s
OK
```
Reviewed By: gdankel
Differential Revision: D31165259
fbshipit-source-id: 15920aaef7138c666e5eca2a71c3bf33073eadc4
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/65564
- wrap the call into engine with vmap if `batched_grad` is `True`
- improves the comment on the call to engine (somewhat addressing https://github.com/pytorch/pytorch/issues/41659)
- borrows the message from functional.jacobian's vectorized argument concerning usage of the vmap feature
- adds basic test (further testing is done when we replace the usage in vectorized jacobian computation)
TODO:
- create an issue tracking this
Test Plan: Imported from OSS
Reviewed By: albanD
Differential Revision: D31236259
Pulled By: soulitzer
fbshipit-source-id: b33e6b26ea98fa9f70c44da08458fc54ba4df0f7