[BE][2/16] fix typos in torch/ (torch/_*/) (#156312)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/156312
Approved by: https://github.com/albanD
Xuehai Pan 2025-07-12 13:12:13 +08:00 committed by PyTorch MergeBot
parent e90148c91d
commit 7f14b42adf
70 changed files with 123 additions and 123 deletions

View File

@ -1169,7 +1169,6 @@ exclude_patterns = [
'aten/src/ATen/[a-mA-M]*/**',
'test/**',
'test/[a-hA-h]*/**',
'torch/_*/**',
'torch/distributed/tensor/**',
]
init_command = [

View File

@ -376,7 +376,7 @@ struct ElementwiseInterpreter : torch::CustomClassHolder {
// for more info.
// This is the type we will use to marshall information on disk during
// ser/de. It is a simple tuple composed of primitive types and simple
// Ser/De. It is a simple tuple composed of primitive types and simple
// collection types like vector, optional, and dict.
using SerializationType = std::tuple<
std::vector<std::string> /*input_names_*/,

View File

@ -487,7 +487,7 @@ class TestExport(TestCase):
eps = [ep]
if test_serdes:
# test dynamic shapes serialization
# test that behavior remains the same when exporting with ser/des specs:
# test that behavior remains the same when exporting with Ser/Des specs:
# serialize + deserialize original specs, and export.
ep_serdes = export(
model,
@ -927,7 +927,7 @@ graph():
ep = export(f, args, strict=False)
self.assertEqual(ep.module()(*args), f(*args))
@testing.expectedFailureCppSerDes # Cpp serder seems to fail parsing complicated guards
@testing.expectedFailureCppSerDes # Cpp Ser/Der seems to fail parsing complicated guards
def test_export_statically_known_true(self):
class Foo(torch.nn.Module):
def forward(self, x, y):
@ -5011,7 +5011,7 @@ def forward(self, p_linear_weight, p_linear_bias, b_buffer, x):
# There should be nonzero view nodes in the graph
self.assertTrue(view_count > 0)
@testing.expectedFailureCppSerDes # cpp ser/der not handling complicated symbols
@testing.expectedFailureCppSerDes # cpp Ser/Der not handling complicated symbols
def test_solver_unsupported_sympy_function(self):
# repro of https://github.com/pytorch/pytorch/issues/131897

View File

@ -954,7 +954,7 @@ class TestFX(JitTestCase):
script_out = scripted_lowered(x)
torch.testing.assert_close(script_out, ref_out)
# Test TorchScript ser/de
# Test TorchScript Ser/De
import_copy = self.getExportImportCopy(scripted_lowered)
imported_out = import_copy(x)
torch.testing.assert_close(imported_out, ref_out)

View File

@ -1104,7 +1104,7 @@ class TestUnaryUfuncs(TestCase):
self.assertEqual(res.real, out.real, atol=atol, rtol=rtol)
self.assertEqual(res.imag, out.imag, atol=atol, rtol=rtol)
# It is not obvious how to merge this into OpInfo becuase these inputs
# It is not obvious how to merge this into OpInfo because these inputs
# succeed for gradcheck but are expected to fail for gradgradcheck
@dtypes(torch.double)
def test_sinc(self, device, dtype):

View File

@ -36,6 +36,7 @@ rebuilt
reenable
reenabled
requestor
ser
serde
serder
serdes

View File

@ -52,7 +52,7 @@ class Reduction(Enum):
# This wraps a decomposition and performs various type promotion logic within it, depending on the strategy provided
# We're currently re-using ELEMENTWISE_TYPE_PROMOTION_KIND, although some of the usages are on non-elementwise ops
# We're currently reusing ELEMENTWISE_TYPE_PROMOTION_KIND, although some of the usages are on non-elementwise ops
# Will need to validate the non-elementwise uses
def type_casts(
f: Callable,
@ -947,7 +947,7 @@ def im2col(
)
torch._check(
all(c > 0 for c in output_size),
lambda: f"Given an input with spacial size {tuple(shape[-2:])}, "
lambda: f"Given an input with spatial size {tuple(shape[-2:])}, "
f"kernel_size={kernel_size}, dilation={dilation}, "
f"padding={padding}, stride={stride}, "
"the calculated shape of the array of sliding blocks "
@ -4046,7 +4046,7 @@ def nll_loss2d_forward(
return _nll_loss_forward(self, target, weight, reduction, ignore_index)
# These are adapted from aten/src/ATen/native/UpSample.h, wich is based on
# These are adapted from aten/src/ATen/native/UpSample.h, which is based on
# https://en.wikipedia.org/wiki/Bicubic_interpolation#Bicubic_convolution_algorithm
def _upsample_cubic_convolution1(x: Tensor, A: float) -> Tensor:
return ((A + 2) * x - (A + 3)) * x * x + 1

View File

@ -1063,7 +1063,7 @@ def _compile(
return f"'{code.co_name}' ({code.co_filename}:{code.co_firstlineno})"
# NS: Don't add period at the end of string, as it'll be added to URL
# renderring it incorrect
# rendering it incorrect
log.warning(
"torch._dynamo hit config.%s (%s)\n"
" function: %s\n"

View File

@ -347,7 +347,7 @@ class StackLocalsMetadata:
def get_builtins_dict(global_scope):
# f_globals["__builtins__"] can be a dict or a module. This is an
# implemenation detail -
# implementation detail -
# https://docs.python.org/3/library/builtins.html.
# This makes guarding on any builtin messy because the guard check_fn

View File

@ -1662,13 +1662,13 @@ class VariableBuilder:
# <==> variable tracker" 1-to-1 mapping, which is mainly handled via
# `side_effects`. Note that constructing `tensor_variable` above
# already adds it to graph arg, but we never registered it with
# `side_effects`. The pre-emptive `realize` calls here basically
# `side_effects`. The preemptive `realize` calls here basically
# does that registration (at the end of `self.__call__`).
#
# A slightly cleaner alternative is to register the
# `tensor_variable`s above with `side_effects` directly, and just
# return the `list_variable`, but that breaks some tensor-subclass
# releated tests like `test_inputs_aliasing_bytecode_stack_restore`,
# related tests like `test_inputs_aliasing_bytecode_stack_restore`,
# because `tensor_variable` is constructed via
# `handle_traced_output`, which doesn't really expect/handle tensor
# subclass.

View File

@ -134,7 +134,7 @@ def execute_subgraph_from_prim_loop(
):
"""
subgraph: GraphModule from sub-block.
iter_idx: The index of interation.
iter_idx: The index of iteration.
len_loop_local_arguments: The number of loop local arguments in args.
"""
@ -810,7 +810,7 @@ class TS2FXGraphConverter:
fx_node = self.fx_graph.call_function(target, args, kwargs)
# TODO: covnert sourceRange() into stack_trace
# TODO: convert sourceRange() into stack_trace
# fx_node.meta["stack_trace"] = node.sourceRange()
if node.outputsSize() == 1:
@ -883,7 +883,7 @@ class TS2FXGraphConverter:
torch.ops.aten._local_scalar_dense.default, (to_copy_node,)
)
# TODO: covnert sourceRange() into stack_trace
# TODO: convert sourceRange() into stack_trace
# fx_node.meta["stack_trace"] = node.sourceRange()
output_name = node.output().debugName()
@ -942,7 +942,7 @@ class TS2FXGraphConverter:
kwargs,
)
# TODO: covnert sourceRange() into stack_trace
# TODO: convert sourceRange() into stack_trace
# fx_node.meta["stack_trace"] = node.sourceRange()
output_name = node.output().debugName()
@ -1006,7 +1006,7 @@ class TS2FXGraphConverter:
):
target = torch.ops.aten.add.t
else:
raise RuntimeError(f"unable to determind the target for {node}")
raise RuntimeError(f"unable to determined the target for {node}")
else:
target = get_op_overload(node)
@ -1565,7 +1565,7 @@ DEBUG: (TORCH_LOGS="+export" <cmd>), additionally
#
# This function should happen in TS2EPConverter instead of
# TS2FXGraphConverter since it gets attributes from self.ts_model
# which is not accessable in TS2FXGraphConverter. It is similar to where
# which is not accessible in TS2FXGraphConverter. It is similar to where
# we collect self.name_to_param and self.name_to_buffer.
name_to_attribute_fqn: dict[str, str] = {}

View File

@ -165,7 +165,7 @@ def lift_constants_pass(
constant_attrs: ConstantAttrMap,
) -> dict[str, _ConstantAttributeType]:
"""
Takes a graph module, graph signature, and modifies them implace to lift any
Takes a graph module, graph signature, and modifies them inplace to lift any
constants (tensors or custom classes) as inputs to the graph. Returns a
dictionary of names to constants.

View File

@ -100,8 +100,8 @@ def _split_autocast(gm: torch.fx.GraphModule) -> torch.fx.GraphModule:
split_autocast creates a new graph module that splits the input graph module into multiple submodules
based on the `_enter_autocast` and `_exit_autocast` nodes. It doesn't mutate the input graph module.
Nodes between the **outer-most** `_enter_autocast` and `_exit_autocast(_enter_autocast)` are splitted
into a submodule. Nested autocast regions are not splitted.
Nodes between the **outer-most** `_enter_autocast` and `_exit_autocast(_enter_autocast)` are split
into a submodule. Nested autocast regions are not split.
`_enter_autocast` and `_exit_autocast(_enter_autocast)` nodes are in the submodule as well.
Below is an example of splitting. A, B, C, D, E are blocks of non-autocast nodes in the original graph

View File

@ -292,7 +292,7 @@ def _conv1d_op_with_squeeze(
def _transform_conv_with_packedparam(gm: torch.fx.GraphModule, node: torch.fx.Node):
"""Conv specfic transformation function."""
"""Conv specific transformation function."""
assert isinstance(node.target, torch._ops.OpOverload)
opname = node.target._opname
scale_node, zero_point_node = node.args[2], node.args[3]
@ -347,7 +347,7 @@ def _transform_conv_with_packedparam(gm: torch.fx.GraphModule, node: torch.fx.No
def _transform_linear_with_packedparam(gm: torch.fx.GraphModule, node: torch.fx.Node):
"""Linear specfic transformation function."""
"""Linear specific transformation function."""
scale_node, zero_point_node = node.args[2], node.args[3]
inp_node, param_node = node.args[0], node.args[1]

View File

@ -46,7 +46,7 @@ def _replace_with_hop_helper(
enter_block_node.meta.get("nn_module_stack", {})
)
output_node = next(iter(reversed(sub_gm.graph.nodes)), None)
# Split_module pass intentially doesn't add output node
# Split_module pass intentionally doesn't add output node
# if the graph doesn't return anything.
# TODO (tmanlaibaatar) Figure out if this is right behaviour
# for split_module
@ -97,7 +97,7 @@ def _replace_with_hop_helper(
node_replace_(node, get_item_node)
else:
raise NotImplementedError(
f"repalce_with_hop_pass doesnt' support output type {type(output_args)}"
f"replace_with_hop_pass doesn't support output type {type(output_args)}"
)
else:
# TODO (shangdiy): remove this line, since the export graph can be non-functional

View File

@ -382,7 +382,7 @@ class ModuleCallSignature:
out_spec: Annotated[str, 40]
# This field is used to prettify the graph placeholders
# after we ser/der and retrace
# after we Ser/Der and retrace
forward_arg_names: Annotated[Optional[list[str]], 50] = None
@ -413,7 +413,7 @@ class GraphModule:
# Invariant: Every time a change is made to the schema, one of the versions
# should be upadted.
# should be updated.
@dataclass
class SchemaVersion:
major: Annotated[

View File

@ -689,7 +689,7 @@ def check(commit: _Commit, force_unsafe: bool = False):
for f, d in fields.items():
if kind == "struct" and "default" not in d:
reason += (
f"Field {k}.{f} is added to schema.py without a default value as an incomparible change "
f"Field {k}.{f} is added to schema.py without a default value as an incompatible change "
+ "which requires major version bump.\n"
)
next_version = [commit.base["SCHEMA_VERSION"][0] + 1, 1]

View File

@ -1408,7 +1408,7 @@ class GraphModuleSerializer(metaclass=Final):
assert isinstance(
return_schema.real_type, (torch.OptionalType, torch.TensorType)
)
# When the return type is annoated as Tensor type, the op can also return an
# When the return type is annotated as Tensor type, the op can also return an
# undefined Tensor which will be implicitly converted to None in Python.
output_arguments.append(Argument.create(as_none=True))
elif isinstance(meta, FakeTensor):
@ -2057,7 +2057,7 @@ class GraphModuleDeserializer(metaclass=Final):
_additional_msg = (
(
f"We failed to resolve {target} to an operator. "
+ "If it's a custom op/custom triton op, this is usally because the custom op is not registered"
+ "If it's a custom op/custom triton op, this is usually because the custom op is not registered"
+ " when deserializing. Please import the custom op to register it before deserializing."
+ " Otherwise, please file an issue on github."
)

View File

@ -41,7 +41,7 @@ def _get_field_names(cls) -> set[str]:
# this decorator to configure it. It's safe, faster and allows code sharing.
#
# For example, _union_dataclass customizes the __eq__ method to only check the type
# and value property instead of default implmentation of dataclass which goes
# and value property instead of default implementation of dataclass which goes
# through every field in the dataclass.
@dataclass_transform(eq_default=False)
def _union_dataclass(cls: type[T]) -> type[T]:

View File

@ -1269,7 +1269,7 @@ def _collect_all_valid_cia_ops() -> set["OperatorBase"]:
def _get_decomp_for_cia(op: "OperatorBase"):
# [NOTE] Seperating out func.decompose
# [NOTE] Separating out func.decompose
# Ideally we should be able to just register func.decompose but
# we can't as this decomp is gonna be registered to the py_impl.
# As a result it will infinitely recurse. So we first check if the op

View File

@ -279,7 +279,7 @@ def check_cacheable(gm: torch.fx.GraphModule):
# Subgraphs are only used for caching logic.
if hasattr(gm, "saved_tensors_hooks_pack_0"):
check_cacheable(gm.saved_tensors_hooks_pack_0) # type: ignore[arg-type]
# We have guarantee of unpack sugraph existance if pack subgraph exists
# We have guarantee of unpack sugraph existence if pack subgraph exists
check_cacheable(gm.saved_tensors_hooks_unpack_0) # type: ignore[arg-type]

View File

@ -61,7 +61,7 @@ static_input_logger = getArtifactLogger("torch._dynamo", "cudagraph_static_input
# We assume tangents memory format to be similar to corresponding output's memory_format.
# The idea is that we are technically making a guess about the strides of our tangents,
# while we trace out the joint.
# If runtime specfied tangents will not have the same memory format as predicted traced tangents,
# If runtime specified tangents will not have the same memory format as predicted traced tangents,
# we coerce them at runtime to traced tangents memory format.
@ -83,7 +83,7 @@ def coerce_tangent_and_suggest_memory_format(x: Tensor):
out = out.contiguous(memory_format=memory_format.memory_format)
updated = was is not out
# For subclass we keep memory format of outer strides at the beggining of the list
# For subclass we keep memory format of outer strides at the beginning of the list
out_memory_format = [memory_format] if is_subclass else memory_format
# Note [Tangents memory format, Part 2]
@ -583,7 +583,7 @@ from a multi-output view call"
and not o.requires_grad
):
# In theory we could use any of these tensors to regenerate the aliased outputs from,
# since they all alias each other and have identical metatadata
# since they all alias each other and have identical metadata
out_alias = outs_with_identical_metadata_that_require_grad[0]
existing_out_idx = out_tensor_ids[id(out_alias)]
output_type = OutputType.alias_of_intermediate_base_is_user_output
@ -702,7 +702,7 @@ from a multi-output view call"
# (a * b).sum().backward()
#
# We can not deduce it easily now, so introducing a debug config to be able to turn off this for specific cases.
# NJT gurantees to have its tangent as NJT, because it has dedicated integration in Autograd
# NJT guarantees to have its tangent as NJT, because it has dedicated integration in Autograd
# See torch/csrc/autograd/python_function.cpp, use_zeros_like.
(
_plain_fake_tensor_like_subclass(inp)

View File

@ -371,7 +371,7 @@ class FunctionalTensorMetadataEq:
if other is None:
return True
# Comparison agains any other type is not implemented.
# Comparison against any other type is not implemented.
if not isinstance(other, FunctionalTensorMetadataEq):
return NotImplemented

View File

@ -1048,7 +1048,7 @@ def maybe_inline_graph_saved_tensors_hooks(
fw_outs_bw_ins_node_names.append(new_node_name)
else:
# We can not specify desired name in node_copy.
# Copying node manually to set specifc name,
# Copying node manually to set specific name,
# to have matching fw_outs, bw_inputs names.
new_node_name = _gen_unused_name(f"{saved.name}_hook_{out_idx}")
with fw_g.inserting_before(_n):
@ -1458,7 +1458,7 @@ def aot_dispatch_autograd(
# It's possible to construct a case where eager may or may not have have tried to autograd through y,
# depending on the actual grad_outputs that were passed in during the backward.
# There is no easy fix for this: the simplest fix would be to run with `retain_graph=True`,
# allowing autograd to re-use the graph.
# allowing autograd to reuse the graph.
#
# An example of this case is:
# def f(x):

View File

@ -1440,7 +1440,7 @@ def merge_view_inputs(
# to have incorrect sizes.
example_idx = aliased_input_indices[0]
example_alias = fwd_inputs[example_idx]
# Note that this function is re-used at both trace time and runtime.
# Note that this function is reused at both trace time and runtime.
# At trace time, we're under a FakeMode so synthetic_base becomes a FakeTensor.
synthetic_base = torch.empty(
(0,), dtype=example_alias.dtype, device=example_alias.device
@ -1519,7 +1519,7 @@ def merge_view_inputs(
# unless we suspect that inductor might specialize and insert additional guards. When we do lazy
# lowering, we stash the AOT backward graph (bw_module) in this class.
#
# Lowering passes are performed on a deepcopy of this bw_module due to compatbility
# Lowering passes are performed on a deepcopy of this bw_module due to compatibility
# with compiled autograd. See: https://github.com/pytorch/pytorch/pull/149229#discussion_r2002122645.
@dataclass
class AutogradLazyBackwardCompileInfo:
@ -1842,7 +1842,7 @@ def coerce_to_expected_memory_format(x: torch.Tensor, memory_format: MemoryForma
return x
# Empty_strided creates a raw Tensor.
# We are guranteed that only raw Tensors has expected size and stride.
# We are guaranteed that only raw Tensors has expected size and stride.
# Subclasses have only expected memory_format.
restrided = torch.empty_strided(
size=expected_size,

View File

@ -224,7 +224,7 @@ class SubclassCreationMeta:
# arg_count is inclusive of the arg_counts of any
# inner tensor subclasses: If I have a TwoTensor and
# both of its inner elements are TwoTensors, then the
# arg_count of the outer-most sublass will be 4
# arg_count of the outer-most subclass will be 4
arg_count: int
# Mark where or not symints were included. This flag is only used in one assertion
# in "wrap_tensor_subclasses"
@ -384,7 +384,7 @@ class ViewAndMutationMeta:
# metadata pass of the user's forward function.
# Their only use today is to pass them as a best-guess for tangents when tracing the joint.
# Stashing them as part of our "metadata" makes it simpler if we want to run our analysis
# pass once, and re-use the output throughout AOTAutograd
# pass once, and reuse the output throughout AOTAutograd
traced_tangents: list[Any]
# Each of these is a list telling us about subclasses for the inputs/outputs/grad_outs

View File

@ -370,7 +370,7 @@ def wrap_tensor_subclasses(
# we computed subclass metadata on every forward output, but this did **not** include activations
# created by the partitioner.
# as a result, `unwrapped_args` here will correspond to (*unwrapped_user_fw_outs, *activations),
# but `subclass_metas` will only correspond to subclass metatadata on `user_fw_outs`.
# but `subclass_metas` will only correspond to subclass metadata on `user_fw_outs`.
# We then need to make sure that we return (*wrapped_user_fw_outs, *activations).
if num_fw_outs_saved_for_bw is not None:
assert len(unwrapped_args) == num_args_tallied + num_fw_outs_saved_for_bw, (
@ -396,7 +396,7 @@ def wrap_tensor_subclasses(
def wrap_tensor_subclasses_maybe_joint(
unwrapped_args, *, is_joint_structure: bool, meta: ViewAndMutationMeta
) -> Union[tuple[Any, ...], list[Any]]:
# Since this function is re-used for both inference and joint graphs,
# Since this function is reused for both inference and joint graphs,
if is_joint_structure:
assert isinstance(unwrapped_args, tuple) and len(unwrapped_args) == 2
assert isinstance(unwrapped_args[0], (tuple, list)) and isinstance(

View File

@ -365,7 +365,7 @@ AOT_COUNTER = itertools.count()
#
# We view every forward output when creating out tangent tensors to handle the problematic
# case in which a subclass does extra aliasing between graph outputs/inputs in a way that
# is not visible above the sublass.
# is not visible above the subclass.
#
# Ordinarily, when constructing the joint function that we want to trace in AOTAutograd,
# we're guaranteed that the tangent tensors that we pass
@ -872,7 +872,7 @@ def aot_function(
This API is experimental and likely to change.
Args:
fn (Callable): A Python function that takes one ore more arguments. Must
fn (Callable): A Python function that takes one or more arguments. Must
return one or more Tensors.
fw_compiler (Callable): A Python function that accepts an Fx graph with
Aten ops and input args, and returns a Callable that semantically is
@ -1260,7 +1260,7 @@ def aot_export_module(
# Your module can return multiple outputs, so you must specify which output the loss is.
output_loss_index: Optional[int] = None,
pre_dispatch: bool = False,
# If None, will be infered from inputs and mod.graph.nodes if mod is a graph module, but the inferred result might be wrong.
# If None, will be inferred from inputs and mod.graph.nodes if mod is a graph module, but the inferred result might be wrong.
dynamic_shapes: Optional[bool] = None,
kwargs=None,
) -> tuple[torch.fx.GraphModule, GraphSignature]:
@ -1459,7 +1459,7 @@ def aot_export_joint_simple(
*,
trace_joint: bool,
# It looks like the main consequence of this API is that for dynamic shapes,
# it will assume that parms/buffers are static.
# it will assume that params/buffers are static.
# With the new inferred dynamic shapes API, maybe this doesn't matter?
num_params_buffers: int = 0,
decompositions: Optional[dict] = None,
@ -1570,7 +1570,7 @@ def _aot_export_function(
# We don't know this info at trace time though, so we need to make it an explicit config.
no_tangents: bool = False,
pre_dispatch: bool = False,
# If None, `dynamic_shapes` will be infered from inputs, but the inferred result might be wrong.
# If None, `dynamic_shapes` will be inferred from inputs, but the inferred result might be wrong.
dynamic_shapes: Optional[bool] = None,
keep_input_mutations: bool = False,
kwargs=None,

View File

@ -179,7 +179,7 @@ def raise_getitems(gm: fx.GraphModule) -> fx.GraphModule:
)
# loop through getitem nodes in the graph and raise them to the parent node
# in reverse order to perserve their original relative order
# in reverse order to preserve their original relative order
for node in reversed(getitem_nodes):
assert len(node.all_input_nodes) == 1
parent = node.all_input_nodes[0]

View File

@ -31,7 +31,7 @@ from .partitioners import (
log = logging.getLogger(__name__)
# These canonicalization are needed here (and not decompositions), as the ops
# These canonicalizations are needed here (and not decompositions), as the ops
# we're trying to canonicalize to CompositeImplicitAutograd.
def _canonicalize(fx_g):
for node in fx_g.graph.find_nodes(
@ -249,7 +249,7 @@ def memory_efficient_fusion(
Args:
fn (Union[Callable, nn.Module]): A Python function or a ``nn.Module``
that takes one ore more arguments. Must return one or more Tensors.
that takes one or more arguments. Must return one or more Tensors.
**kwargs: Any other overrides you want to make to the settings
Returns:

View File

@ -292,7 +292,7 @@ strict_autograd_cache = False
# which can reorder or ,delete duplicate nodes in the graph
# - If any of these passes reorder/delete/duplicate a collective
# in a setting where the compiler is being run independently on multiple
# ranks, we run the risk that the compiler will make a different decison on
# ranks, we run the risk that the compiler will make a different decision on
# different ranks, resulting in a NCCL hang when using torch.compile
# To handle this, we will (by default) ensure that collectives are not modified
# by the compiler.

View File

@ -513,7 +513,7 @@ def should_quantize(node: torch.fx.Node) -> bool:
].get("skip_dynamo_guards", False):
return size_in_mb >= size_threshold
else:
# case 1: we alway quantize tensors with dynamic shapes
# case 1: we always quantize tensors with dynamic shapes
if torch._inductor.config.post_grad_fusion_options[
"activation_quantization_aten_pass"
].get("quantize_dynamic_shape", False):
@ -521,7 +521,7 @@ def should_quantize(node: torch.fx.Node) -> bool:
size_in_mb >= size_threshold
) or not statically_known_false(size_in_mb >= size_threshold)
else:
# case 2: we alway not quantize tensors with dynamic shapes
# case 2: we always not quantize tensors with dynamic shapes
return statically_known_true(size_in_mb >= size_threshold)
@ -592,7 +592,7 @@ def quantize_activation_fw(graph: torch.fx.Graph) -> None:
output_updated_args = [
node_to_quant[node] if node in node_to_quant else node for node in fwd_outputs
]
# add the scale nodes to the ouput find the first sym_node in the output
# add the scale nodes to the output find the first sym_node in the output
idx = find_first_sym_node(output_updated_args)
scale_nodes = tensor_scale_nodes + sym_scale_nodes
if scale_nodes:
@ -1094,7 +1094,7 @@ def reordering_to_mimic_autograd_engine(gm: fx.GraphModule) -> fx.GraphModule:
"""
This pass finds the first bwd node in the graph (by looking at users of
tangents) and then reorders the graph by walking from this node to all the
way to the end of the graph. At each op in this traveral, we insert this op
way to the end of the graph. At each op in this traversal, we insert this op
in a new graph and try to bring only the relevant subgraph from the other
non-bwd edges relevant for this op. This closely mimics the behavior of
autograd engine.
@ -1364,7 +1364,7 @@ def functionalize_rng_ops(
get_device(node_pair["fwd"]) for node_pair in recomputable_rng_ops_map.values()
)
devices.discard(torch.device("cpu"))
# multiple cuda devices wont work with cudagraphs anyway,
# multiple cuda devices won't work with cudagraphs anyway,
# fallback to non graphsafe rng checkpointing
multi_cuda_devices = len(devices) > 1

View File

@ -586,7 +586,7 @@ class FunctionalCallableWithEpilogue:
def __call__(self, *args, **kwargs):
# We call torch.func.functionalize. This allows us to inline the epilogue graph.
# Inlining has the benefit of allowing easiser fusion inside subgraph.
# Though the epilogue graph contains copy_, it is OK becuase inductor can handle it
# Though the epilogue graph contains copy_, it is OK because inductor can handle it
# and this is also how we have been supporting top-level graph input mutation.
return tuple(torch.func.functionalize(self.orig_callable)(*args, **kwargs))
@ -944,7 +944,7 @@ def auto_functionalized_v2_proxy(
# Below code materializes the callable inputs to the hop as graph modules.
# kwargs may contain general callables, that are not proxable e.g. FunctionWithNoFreeVars
# this could happen when we auto_functionalize the backward of the hop,
# where backward fn is a callablle that wrapps forward graph module.
# where backward fn is a callablle that wraps forward graph module.
# This function materialize the callable args according to the schema of the hop.
# We cannot materialize the callables in kwargs directly because the inputs to callable

View File

@ -198,7 +198,7 @@ class BaseHOP(HigherOrderOperator, abc.ABC):
import warnings
warnings.warn(
"Aliasing is not suppported for HOP subgraph.\n"
"Aliasing is not supported for HOP subgraph.\n"
f"{subgraph.print_readable(print_output=False)}\n"
f"Alias info: inp-inp alias: {inp_inp_alias}, inp-out alias: {inp_out_alias}, out-out alias{out_out_alias}"
f"This may lead to silent incorrectness."

View File

@ -348,7 +348,7 @@ class CondAutogradOp(torch.autograd.Function):
operands = saved_tensors_and_symints(ctx)
args = operands + flat_grads
# TODO: we need to materialize the bw graphs because dynamo is unable to
# trace through the joint funcion when torch.compile torch.autograd.grad.
# trace through the joint function when torch.compile torch.autograd.grad.
true_bw_gm = materialize_as_graph(
ctx._true_bw_fn,
args,
@ -552,7 +552,7 @@ def _merge_output(
...
Case 2: At least one dimension has size 1, which can produce duplicates in strides.
In this case, theorectically, we cannot uniquely determine the expr of strides because
In this case, theoretically, we cannot uniquely determine the expr of strides because
the accessing stride_expr with same key in different order causes the final stride expression
to be different.
@ -562,7 +562,7 @@ def _merge_output(
merged_size: (u0, u1)
The stride expr could either be (u1, 1) or (1, u0) depending on whether we start with u1 or u0.
For this reason, we try to break tie by sorting via decending index so we always get (u1, 1).
For this reason, we try to break tie by sorting via descending index so we always get (u1, 1).
Note that backend might optimize the strides anyway so this is usually not a problem as long
as two branches matches. See relevant discussions in https://github.com/pytorch/pytorch/issues/142024.

View File

@ -560,9 +560,9 @@ def _(ctx, subgraph, identifier, *operands):
# We call auto_functionalized_v2 to support input mutation of invoke_subgraph.
# See NOTE [Support input mutation of hops] for the overall design.
#
# invoke_subgraph is special because of its identifier based caching machanism.
# invoke_subgraph is special because of its identifier based caching mechanism.
# In invoke_subgraph's functionalization key implementation, we create a new
# identifer because the subgraph is replaced by FunctionWithNoFreeVars in a
# identifier because the subgraph is replaced by FunctionWithNoFreeVars in a
# functional + epilogue form.
assert isinstance(identifier, str), identifier
return do_auto_functionalize_v2(
@ -635,7 +635,7 @@ def _(proxy_mode: ProxyTorchDispatchMode, subgraph, identifier, *operands):
# with a previously cached identifier, the corresponding graph module might not
# exist as a submodule in the new tracer's root. Therefore, we register it as a submodule below.
#
# The alternative is to give a new identifer when we re-trace the invoke_subgraph but this will increase
# The alternative is to give a new identifier when we re-trace the invoke_subgraph but this will increase
# the compilatoin time, which defeats the purpose of caching.
registered_before = False
for (

View File

@ -117,7 +117,7 @@ def map(
*args: TypeVarTuple,
):
r"""
Perfoms a map of f with xs. Intuitively, you can think of the semantic being:
Performs a map of f with xs. Intuitively, you can think of the semantic being:
out = []
for idx in len(xs.size(0)):

View File

@ -135,7 +135,7 @@ def scan(
and the second output of ``combine_fn`` represents a slice of the output.
This function must be pure, i.e., no lifted arguments are supported at the moment
and may not have any side effects.
init (torch.Tensor or pytree with tensor leaves): The inital scan carry, a tensor, or nested pytree of tensors.
init (torch.Tensor or pytree with tensor leaves): The initial scan carry, a tensor, or nested pytree of tensors.
The ``init`` is expected to have the same pytree structure as the first output element (i.e. carry)
of ``combine_fn``.
xs (torch.Tensor or pytree with tensor leaves): The input tensor, or nested pytree of tensors.
@ -154,7 +154,7 @@ def scan(
- The combine_fn shouldn't have any aliasing between input-input, input-output, and output-output. E.g. return a view
or the same tensor as input is not supported. As a workaround, can clone the output to avoid aliasing.
- The combine_fn shoudn't mutate any inputs. We'll remove the mutation restriction for inference soon. Please file an issue
- The combine_fn shouldn't mutate any inputs. We'll remove the mutation restriction for inference soon. Please file an issue
if you input mutation support for training is needed.
- The combine_fn's init carry should match the next_carry in pytree structure and in tensor metadata.
@ -585,7 +585,7 @@ class ScanAutogradOp(torch.autograd.Function):
carry, y = _extract_carry_and_out(combine_fn(*args), num_leaves_init)
return [
*carry,
# We additionally checkpoint all the intemediate carry outputs for backward.
# We additionally checkpoint all the intermediate carry outputs for backward.
*[
n_c.clone().detach() if isinstance(n_c, torch.Tensor) else n_c
for n_c in carry
@ -793,7 +793,7 @@ class ScanAutogradOp(torch.autograd.Function):
# Prepare the bwd_init
bwd_init = [*initial_g_additional_inputs, *g_c_T]
# 5.) Perform the backwrad scan:
# 5.) Perform the backward scan:
# The ``combine_fn_bw_wrapped`` receives the
# initial_g_additional_inputs and the last carry as the ``bwd_init`` and the
# gradients of the outputs (g_ys), as well as the fw_carries and the fw_xs of the forward as the ``bwd_xs``

View File

@ -18,7 +18,7 @@ class HopArgumentInfo:
example_value: Any
# Provide an default_value
default_value: Any
# Whether this arugment gets mutated in the hop subgraph.
# Whether this argument gets mutated in the hop subgraph.
# For output, this should always be False
is_mutated: bool
kw_only: bool

View File

@ -136,7 +136,7 @@ def inner(mode, *args, **kwargs):
# When tracing with fake script object, the call_torchbind op will return a fake tensor
# When tracing with real script object, the call_torchbind op may return a real tensor,
# we need to convert it to fake tensor mannually. Dynamic shape is surpported.
# we need to convert it to fake tensor manually. Dynamic shape is supported.
@call_torchbind.py_impl(FakeTensorMode)
def call_torchbind_fake(mode, *args, **kwargs):
with mode:

View File

@ -1037,7 +1037,7 @@ def triton_kernel_wrapper_mutation_dense(
# as we need to launch the kernel here, we "unwrap" the
# tma_descriptor_metadata, create the TMA descriptors
# from it, and replace the tensors in the kwargs by the
# correspoinding TMA descriptors before launching
# corresponding TMA descriptors before launching
kwargs = kwargs.copy()
for k, v in tma_descriptor_metadata.items():
tensor = kwargs[k]

View File

@ -852,7 +852,7 @@ def check_input_alias_and_mutation_return_outputs(
# Clone the fake args to avoid mutating the original fake args
with ExitStack() as ctx_stack:
# We need to re-use prev_fake_mode's shape env to resolve
# We need to reuse prev_fake_mode's shape env to resolve
# the runtime assertions for unbacked symbols.
new_fake_mode = torch._subclasses.FakeTensorMode(
shape_env=_get_shape_env(fake_args),

View File

@ -107,9 +107,9 @@ def while_loop(cond_fn, body_fn, carried_inputs):
- body_fn and cond_fn must not in-place mutate the carried_inputs. A clone before the mutation is required.
- body_fn and cond_fn must not mutate python varialbles (e.g. list/dict) created outside of the body_fn.
- body_fn and cond_fn must not mutate python variables (e.g. list/dict) created outside of the body_fn.
- body_fn and cond_fn's output cannot aliase any of the inputs. A clone is required.
- body_fn and cond_fn's output cannot alias any of the inputs. A clone is required.
.. warning::
Temporal Limitations:
@ -279,8 +279,8 @@ def while_loop_tracing(mode, cond_fn, body_fn, carried_inputs, additional_inputs
# For this reason, we treat int, symint outputs in the same way:
# - they can match against any of int, symint carry
# - we unspecialize them with new unbacked symints in fake while_loop
# Similarly, we could do some analysis to refine the output ranges but it's eaiser to start with
# fresh unbacked symints. One suprising case can be: an input unbacked symint is constrained by
# Similarly, we could do some analysis to refine the output ranges but it's easier to start with
# fresh unbacked symints. One surprising case can be: an input unbacked symint is constrained by
# users to be >= 0 (either before while_loop or inside body_fn) and it increments by 1 in each
# iteration. Ideally, we should know that the final output is >= 0 but we didn't constrain the
# unbacked symint output of subgraph as of today because this requires a smart range analysis.

View File

@ -14,7 +14,7 @@ python profile_analysis.py --analysis <input_json_profile> <default_dtype>
- `default_dtype`: The default dtype of the model. Sometimes the dtypes of the kernel inputs are not available in the profile, so we use the default dtype to infer the dtypes of the inputs.
## Diff
This mode will diff two different profiles and output a table of the differences. It groups by kernel name, which can fail to properly match accross hardware vendors. More intelligent grouping coming soon.
This mode will diff two different profiles and output a table of the differences. It groups by kernel name, which can fail to properly match across hardware vendors. More intelligent grouping coming soon.
### Usage
```
@ -25,7 +25,7 @@ python profile_analysis.py --diff <json_profile_1> <profile_name_1> <json_profil
- `json_profile_1` `json_profile_2`: The json profile files generated by `torch.profile.export_chrome_trace()`.
- `profile_name_1` `profile_name_2`: The name of the profile. This is used to identify the profile in the output table.
- `default_dtype`: The default dtype of the model. Sometimes the dtypes of the kernel inputs are not available in the profile, so we use the default dtype to infer the dtypes of the inputs.
- `name_limit`: The maximum number of characters in the kernel name (they can be quite lengthly and hard to read).
- `name_limit`: The maximum number of characters in the kernel name (they can be quite lengthy and hard to read).
## Augment
This mode will add post-hoc analysis to a profile. Currently, it will add the flops and the memory reads of a kernel via formula (it's not looking at program counters or anything.) These, combined with the kernel duration, can be use to calculate achieved flops, achieved memory bandwidth, and roofline calculations.

View File

@ -310,7 +310,7 @@ def _create_extern_mapping(
data: dict[str, Any],
) -> defaultdict[int, list[dict[str, Any]]]:
"""
compute a mapping from exteral ids to non kernels, which contain the information we need to estimate flops etc
compute a mapping from external ids to non kernels, which contain the information we need to estimate flops etc
"""
extern_mapping: defaultdict[int, list[dict[str, Any]]] = defaultdict(list)
for event in data["traceEvents"]:
@ -402,7 +402,7 @@ class JsonProfile:
dtype: Optional[Union[torch.dtype, str]] = None,
):
"""
Convienence class for running common operations on chrome/perfetto json traces.
Convenience class for running common operations on chrome/perfetto json traces.
"""
self.path = path
with open(path) as f:

View File

@ -1829,7 +1829,7 @@ class AotCodeCompiler:
consts_asm += f"{symbol_prefix}_binary_constants_bin_end:\n"
return consts_asm, "S"
# Use c++ to comvert consts to object file can support more compilers, such as msvc and icx.
# Use c++ to convert consts to object file can support more compilers, such as msvc and icx.
def format_consts_to_cpp(
consts: bytes, align_bytes: int, symbol_prefix: str
) -> tuple[str, str]:

View File

@ -663,7 +663,7 @@ def _sink_waits_iterative_internal(
data_dep = o.get_name()
break
# 1. If we have data_dep - we can not swap => trying to group
# 2. If swap candidate and current node boths contain collectives => trying to group
# 2. If swap candidate and current node both contain collectives => trying to group
if data_dep is not None or (
both_contain_comms := (
contains_collective(wait_gsnode)

View File

@ -1235,7 +1235,7 @@ class CachingAutotuner(KernelInterface):
if launcher.store_cubin and (not benchmark_run or not self.cuda_kernel_saved):
self.save_gpu_kernel(stream, launcher)
# PyTorch execution trace replay calls CachingAutotuner::run() instread of calls launcher
# PyTorch execution trace replay calls CachingAutotuner::run() instead of calls launcher
# so _RecordFunctionFast need to capture the args into CachingAutotuner::run()
# make a copy here to avoid mutating the original args
args_without_constexprs = tuple(args)

View File

@ -56,9 +56,9 @@ class ReturnValueHandler:
r"""
When ltc_sync_multi is called on multi tensors, the compiled graph
will contain output only for unique tensors - if a tensor appears multiple
times in the input to _ltc_sync_multi, only the first occurance matters.
times in the input to _ltc_sync_multi, only the first occurrence matters.
However from python level, we still expect multi tensors returned with duplciation
However from python level, we still expect multi tensors returned with duplication
even if the TS graph dedup the output. e.g. for method:
def forward(self, a):
@ -123,7 +123,7 @@ def force_lazy_device(model: fx.GraphModule):
# To force those tensors on the lazy device, we can not simply override
# the device argument since there is no explicit device argument.
# What we are doing here is, for the list of covered tensor factory methods
# we add a lazy device argument explicity.
# we add a lazy device argument explicitly.
#
# TODO: This solution is no ideal since we may miss some factory methods. In future
# when we support lazy mode, this method can be replaced by that.
@ -170,7 +170,7 @@ def extract_compiled_graph(model: fx.GraphModule, example_inputs) -> Callable:
if len(fallback_ops) > 0:
raise RuntimeError(
f"Fail to extact the compiled graph because of fallback: {','.join(fallback_ops)}"
f"Fail to extract the compiled graph because of fallback: {','.join(fallback_ops)}"
)
if not isinstance(lazy_out, (tuple, list)):

View File

@ -13,7 +13,7 @@ def counter_names():
def counter_value(name: str):
"""Return the value of the counter with the speficied name"""
"""Return the value of the counter with the specified name"""
return torch._C._lazy._counter_value(name)

View File

@ -137,7 +137,7 @@ def maybe_to_fake_obj(
# x.__obj_flatten__() could be calling some tensor operations inside but we don't
# want to call these ops in surrounding dispatch modes when executing it.
# Otherwise, for example, the fake tensor modes will error out when the tensors inside
# script obeject execute some operations like clone if allow_non_fake_input flag is set.
# script object execute some operations like clone if allow_non_fake_input flag is set.
with _disable_current_modes():
flat_x = x.__obj_flatten__() # type: ignore[attr-defined]
@ -238,8 +238,8 @@ def register_fake_class(qualname, fake_class: Optional[HasStaticMethodFromReal]
def size(self):
return len(self.queue)
In this example, the original TensorQeue need to addd a __obj_flatten__ method
to the class TensorQueue and the flattend result is passed into FakeTensorQueue's
In this example, the original TensorQeue need to add a __obj_flatten__ method
to the class TensorQueue and the flattened result is passed into FakeTensorQueue's
__obj_unflatten__ as inputs to create a fake class. This protocol allows pytorch to look
at the contents of the script object and properly handle them in the subsystems
like dynamo, aot_aotugrad or more.
@ -248,7 +248,7 @@ def register_fake_class(qualname, fake_class: Optional[HasStaticMethodFromReal]
def inner(fake_class: HasStaticMethodFromReal):
ns, name = parse_namespace(qualname)
# This also checks whether the refered torch::class_ exists.
# This also checks whether the referred torch::class_ exists.
torch._C._get_custom_class_python_wrapper(ns, name)
from_method = getattr(fake_class, _CONVERT_FROM_REAL_NAME, None)

View File

@ -102,7 +102,7 @@ def unsafe_generate_fake_kernels(op_profiles: dict[str, set[OpProfile]]) -> Gene
an output with the same metadata as in the recorded profile. If a profile
doesn't exist then an exception will be thrown.
The fake kernel generation is considerd unsafe because it relies on the
The fake kernel generation is considered unsafe because it relies on the
rigid, pre-defined operator profiles that do not account for potential
variations in output behavior. Specifically, the generated kernels assume a
fixed relationship between input and output ranks. However, in reality, it's

View File

@ -14,7 +14,7 @@ from torch._ops import OpOverload
def warn_deploy(stacklevel=3):
warnings.warn(
"Python torch.library APIs do nothing under torch::deploy (multipy). "
"Python torch.library APIs do nothing under torch::deploy (multipy). " # codespell:ignore multipy
"Please instead use C++ custom operator registration APIs.",
RuntimeWarning,
stacklevel=stacklevel,
@ -442,7 +442,7 @@ class MutationChecker:
f"{self.op._name}: for argument '{info.name}': the operator's schema "
f"{self.op._schema} specified that "
f"the operator {'mutates' if info.is_write else 'does not mutate'} "
f"the argument, but this seems to be emperically wrong. "
f"the argument, but this seems to be empirically wrong. "
f"Please make the schema and operator behavior consistent. "
f"You can specify that an operator mutates a Tensor by "
f"e.g. changing its schema type from 'Tensor name' to 'Tensor(a!) name'"

View File

@ -1,6 +1,6 @@
# mypy: ignore-errors
"""Dtypes/scalar type implementaions with torch dtypes.
"""Dtypes/scalar type implementations with torch dtypes.
Here `dtype` is always a torch.dtype, this module knows nothing about
scalar types, wrapper dtypes or anything like that. PyTorch only.

View File

@ -96,7 +96,7 @@ def _concat_cast_helper(tensors, out=None, dtype=None, casting="same_kind"):
else:
out_dtype = _dtypes_impl.result_type_impl(*tensors)
# cast input arrays if necessary; do not broadcast them agains `out`
# cast input arrays if necessary; do not broadcast them against `out`
tensors = _util.typecast_tensors(tensors, out_dtype, casting)
return tensors
@ -1290,7 +1290,7 @@ def cross(a: ArrayLike, b: ArrayLike, axisa=-1, axisb=-1, axisc=-1, axis=None):
def einsum(*operands, out=None, dtype=None, order="K", casting="safe", optimize=False):
# Have to manually normalize *operands and **kwargs, following the NumPy signature
# We have a local import to avoid poluting the global space, as it will be then
# We have a local import to avoid polluting the global space, as it will be then
# exported in funcs.py
from ._ndarray import ndarray
from ._normalizations import (

View File

@ -204,7 +204,7 @@ def _coerce_to_tensor(obj, dtype=None, copy=False, ndmin=0):
Notes
-----
This is almost a "tensor_like" coersion function. Does not handle wrapper
This is almost a "tensor_like" coercive function. Does not handle wrapper
ndarrays (those should be handled in the ndarray-aware layer prior to
invoking this function).
"""

View File

@ -2174,7 +2174,7 @@ def _resize_aten(a: Tensor, shape: ShapeType) -> Tensor:
_resize_doc = """
Gives a tensor with no elements a new shape, returning the modified tensor.
The tensor's strides are contiguous and its values are unitialized.
The tensor's strides are contiguous and its values are uninitialized.
"""
# TODO: review support arbitrary resizes

View File

@ -94,7 +94,7 @@ class elementwise_type_promotion_wrapper:
Takes two kwargs, type_promoting_args and type_promotion_kind.
type_promoting_args must be a string Sequence specifiying the argument names of all
type_promoting_args must be a string Sequence specifying the argument names of all
arguments that participate in type promotion (and should be type promoted). If the
arg specifies a Sequence-type then every element of the Sequence will participate in
type promotion.

View File

@ -6077,7 +6077,7 @@ def bucketize(
if n_boundaries == 0:
return torch.zeros_like(a)
# We are trying to find the bucket (defined by pairs of consecutive elements of `boundaries`)
# each element of `a` belongs to. We use binary search to achieve logarithimic complexity,
# each element of `a` belongs to. We use binary search to achieve logarithmic complexity,
# but each step of the search is done "in parallel" over all elements of `a`
# can't use int32 as indexes, so we have to do all computations with int64 and convert at the end
start = torch.zeros(a.shape, device=a.device, dtype=torch.int64)

View File

@ -760,7 +760,7 @@ def _nll_loss_nd(
batch_size = input.shape[0]
loss = -input[torch.arange(batch_size), target] * current_weight
else:
# 3D case (N batch size, C classe, K dimensions)
# 3D case (N batch size, C classes, K dimensions)
# input (N batch size, C classes, K)
batch_size = input.shape[0]
extent = input.shape[2]

View File

@ -59,7 +59,7 @@ class StrobelightCLIFunctionProfiler:
StrobelightCLIFunctionProfiler can be used to profile a python function and
generate a strobelight link with the results. It works on meta servers but
does not requries an fbcode target.
does not require an fbcode target.
When stop_at_error is false(default), error during profiling does not prevent
the work function from running.

View File

@ -127,7 +127,7 @@ class StrobelightCompileTimeProfiler:
if not shutil.which("strobeclient"):
logger.info(
"strobeclient not found, cant enable compile time strobelight profiling, seems"
"strobeclient not found, can't enable compile time strobelight profiling, seems"
"like you are not on a FB machine."
)
return

View File

@ -231,7 +231,7 @@ def stride_incorrect_op(op):
# These operators have meta implementations with incorrect strides
@register_op_impl(stride_incorrect_op)
def wordaround_stride_incorrect_op(fake_mode, func, *args, **kwargs):
# This is a workaround for meta implmentations with incorrect strides
# This is a workaround for meta implementations with incorrect strides
def is_symbolic(x):
if isinstance(x, FakeTensor):

View File

@ -2366,7 +2366,7 @@ class FakeTensorMode(TorchDispatchMode):
# (aot autograd, torchdynamo) where each operation is run consecutively.
# Because each operation is run in order, we can trace out and support
# sequences like: x = torch.tensor(0.); y = x.add_(1)
# Whenver a constant is written to but with inputs that cannot be evaluated
# Whenever a constant is written to but with inputs that cannot be evaluated
# statically, such as random_(), we invalidate all constants that alias the input
# We will rely on functionalization for use of fake tensors constants as persistent
# objects on an FX Graph.

View File

@ -67,7 +67,7 @@ class FunctionalTensor(torch.Tensor):
# later, as long as it doesn't break anything).
# FunctionalTensorWrapper copies **all** dispatch keys from the inner tensor
# to the wrapper, excluding functorch and python dispatch keys.
# Here I'm trying to re-use the keyset the functorch wrapper subclasses copy,
# Here I'm trying to reuse the keyset the functorch wrapper subclasses copy,
# except that they don't include ZeroTensor so I'm manually adding it in.
_extra_dispatch_keys = torch._C._additional_keys_to_prop_for_wrapper_tensors.add(
torch._C.DispatchKey.ZeroTensor
@ -488,7 +488,7 @@ class FunctionalTensorMode(TorchDispatchMode):
- FunctionalTensor._extra_dispatch_keys
)
# All we want to do here is re-use the existing C++ functionalization logic.
# All we want to do here is reuse the existing C++ functionalization logic.
# This requires swizzling our TLS dispatch keys so that the Functionalize key is active.
with torch._C._ForceDispatchKeyGuard(include_to_set, exclude_to_set):
try:

View File

@ -1643,7 +1643,7 @@ class MetaConverter(Generic[_TensorT]):
with torch.enable_grad():
r = view_from_base(base, t)
# NB: We don't actaully faithfully replicate
# NB: We don't actually faithfully replicate
# autograd connectivity, but that doesn't matter
# today. See following for more info:
# https://gist.github.com/soulitzer/e03f015b314c3f5fcf80888c69390913

View File

@ -348,7 +348,7 @@ c10::intrusive_ptr<OwnerRRef> RRefContext::getOrCreateOwnerRRef(
// here is a plain TensorType, they are not equal relationship:
// specialized TensorType <: plain TensorType
//
// In RPC we don't care the difference as we ser'de with just the
// In RPC we don't care the difference as we Ser/De with just the
// plain TensorType. This is not a issue for UserRRef creation either,
// since Tensor can only get specialized with a previous run of local
// JIT function, and we shouldn't preserve the specialized SubTensorType