[BE][15/16] fix typos in torch/ (torch/distributed/tensor/) (#156605)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/156605
Approved by: https://github.com/wanchaol, https://github.com/albanD
Authored by Xuehai Pan on 2025-07-17 14:55:13 +08:00; committed by PyTorch MergeBot
parent eeda1a75ac
commit 3f8e2e91ad
26 changed files with 37 additions and 39 deletions


@ -1169,7 +1169,6 @@ exclude_patterns = [
'aten/src/ATen/[a-mA-M]*/**',
'test/**',
'test/[a-hA-h]*/**',
'torch/distributed/tensor/**',
]
init_command = [
'python3',


@ -179,7 +179,7 @@ class UtilTest(DTensorTestBase):
)
with self.assertRaisesRegex(
RuntimeError,
"Non-sharded dimentions should have identical size across ranks.",
"Non-sharded dimensions should have identical size across ranks.",
):
_ = compute_global_tensor_shape(
local_shape,


@ -4,9 +4,8 @@ BU
contiguities
contiguity
coo
Din
Dout
dOut
din
dout
ElementE
followings
fro


@ -571,7 +571,7 @@ class DTensor(torch.Tensor):
"""
Return the full tensor of this DTensor. It will perform necessary collectives
to gather the local tensors from other ranks in its DeviceMesh and concatenate
them together. It's a syntatic sugar of the following code:
them together. It's a syntactic sugar of the following code:
``dtensor.redistribute(placements=[Replicate()] * mesh.ndim).to_local()``
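As a hedged illustration of the equivalence that docstring describes, here is a minimal sketch assuming a 1-D CPU mesh of 4 ranks launched via ``torchrun --nproc-per-node=4``; the tensor shape is arbitrary and not taken from the source.

```python
# Sketch of full_tensor() vs. the redistribute/to_local spelling it abbreviates.
# Assumes an initialized 4-rank process group (e.g. under torchrun).
import torch
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import Replicate, Shard, distribute_tensor

mesh = init_device_mesh("cpu", (4,))
dtensor = distribute_tensor(torch.randn(8, 8), mesh, [Shard(0)])

full_a = dtensor.full_tensor()
full_b = dtensor.redistribute(placements=[Replicate()] * mesh.ndim).to_local()
assert torch.equal(full_a, full_b)
```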
@ -1011,7 +1011,7 @@ def _dtensor_init_helper( # type: ignore[no-untyped-def]
# set default placements to replicated if not specified
placements = placements or tuple(Replicate() for _ in range(device_mesh.ndim))
# check device_mesh againts placements
# check device_mesh against placements
assert device_mesh.ndim == len(placements), (
"mesh dimension does not match the length of placements"
)


@ -316,7 +316,7 @@ def redistribute_cost(
NOTE:
1. Only consider communication cost here, since computation costs for redistribute
are quite trival (i.e. we only need to narrow or simple division)
are quite trivial (i.e. we only need to narrow or simple division)
2. Only consider redistribute cost on same mesh, cross mesh communication cost is
not quite needed for operator strategy estimation/selection.
"""


@ -434,7 +434,7 @@ class OpDispatcher:
"Found a non-scalar tensor with numel=1 and ndim!=0, "
"we are implicitly creating a replicated DTensor for it. "
"However, please consider changing it to a scalar tensor "
"or explicitly create a DTensor under distributed enviroment."
"or explicitly create a DTensor under distributed environment."
)
if tensor_arg.numel() == 1 or self._allow_implicit_replication:


@ -244,7 +244,7 @@ class DTensorSpec:
if placement.is_shard():
placement = cast(Shard, placement)
raise RuntimeError(
f"DeviceMesh dimension cann't be mapped to two dimension of the same tensor: {i} and {placement.dim}"
f"DeviceMesh dimension can't be mapped to two dimension of the same tensor: {i} and {placement.dim}"
)
elif placement.is_partial():
raise RuntimeError(


@ -28,7 +28,7 @@ KwargsType = dict[str, object]
PlacementList = list[Optional[Placement]]
# ATen op schemas could have Tensor, Tuple[Tensor] and List[Tensor], so output type sould
# ATen op schemas could have Tensor, Tuple[Tensor] and List[Tensor], so output type should
# be the same set of possibilities.
OutputSpecType = Optional[Union[DTensorSpec, Sequence[Optional[DTensorSpec]]]]


@ -113,7 +113,7 @@ class _MaskPartial(Partial):
def _reduce_value(
self, tensor: torch.Tensor, mesh: DeviceMesh, mesh_dim: int
) -> torch.Tensor:
# by the time we ned reduction, we should have already saved the mask
# by the time we need reduction, we should have already saved the mask
assert self.mask_buffer.data is not None
# apply the mask to the tensor that pending reduction
@ -134,7 +134,7 @@ class _MaskPartial(Partial):
mesh_dim: int,
shard_spec: Placement,
) -> torch.Tensor:
# by the time we ned reduction, we should have already saved the mask
# by the time we need reduction, we should have already saved the mask
assert self.mask_buffer.data is not None
# apply the mask to the tensor that pending reduction


@ -1085,7 +1085,7 @@ def topk_strategy(op_schema: OpSchema) -> OpStrategy:
if dim != topk_dim:
dim_shardings: PlacementList = [Shard(dim)] * 3
single_mesh_dim_strategies.append(dim_shardings)
# TODO: topk on sharded dim requries non-trival reduction, address it later
# TODO: topk on sharded dim requires non-trival reduction, address it later
return expand_to_full_mesh_op_strategy(
input_strategy.mesh, op_schema, single_mesh_dim_strategies, input_index=2


@ -704,7 +704,7 @@ def scaled_dot_product_cudnn_attention_strategy(op_schema: OpSchema) -> OpStrate
None, # max_k
None, # philox_seed
None, # philox_offset
# NOTE: debug_attn_mask is not supproted by pytorch and is always an empty tensor
# NOTE: debug_attn_mask is not supported by pytorch and is always an empty tensor
# https://github.com/pytorch/pytorch/blob/60205b0eb2602317856312a66d955c88334ade0b/aten/src/ATen/native/transformers/cuda/attention.cu#L839-L840
debug_attn_mask_sharding, # debug_attn_mask
Replicate(), # q


@ -300,7 +300,7 @@ def view_groups(from_size: Shape, to_size: Shape) -> DimMap:
Flatten((InputDim(1), InputDim(2)))
)
- ouptut dimension 0 maps to input dimension 0
- output dimension 0 maps to input dimension 0
- output dimension 1 maps to a flattened input dimensions 1 and 2
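For a plain-tensor intuition of that dimension mapping (no DTensor involved; the shapes here are made up), a view from ``(2, 3, 4)`` to ``(2, 12)`` keeps dimension 0 and flattens dimensions 1 and 2:

```python
import torch

t = torch.arange(24).reshape(2, 3, 4)
u = t.view(2, 12)  # output dim 0 <- input dim 0; output dim 1 <- flatten of input dims 1 and 2
assert u.shape == (2, 12)
```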


@ -216,7 +216,7 @@ def map_placements_after_broadcast(
# the input shape shard dim before broadcasting,
# in this case it means implicit broadcasting happen
# in this dim, so we can just mark it as replicate
# and implict broadcast will broadcast automatically
# and implicit broadcast will broadcast automatically
# to the sharded shape
new_placements.append(Replicate())


@ -27,7 +27,7 @@ aten = torch.ops.aten
class LocalShardsWrapper(torch.Tensor):
"""
A wrapper class to hold local shards of a DTensor.
This class is used largely for checkpointing purposes and implicity subtypes
This class is used largely for checkpointing purposes and implicitly subtypes
the _Checkpointable protocol.
"""
@ -159,7 +159,7 @@ class LocalShardsWrapper(torch.Tensor):
]
elif args[0].local_shards()[0].ndim == 1:
assert args[0].storage_metadata().size[0] == view_shape[0]
# This case is for optimizer sharding as regardles of sharding type, optimizer state is row wise sharded
# This case is for optimizer sharding as regardless of sharding type, optimizer state is row wise sharded
res_shards_list = [
aten.view.default(shard, shard.shape, **kwargs)
for shard in args[0].local_shards()


@ -296,7 +296,7 @@ def compute_global_tensor_shape(
for shape_tensor in gathered_shaped_tensors:
if not torch.equal(local_shape[other_dims], shape_tensor[other_dims]):
raise RuntimeError(
"Non-sharded dimentions should have identical size across ranks."
"Non-sharded dimensions should have identical size across ranks."
)
shape_tensor_list = shape_tensor.tolist()
sharded_dim_sum += shape_tensor_list[shard_dim]
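A minimal sketch of the invariant this check enforces, using a hypothetical helper (not the module's API) in place of the gathered shape tensors: every rank must report identical sizes on the non-sharded dimensions, and the sharded dimension is summed to produce the global size.

```python
# Hypothetical stand-in for the check in compute_global_tensor_shape.
def global_shape_from_locals(local_shapes, shard_dim):
    reference = list(local_shapes[0])
    sharded_dim_sum = 0
    for shape in local_shapes:
        for dim, size in enumerate(shape):
            if dim != shard_dim and size != reference[dim]:
                raise RuntimeError(
                    "Non-sharded dimensions should have identical size across ranks."
                )
        sharded_dim_sum += shape[shard_dim]
    reference[shard_dim] = sharded_dim_sum
    return tuple(reference)

assert global_shape_from_locals([(4, 8), (3, 8)], shard_dim=0) == (7, 8)
```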


@ -395,7 +395,7 @@ class CommDebugMode(TorchDispatchMode):
json_dict: dict[str, Any] = {}
add_json_information(json_dict, "Global")
# converts dictonary into json file
# converts dictionary into json file
with open(file_name, "w") as json_file:
json.dump(json_dict, json_file, indent=4)


@ -711,7 +711,7 @@ class CommDebugModeExample:
def run_example(world_size: int, rank: int, example_name: str) -> None:
# set manual seed
# intializing class with all of the functions
# initializing class with all of the functions
instantiated_example = CommDebugModeExample(world_size, rank)
# dict that stores example code function names
name_to_example_code: dict[str, Callable[[], None]] = {


@ -1,7 +1,7 @@
# mypy: allow-untyped-defs
"""
The following example demonstrates how to train a ConvNeXt model
with intermediate activations sharded across mutliple GPUs via DTensor
with intermediate activations sharded across multiple GPUs via DTensor
To run the example, use the following command:
torchrun --standalone --nnodes=1 --nproc-per-node=4 convnext_example.py


@ -231,7 +231,7 @@ def run_torchrec_row_wise_uneven_sharding_example(rank, world_size):
# note: for uneven sharding, we need to specify the shape and stride because
# DTensor would assume even sharding and compute shape/stride based on the
# assumption. Torchrec needs to pass in this information explicitely.
# assumption. Torchrec needs to pass in this information explicitly.
# shape/stride are global tensor's shape and stride
dtensor = DTensor.from_local(
local_shards_wrapper, # a torch.Tensor subclass
@ -324,7 +324,7 @@ def run_torchrec_table_wise_sharding_example(rank, world_size):
# create a DTensor from the local shard for the current table
# note: for uneven sharding, we need to specify the shape and stride because
# DTensor would assume even sharding and compute shape/stride based on the
# assumption. Torchrec needs to pass in this information explicitely.
# assumption. Torchrec needs to pass in this information explicitly.
dtensor = DTensor.from_local(
local_shards,
device_submesh,
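A hedged sketch of the pattern those comments describe: passing the global shape and stride to ``DTensor.from_local`` for an uneven row-wise split. It assumes 2 ranks launched via torchrun; the sizes are illustrative, not taken from the example above.

```python
# Sketch only: DTensor would otherwise assume even sharding, so the global
# shape/stride of the unevenly sharded tensor are passed explicitly.
import torch
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import DTensor, Shard

mesh = init_device_mesh("cpu", (2,))
rank = mesh.get_rank()

# global (5, 4) tensor split row-wise: rank 0 holds 3 rows, rank 1 holds 2 rows
local = torch.randn(3 if rank == 0 else 2, 4)
dtensor = DTensor.from_local(
    local,
    mesh,
    [Shard(0)],
    shape=(5, 4),   # global shape
    stride=(4, 1),  # global (contiguous) stride
)
assert tuple(dtensor.shape) == (5, 4)
```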


@ -239,7 +239,7 @@ class _AllToAllRotater(_RingRotater):
class _AllGatherRotater(_RingRotater):
"""
Allgather the kv and return the only the requried kv.
Allgather the kv and return the only the required kv.
Only one communication will be done.
"""
@ -277,7 +277,7 @@ def _create_rotater(
elif method == _RotateMethod.ALL_GATHER:
return _AllGatherRotater(pg, seq_dim)
else:
raise NotImplementedError(f"Unkonwn method {method}")
raise NotImplementedError(f"Unknown method {method}")
def _templated_ring_attention(
@ -339,12 +339,12 @@ def _templated_ring_attention(
First Iteration: Both ranks perform SDPA with their local qkv pairs, similar to the
no-load-balance case. This iteration corresponds to the `if` of the
(`if, `elif`, `else`) in the implemementation.
(`if, `elif`, `else`) in the implementation.
Second Iteration: Rank0 now has (q0, q3) and (k1, k2); rank1 has (q1, q2) and
(k0, k3). For rank0, no computation is needed for q0. However, computations for
q3k1 and q3k2 are required, so only q3 is used for SDPA. This corresponds to the
`else` of the (`if`, `elif`, `else`) in the implemementation.
`else` of the (`if`, `elif`, `else`) in the implementation.
For rank1, k0 is not needed for q1 and q2, so only k3 is used for SDPA. This
corresponds to the `elif` of (`if`, `elif`, `else`) in the implementation.
@ -916,7 +916,7 @@ def _distribute_function(
the inputs and outputs of a function. Similar to ``distribute_module``, this API
installs hooks to the ``fn`` to convert the inputs and outputs. There are two
major differences between ``distribute_function`` and ``distribute_module``.
First, a function does not have parammeters and buffers, as a result,
First, a function does not have parameters and buffers, as a result,
``distribute_function`` itself won't convert any parameters/buffers but simply
install the input and output hooks. The tensor conversion will happen in the hooks.
Another difference is an nn.Module subclass can have several instances and each
@ -932,9 +932,9 @@ def _distribute_function(
``fn_module`` is ``torch.nn.functional``.
device_mesh (:class:`DeviceMesh`): the device mesh that will be used by the
input and output hooks to distribute the tensors.
input_fn (Optioinal[Callable]): the hook to distribute or convert the input
input_fn (Optional[Callable]): the hook to distribute or convert the input
arguments of ``fn``.
output_fn (Optioinal[Callable]): the hook to distribute or convert the output
output_fn (Optional[Callable]): the hook to distribute or convert the output
arguments of ``fn``.
"""
@ -989,7 +989,7 @@ class _AttentionContextParallel(ParallelStyle):
Applies context parallel optimizations to the attention layer.
This will work for nn.MultiHeadedAttention and custom attention layers that
call F.scaled_dotproduct_attention with a simliar signature.
call F.scaled_dotproduct_attention with a similar signature.
This expects the `forward` method consumes either:


@ -112,7 +112,7 @@ def local_map(
>>> row_wise = [Shard(0)] # row-wise sharding placements on 1-d mesh
>>> col_wise = [Shard(1)] # col-wise sharding placements on 1-d mesh
>>>
>>> # local_mm_allreduce_forward is the function wrapped with DTensor/Tensor convertion
>>> # local_mm_allreduce_forward is the function wrapped with DTensor/Tensor conversion
>>> local_mm_allreduce_forward = local_map(
>>> mm_allreduce_forward,
>>> out_placements=[Replicate()],


@ -41,7 +41,7 @@ def register_sharding(op: Union[OpOverload, list[OpOverload]]):
as the original op (except that if an arg is a :class:`torch.Tensor`, it will be
replaced by a tensor-like object that DTensor uses internally). The function should
return a sequence of 2-tuples, each specifying acceptable output placements and its
corresponding intput placements.
corresponding input placements.
Example:
>>> # xdoctest: +SKIP("distributed")


@ -30,7 +30,7 @@ def _flatten_tensor(
@no_type_check
def _unflatten_tensor(tensor, spec, *, device_handle=None, compute_stream=None):
# unflatten would mainly be called everytime FSDP allgather parameters.
# unflatten would mainly be called every time FSDP allgather parameters.
result = DTensor.from_local(
tensor,
spec.mesh,


@ -36,7 +36,7 @@ def _update_module_param(param_list: list[tuple[nn.Module, str, nn.Parameter]]):
def _reconstruct_dtensor(module: nn.Module, _input: Any):
"""
Recontruct DTensor parameters from local tensors
Reconstruct DTensor parameters from local tensors
"""
param_list = []
# TODO: To add perf optimizations to this iterations


@ -326,7 +326,7 @@ class DTensorExtensions(FSDPExtensions):
super().__init__()
self.compute_stream = None
self.device_handle = device_handle
# we have to use the dynamo disable this way to disable dynamo as the decorater way would
# we have to use the dynamo disable this way to disable dynamo as the decorator way would
# trigger build failure with torch deploy...
self.post_unflatten_transform = torch._dynamo.disable( # type: ignore[method-assign]
self.post_unflatten_transform


@ -701,7 +701,7 @@ class Partial(Placement):
# _partition_value: partition the value of a replicated tensor on the mesh dimension
# _partition_value is the conjugate operation of _reduce_value
# - i.e. _partition_value on a sum reduce op is just a divison operation
# - i.e. _partition_value on a sum reduce op is just a division operation
# - the _reduce_value on a sum reduce op would just be a sum(allreduce) operation
# TODO: if the reduce_op is min/max, etc. the _partition_value should be a
# different operation
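As a hedged numeric illustration of that conjugate pair for a sum reduction (plain Python, no collectives; the world size is arbitrary):

```python
# Sketch: for a sum-reduce Partial, partitioning a replicated value divides it
# by the number of ranks, so that a later sum (allreduce) reconstructs it.
world_size = 4
replicated = 8.0

partitioned = replicated / world_size                     # division on each rank
recovered = sum(partitioned for _ in range(world_size))   # what a sum allreduce yields
assert recovered == replicated
```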