[BE][15/16] fix typos in torch/ (torch/distributed/tensor/) (#156605)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/156605
Approved by: https://github.com/wanchaol, https://github.com/albanD
Authored by Xuehai Pan on 2025-07-17 14:55:13 +08:00; committed by PyTorch MergeBot
parent eeda1a75ac
commit 3f8e2e91ad
26 changed files with 37 additions and 39 deletions


@ -1169,7 +1169,6 @@ exclude_patterns = [
'aten/src/ATen/[a-mA-M]*/**',
'test/**',
'test/[a-hA-h]*/**',
'torch/distributed/tensor/**',
]
init_command = [
'python3',


@ -179,7 +179,7 @@ class UtilTest(DTensorTestBase):
)
with self.assertRaisesRegex(
RuntimeError,
"Non-sharded dimentions should have identical size across ranks.",
"Non-sharded dimensions should have identical size across ranks.",
):
_ = compute_global_tensor_shape(
local_shape,


@ -4,9 +4,8 @@ BU
contiguities
contiguity
coo
Din
Dout
dOut
din
dout
ElementE
followings
fro


@ -571,7 +571,7 @@ class DTensor(torch.Tensor):
"""
Return the full tensor of this DTensor. It will perform necessary collectives
to gather the local tensors from other ranks in its DeviceMesh and concatenate
them together. It's a syntatic sugar of the following code:
them together. It's a syntactic sugar of the following code:
``dtensor.redistribute(placements=[Replicate()] * mesh.ndim).to_local()``
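As a hedged illustration of the equivalence that docstring describes, here is a minimal sketch assuming a 1-D CPU mesh of 4 ranks launched via ``torchrun --nproc-per-node=4``; the tensor shape is arbitrary and not taken from the source.

```python
# Sketch of full_tensor() vs. the redistribute/to_local spelling it abbreviates.
# Assumes an initialized 4-rank process group (e.g. under torchrun).
import torch
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import Replicate, Shard, distribute_tensor

mesh = init_device_mesh("cpu", (4,))
dtensor = distribute_tensor(torch.randn(8, 8), mesh, [Shard(0)])

full_a = dtensor.full_tensor()
full_b = dtensor.redistribute(placements=[Replicate()] * mesh.ndim).to_local()
assert torch.equal(full_a, full_b)
```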
@ -1011,7 +1011,7 @@ def _dtensor_init_helper( # type: ignore[no-untyped-def]
# set default placements to replicated if not specified
placements = placements or tuple(Replicate() for _ in range(device_mesh.ndim))
# check device_mesh againts placements
# check device_mesh against placements
assert device_mesh.ndim == len(placements), (
"mesh dimension does not match the length of placements"
)


@ -316,7 +316,7 @@ def redistribute_cost(
NOTE:
1. Only consider communication cost here, since computation costs for redistribute
are quite trival (i.e. we only need to narrow or simple division)
are quite trivial (i.e. we only need to narrow or simple division)
2. Only consider redistribute cost on same mesh, cross mesh communication cost is
not quite needed for operator strategy estimation/selection.
"""


@ -434,7 +434,7 @@ class OpDispatcher:
"Found a non-scalar tensor with numel=1 and ndim!=0, "
"we are implicitly creating a replicated DTensor for it. "
"However, please consider changing it to a scalar tensor "
"or explicitly create a DTensor under distributed enviroment."
"or explicitly create a DTensor under distributed environment."
)
if tensor_arg.numel() == 1 or self._allow_implicit_replication:


@ -244,7 +244,7 @@ class DTensorSpec:
if placement.is_shard():
placement = cast(Shard, placement)
raise RuntimeError(
f"DeviceMesh dimension cann't be mapped to two dimension of the same tensor: {i} and {placement.dim}"
f"DeviceMesh dimension can't be mapped to two dimension of the same tensor: {i} and {placement.dim}"
)
elif placement.is_partial():
raise RuntimeError(


@ -28,7 +28,7 @@ KwargsType = dict[str, object]
PlacementList = list[Optional[Placement]]
# ATen op schemas could have Tensor, Tuple[Tensor] and List[Tensor], so output type sould
# ATen op schemas could have Tensor, Tuple[Tensor] and List[Tensor], so output type should
# be the same set of possibilities.
OutputSpecType = Optional[Union[DTensorSpec, Sequence[Optional[DTensorSpec]]]]


@ -113,7 +113,7 @@ class _MaskPartial(Partial):
def _reduce_value(
self, tensor: torch.Tensor, mesh: DeviceMesh, mesh_dim: int
) -> torch.Tensor:
# by the time we ned reduction, we should have already saved the mask
# by the time we need reduction, we should have already saved the mask
assert self.mask_buffer.data is not None
# apply the mask to the tensor that pending reduction
@ -134,7 +134,7 @@ class _MaskPartial(Partial):
mesh_dim: int,
shard_spec: Placement,
) -> torch.Tensor:
# by the time we ned reduction, we should have already saved the mask
# by the time we need reduction, we should have already saved the mask
assert self.mask_buffer.data is not None
# apply the mask to the tensor that pending reduction


@ -1085,7 +1085,7 @@ def topk_strategy(op_schema: OpSchema) -> OpStrategy:
if dim != topk_dim:
dim_shardings: PlacementList = [Shard(dim)] * 3
single_mesh_dim_strategies.append(dim_shardings)
# TODO: topk on sharded dim requries non-trival reduction, address it later
# TODO: topk on sharded dim requires non-trival reduction, address it later
return expand_to_full_mesh_op_strategy(
input_strategy.mesh, op_schema, single_mesh_dim_strategies, input_index=2


@ -704,7 +704,7 @@ def scaled_dot_product_cudnn_attention_strategy(op_schema: OpSchema) -> OpStrate
None, # max_k
None, # philox_seed
None, # philox_offset
# NOTE: debug_attn_mask is not supproted by pytorch and is always an empty tensor
# NOTE: debug_attn_mask is not supported by pytorch and is always an empty tensor
# https://github.com/pytorch/pytorch/blob/60205b0eb2602317856312a66d955c88334ade0b/aten/src/ATen/native/transformers/cuda/attention.cu#L839-L840
debug_attn_mask_sharding, # debug_attn_mask
Replicate(), # q


@ -300,7 +300,7 @@ def view_groups(from_size: Shape, to_size: Shape) -> DimMap:
Flatten((InputDim(1), InputDim(2)))
)
- ouptut dimension 0 maps to input dimension 0
- output dimension 0 maps to input dimension 0
- output dimension 1 maps to a flattened input dimensions 1 and 2
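For a plain-tensor intuition of that dimension mapping (no DTensor involved; the shapes here are made up), a view from ``(2, 3, 4)`` to ``(2, 12)`` keeps dimension 0 and flattens dimensions 1 and 2:

```python
import torch

t = torch.arange(24).reshape(2, 3, 4)
u = t.view(2, 12)  # output dim 0 <- input dim 0; output dim 1 <- flatten of input dims 1 and 2
assert u.shape == (2, 12)
```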


@ -216,7 +216,7 @@ def map_placements_after_broadcast(
# the input shape shard dim before broadcasting,
# in this case it means implicit broadcasting happen
# in this dim, so we can just mark it as replicate
# and implict broadcast will broadcast automatically
# and implicit broadcast will broadcast automatically
# to the sharded shape
new_placements.append(Replicate())


@ -27,7 +27,7 @@ aten = torch.ops.aten
class LocalShardsWrapper(torch.Tensor):
"""
A wrapper class to hold local shards of a DTensor.
This class is used largely for checkpointing purposes and implicity subtypes
This class is used largely for checkpointing purposes and implicitly subtypes
the _Checkpointable protocol.
"""
@ -159,7 +159,7 @@ class LocalShardsWrapper(torch.Tensor):
]
elif args[0].local_shards()[0].ndim == 1:
assert args[0].storage_metadata().size[0] == view_shape[0]
# This case is for optimizer sharding as regardles of sharding type, optimizer state is row wise sharded
# This case is for optimizer sharding as regardless of sharding type, optimizer state is row wise sharded
res_shards_list = [
aten.view.default(shard, shard.shape, **kwargs)
for shard in args[0].local_shards()


@ -296,7 +296,7 @@ def compute_global_tensor_shape(
for shape_tensor in gathered_shaped_tensors:
if not torch.equal(local_shape[other_dims], shape_tensor[other_dims]):
raise RuntimeError(
"Non-sharded dimentions should have identical size across ranks."
"Non-sharded dimensions should have identical size across ranks."
)
shape_tensor_list = shape_tensor.tolist()
sharded_dim_sum += shape_tensor_list[shard_dim]
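A minimal sketch of the invariant this check enforces, using a hypothetical helper (not the module's API) in place of the gathered shape tensors: every rank must report identical sizes on the non-sharded dimensions, and the sharded dimension is summed to produce the global size.

```python
# Hypothetical stand-in for the check in compute_global_tensor_shape.
def global_shape_from_locals(local_shapes, shard_dim):
    reference = list(local_shapes[0])
    sharded_dim_sum = 0
    for shape in local_shapes:
        for dim, size in enumerate(shape):
            if dim != shard_dim and size != reference[dim]:
                raise RuntimeError(
                    "Non-sharded dimensions should have identical size across ranks."
                )
        sharded_dim_sum += shape[shard_dim]
    reference[shard_dim] = sharded_dim_sum
    return tuple(reference)

assert global_shape_from_locals([(4, 8), (3, 8)], shard_dim=0) == (7, 8)
```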


@ -395,7 +395,7 @@ class CommDebugMode(TorchDispatchMode):
json_dict: dict[str, Any] = {}
add_json_information(json_dict, "Global")
# converts dictonary into json file
# converts dictionary into json file
with open(file_name, "w") as json_file:
json.dump(json_dict, json_file, indent=4)


@ -711,7 +711,7 @@ class CommDebugModeExample:
def run_example(world_size: int, rank: int, example_name: str) -> None:
# set manual seed
# intializing class with all of the functions
# initializing class with all of the functions
instantiated_example = CommDebugModeExample(world_size, rank)
# dict that stores example code function names
name_to_example_code: dict[str, Callable[[], None]] = {


@ -1,7 +1,7 @@
# mypy: allow-untyped-defs
"""
The following example demonstrates how to train a ConvNeXt model
with intermediate activations sharded across mutliple GPUs via DTensor
with intermediate activations sharded across multiple GPUs via DTensor
To run the example, use the following command:
torchrun --standalone --nnodes=1 --nproc-per-node=4 convnext_example.py


@ -231,7 +231,7 @@ def run_torchrec_row_wise_uneven_sharding_example(rank, world_size):
# note: for uneven sharding, we need to specify the shape and stride because
# DTensor would assume even sharding and compute shape/stride based on the
# assumption. Torchrec needs to pass in this information explicitely.
# assumption. Torchrec needs to pass in this information explicitly.
# shape/stride are global tensor's shape and stride
dtensor = DTensor.from_local(
local_shards_wrapper, # a torch.Tensor subclass
@ -324,7 +324,7 @@ def run_torchrec_table_wise_sharding_example(rank, world_size):
# create a DTensor from the local shard for the current table
# note: for uneven sharding, we need to specify the shape and stride because
# DTensor would assume even sharding and compute shape/stride based on the
# assumption. Torchrec needs to pass in this information explicitely.
# assumption. Torchrec needs to pass in this information explicitly.
dtensor = DTensor.from_local(
local_shards,
device_submesh,
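A hedged sketch of the pattern those comments describe: passing the global shape and stride to ``DTensor.from_local`` for an uneven row-wise split. It assumes 2 ranks launched via torchrun; the sizes are illustrative, not taken from the example above.

```python
# Sketch only: DTensor would otherwise assume even sharding, so the global
# shape/stride of the unevenly sharded tensor are passed explicitly.
import torch
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import DTensor, Shard

mesh = init_device_mesh("cpu", (2,))
rank = mesh.get_rank()

# global (5, 4) tensor split row-wise: rank 0 holds 3 rows, rank 1 holds 2 rows
local = torch.randn(3 if rank == 0 else 2, 4)
dtensor = DTensor.from_local(
    local,
    mesh,
    [Shard(0)],
    shape=(5, 4),   # global shape
    stride=(4, 1),  # global (contiguous) stride
)
assert tuple(dtensor.shape) == (5, 4)
```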


@ -239,7 +239,7 @@ class _AllToAllRotater(_RingRotater):
class _AllGatherRotater(_RingRotater):
"""
Allgather the kv and return the only the requried kv.
Allgather the kv and return the only the required kv.
Only one communication will be done.
"""
@ -277,7 +277,7 @@ def _create_rotater(
elif method == _RotateMethod.ALL_GATHER:
return _AllGatherRotater(pg, seq_dim)
else:
raise NotImplementedError(f"Unkonwn method {method}")
raise NotImplementedError(f"Unknown method {method}")
def _templated_ring_attention(
@ -339,12 +339,12 @@ def _templated_ring_attention(
First Iteration: Both ranks perform SDPA with their local qkv pairs, similar to the
no-load-balance case. This iteration corresponds to the `if` of the
(`if, `elif`, `else`) in the implemementation.
(`if, `elif`, `else`) in the implementation.
Second Iteration: Rank0 now has (q0, q3) and (k1, k2); rank1 has (q1, q2) and
(k0, k3). For rank0, no computation is needed for q0. However, computations for
q3k1 and q3k2 are required, so only q3 is used for SDPA. This corresponds to the
`else` of the (`if`, `elif`, `else`) in the implemementation.
`else` of the (`if`, `elif`, `else`) in the implementation.
For rank1, k0 is not needed for q1 and q2, so only k3 is used for SDPA. This
corresponds to the `elif` of (`if`, `elif`, `else`) in the implementation.
@ -916,7 +916,7 @@ def _distribute_function(
the inputs and outputs of a function. Similar to ``distribute_module``, this API
installs hooks to the ``fn`` to convert the inputs and outputs. There are two
major differences between ``distribute_function`` and ``distribute_module``.
First, a function does not have parammeters and buffers, as a result,
First, a function does not have parameters and buffers, as a result,
``distribute_function`` itself won't convert any parameters/buffers but simply
install the input and output hooks. The tensor conversion will happen in the hooks.
Another difference is an nn.Module subclass can have several instances and each
@ -932,9 +932,9 @@ def _distribute_function(
``fn_module`` is ``torch.nn.functional``.
device_mesh (:class:`DeviceMesh`): the device mesh that will be used by the
input and output hooks to distribute the tensors.
input_fn (Optioinal[Callable]): the hook to distribute or convert the input
input_fn (Optional[Callable]): the hook to distribute or convert the input
arguments of ``fn``.
output_fn (Optioinal[Callable]): the hook to distribute or convert the output
output_fn (Optional[Callable]): the hook to distribute or convert the output
arguments of ``fn``.
"""
@ -989,7 +989,7 @@ class _AttentionContextParallel(ParallelStyle):
Applies context parallel optimizations to the attention layer.
This will work for nn.MultiHeadedAttention and custom attention layers that
call F.scaled_dotproduct_attention with a simliar signature.
call F.scaled_dotproduct_attention with a similar signature.
This expects the `forward` method consumes either:


@ -112,7 +112,7 @@ def local_map(
>>> row_wise = [Shard(0)] # row-wise sharding placements on 1-d mesh
>>> col_wise = [Shard(1)] # col-wise sharding placements on 1-d mesh
>>>
>>> # local_mm_allreduce_forward is the function wrapped with DTensor/Tensor convertion
>>> # local_mm_allreduce_forward is the function wrapped with DTensor/Tensor conversion
>>> local_mm_allreduce_forward = local_map(
>>> mm_allreduce_forward,
>>> out_placements=[Replicate()],


@ -41,7 +41,7 @@ def register_sharding(op: Union[OpOverload, list[OpOverload]]):
as the original op (except that if an arg is a :class:`torch.Tensor`, it will be
replaced by a tensor-like object that DTensor uses internally). The function should
return a sequence of 2-tuples, each specifying acceptable output placements and its
corresponding intput placements.
corresponding input placements.
Example:
>>> # xdoctest: +SKIP("distributed")


@ -30,7 +30,7 @@ def _flatten_tensor(
@no_type_check
def _unflatten_tensor(tensor, spec, *, device_handle=None, compute_stream=None):
# unflatten would mainly be called everytime FSDP allgather parameters.
# unflatten would mainly be called every time FSDP allgather parameters.
result = DTensor.from_local(
tensor,
spec.mesh,


@ -36,7 +36,7 @@ def _update_module_param(param_list: list[tuple[nn.Module, str, nn.Parameter]]):
def _reconstruct_dtensor(module: nn.Module, _input: Any):
"""
Recontruct DTensor parameters from local tensors
Reconstruct DTensor parameters from local tensors
"""
param_list = []
# TODO: To add perf optimizations to this iterations


@ -326,7 +326,7 @@ class DTensorExtensions(FSDPExtensions):
super().__init__()
self.compute_stream = None
self.device_handle = device_handle
# we have to use the dynamo disable this way to disable dynamo as the decorater way would
# we have to use the dynamo disable this way to disable dynamo as the decorator way would
# trigger build failure with torch deploy...
self.post_unflatten_transform = torch._dynamo.disable( # type: ignore[method-assign]
self.post_unflatten_transform


@ -701,7 +701,7 @@ class Partial(Placement):
# _partition_value: partition the value of a replicated tensor on the mesh dimension
# _partition_value is the conjugate operation of _reduce_value
# - i.e. _partition_value on a sum reduce op is just a divison operation
# - i.e. _partition_value on a sum reduce op is just a division operation
# - the _reduce_value on a sum reduce op would just be a sum(allreduce) operation
# TODO: if the reduce_op is min/max, etc. the _partition_value should be a
# different operation
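As a hedged numeric illustration of that conjugate pair for a sum reduction (plain Python, no collectives; the world size is arbitrary):

```python
# Sketch: for a sum-reduce Partial, partitioning a replicated value divides it
# by the number of ranks, so that a later sum (allreduce) reconstructs it.
world_size = 4
replicated = 8.0

partitioned = replicated / world_size                     # division on each rank
recovered = sum(partitioned for _ in range(world_size))   # what a sum allreduce yields
assert recovered == replicated
```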