[BE][15/16] fix typos in torch/ (torch/distributed/tensor/) (#156605)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/156605 Approved by: https://github.com/wanchaol, https://github.com/albanD
parent eeda1a75ac, commit 3f8e2e91ad
@@ -1169,7 +1169,6 @@ exclude_patterns = [
     'aten/src/ATen/[a-mA-M]*/**',
     'test/**',
     'test/[a-hA-h]*/**',
-    'torch/distributed/tensor/**',
 ]
 init_command = [
     'python3',
@@ -179,7 +179,7 @@ class UtilTest(DTensorTestBase):
         )
         with self.assertRaisesRegex(
             RuntimeError,
-            "Non-sharded dimentions should have identical size across ranks.",
+            "Non-sharded dimensions should have identical size across ranks.",
         ):
             _ = compute_global_tensor_shape(
                 local_shape,
@@ -4,9 +4,8 @@ BU
contiguities
contiguity
coo
Din
Dout
dOut
din
dout
ElementE
followings
fro
@@ -571,7 +571,7 @@ class DTensor(torch.Tensor):
         """
         Return the full tensor of this DTensor. It will perform necessary collectives
         to gather the local tensors from other ranks in its DeviceMesh and concatenate
-        them together. It's a syntatic sugar of the following code:
+        them together. It's a syntactic sugar of the following code:

         ``dtensor.redistribute(placements=[Replicate()] * mesh.ndim).to_local()``
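The docstring touched above equates ``full_tensor()`` with an explicit redistribute-to-replicate followed by ``to_local()``. A minimal sketch of that equivalence, assuming a recent PyTorch, a 1-D mesh over 4 ranks, and a torchrun launch (mesh size and tensor shape are illustrative):

import torch
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import Replicate, Shard, distribute_tensor

# assumes torchrun --nproc-per-node=4 so a process group can be created
mesh = init_device_mesh("cpu", (4,))
dtensor = distribute_tensor(torch.arange(16, dtype=torch.float32), mesh, [Shard(0)])

# full_tensor() gathers the shards from every rank ...
full = dtensor.full_tensor()
# ... which matches replicating explicitly and then taking the local tensor
same = dtensor.redistribute(placements=[Replicate()] * mesh.ndim).to_local()
assert torch.equal(full, same)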
@@ -1011,7 +1011,7 @@ def _dtensor_init_helper( # type: ignore[no-untyped-def]
     # set default placements to replicated if not specified
     placements = placements or tuple(Replicate() for _ in range(device_mesh.ndim))

-    # check device_mesh againts placements
+    # check device_mesh against placements
     assert device_mesh.ndim == len(placements), (
         "mesh dimension does not match the length of placements"
     )
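The helper above falls back to replicated placements when none are given, and asserts that the number of placements matches the mesh dimension. That default is visible through the public factory functions; a small sketch, assuming the factories are imported from ``torch.distributed.tensor`` as in a recent release:

import torch
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import Replicate, Shard, ones

mesh = init_device_mesh("cpu", (4,))

# placements omitted: the init helper defaults every mesh dim to Replicate()
dt = ones(4, 8, device_mesh=mesh)
assert dt.placements == (Replicate(),) * mesh.ndim

# when placements are given, their length must match mesh.ndim (the assert above)
dt_sharded = ones(4, 8, device_mesh=mesh, placements=[Shard(0)])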
@@ -316,7 +316,7 @@ def redistribute_cost(

     NOTE:
     1. Only consider communication cost here, since computation costs for redistribute
-       are quite trival (i.e. we only need to narrow or simple division)
+       are quite trivial (i.e. we only need to narrow or simple division)
     2. Only consider redistribute cost on same mesh, cross mesh communication cost is
        not quite needed for operator strategy estimation/selection.
     """
@@ -434,7 +434,7 @@ class OpDispatcher:
                     "Found a non-scalar tensor with numel=1 and ndim!=0, "
                     "we are implicitly creating a replicated DTensor for it. "
                     "However, please consider changing it to a scalar tensor "
-                    "or explicitly create a DTensor under distributed enviroment."
+                    "or explicitly create a DTensor under distributed environment."
                 )

             if tensor_arg.numel() == 1 or self._allow_implicit_replication:
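The warning fixed above describes mixing a plain numel-1 ``torch.Tensor`` into a DTensor op: the dispatcher implicitly treats it as replicated rather than erroring out, per the ``numel() == 1`` check in the hunk's context. A hedged sketch of that situation (mesh setup as in the earlier examples):

import torch
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import Shard, distribute_tensor

mesh = init_device_mesh("cpu", (4,))
dt = distribute_tensor(torch.randn(8, 8), mesh, [Shard(0)])

# a 0-dim scalar tensor is silently wrapped as a replicated DTensor
out = dt + torch.tensor(1.0)

# a numel==1 tensor with ndim != 0 also works but triggers the warning above;
# the suggested fix is a scalar tensor or an explicitly constructed DTensor
out2 = dt + torch.tensor([1.0])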
@@ -244,7 +244,7 @@ class DTensorSpec:
             if placement.is_shard():
                 placement = cast(Shard, placement)
                 raise RuntimeError(
-                    f"DeviceMesh dimension cann't be mapped to two dimension of the same tensor: {i} and {placement.dim}"
+                    f"DeviceMesh dimension can't be mapped to two dimension of the same tensor: {i} and {placement.dim}"
                 )
             elif placement.is_partial():
                 raise RuntimeError(
@@ -28,7 +28,7 @@ KwargsType = dict[str, object]

 PlacementList = list[Optional[Placement]]

-# ATen op schemas could have Tensor, Tuple[Tensor] and List[Tensor], so output type sould
+# ATen op schemas could have Tensor, Tuple[Tensor] and List[Tensor], so output type should
 # be the same set of possibilities.
 OutputSpecType = Optional[Union[DTensorSpec, Sequence[Optional[DTensorSpec]]]]
@@ -113,7 +113,7 @@ class _MaskPartial(Partial):
     def _reduce_value(
         self, tensor: torch.Tensor, mesh: DeviceMesh, mesh_dim: int
     ) -> torch.Tensor:
-        # by the time we ned reduction, we should have already saved the mask
+        # by the time we need reduction, we should have already saved the mask
         assert self.mask_buffer.data is not None

         # apply the mask to the tensor that pending reduction
@@ -134,7 +134,7 @@ class _MaskPartial(Partial):
         mesh_dim: int,
         shard_spec: Placement,
     ) -> torch.Tensor:
-        # by the time we ned reduction, we should have already saved the mask
+        # by the time we need reduction, we should have already saved the mask
         assert self.mask_buffer.data is not None

         # apply the mask to the tensor that pending reduction
@@ -1085,7 +1085,7 @@ def topk_strategy(op_schema: OpSchema) -> OpStrategy:
         if dim != topk_dim:
             dim_shardings: PlacementList = [Shard(dim)] * 3
             single_mesh_dim_strategies.append(dim_shardings)
-    # TODO: topk on sharded dim requries non-trival reduction, address it later
+    # TODO: topk on sharded dim requires non-trival reduction, address it later

     return expand_to_full_mesh_op_strategy(
         input_strategy.mesh, op_schema, single_mesh_dim_strategies, input_index=2
@@ -704,7 +704,7 @@ def scaled_dot_product_cudnn_attention_strategy(op_schema: OpSchema) -> OpStrate
         None, # max_k
         None, # philox_seed
         None, # philox_offset
-        # NOTE: debug_attn_mask is not supproted by pytorch and is always an empty tensor
+        # NOTE: debug_attn_mask is not supported by pytorch and is always an empty tensor
         # https://github.com/pytorch/pytorch/blob/60205b0eb2602317856312a66d955c88334ade0b/aten/src/ATen/native/transformers/cuda/attention.cu#L839-L840
         debug_attn_mask_sharding, # debug_attn_mask
         Replicate(), # q
@@ -300,7 +300,7 @@ def view_groups(from_size: Shape, to_size: Shape) -> DimMap:
             Flatten((InputDim(1), InputDim(2)))
         )

-    - ouptut dimension 0 maps to input dimension 0
+    - output dimension 0 maps to input dimension 0
     - output dimension 1 maps to a flattened input dimensions 1 and 2
@@ -216,7 +216,7 @@ def map_placements_after_broadcast(
             # the input shape shard dim before broadcasting,
             # in this case it means implicit broadcasting happen
             # in this dim, so we can just mark it as replicate
-            # and implict broadcast will broadcast automatically
+            # and implicit broadcast will broadcast automatically
             # to the sharded shape
             new_placements.append(Replicate())
@@ -27,7 +27,7 @@ aten = torch.ops.aten
 class LocalShardsWrapper(torch.Tensor):
     """
     A wrapper class to hold local shards of a DTensor.
-    This class is used largely for checkpointing purposes and implicity subtypes
+    This class is used largely for checkpointing purposes and implicitly subtypes
     the _Checkpointable protocol.
     """
@@ -159,7 +159,7 @@ class LocalShardsWrapper(torch.Tensor):
             ]
         elif args[0].local_shards()[0].ndim == 1:
             assert args[0].storage_metadata().size[0] == view_shape[0]
-            # This case is for optimizer sharding as regardles of sharding type, optimizer state is row wise sharded
+            # This case is for optimizer sharding as regardless of sharding type, optimizer state is row wise sharded
             res_shards_list = [
                 aten.view.default(shard, shard.shape, **kwargs)
                 for shard in args[0].local_shards()
@@ -296,7 +296,7 @@ def compute_global_tensor_shape(
     for shape_tensor in gathered_shaped_tensors:
         if not torch.equal(local_shape[other_dims], shape_tensor[other_dims]):
             raise RuntimeError(
-                "Non-sharded dimentions should have identical size across ranks."
+                "Non-sharded dimensions should have identical size across ranks."
             )
         shape_tensor_list = shape_tensor.tolist()
         sharded_dim_sum += shape_tensor_list[shard_dim]
@@ -395,7 +395,7 @@ class CommDebugMode(TorchDispatchMode):
         json_dict: dict[str, Any] = {}
         add_json_information(json_dict, "Global")

-        # converts dictonary into json file
+        # converts dictionary into json file
         with open(file_name, "w") as json_file:
             json.dump(json_dict, json_file, indent=4)
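The comment fix above sits inside CommDebugMode's JSON dump path. For context, a short usage sketch of the debug mode; the method names follow the documented API, but treat the exact output layout of the JSON file as an assumption:

import torch
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import Shard, distribute_tensor
from torch.distributed.tensor.debug import CommDebugMode

mesh = init_device_mesh("cpu", (4,))
dt = distribute_tensor(torch.randn(16, 16), mesh, [Shard(0)])

comm_mode = CommDebugMode()
with comm_mode:
    # any collective issued here (e.g. the all-gather behind full_tensor) is recorded
    dt.full_tensor()

print(comm_mode.get_total_counts())  # total number of collectives
print(comm_mode.get_comm_counts())   # per-collective breakdown
comm_mode.generate_json_dump(file_name="comm_mode_log.json")  # writes the dict shown above via json.dump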
@@ -711,7 +711,7 @@ class CommDebugModeExample:

 def run_example(world_size: int, rank: int, example_name: str) -> None:
     # set manual seed
-    # intializing class with all of the functions
+    # initializing class with all of the functions
     instantiated_example = CommDebugModeExample(world_size, rank)
     # dict that stores example code function names
     name_to_example_code: dict[str, Callable[[], None]] = {
@@ -1,7 +1,7 @@
 # mypy: allow-untyped-defs
 """
 The following example demonstrates how to train a ConvNeXt model
-with intermediate activations sharded across mutliple GPUs via DTensor
+with intermediate activations sharded across multiple GPUs via DTensor

 To run the example, use the following command:
 torchrun --standalone --nnodes=1 --nproc-per-node=4 convnext_example.py
@@ -231,7 +231,7 @@ def run_torchrec_row_wise_uneven_sharding_example(rank, world_size):

     # note: for uneven sharding, we need to specify the shape and stride because
     # DTensor would assume even sharding and compute shape/stride based on the
-    # assumption. Torchrec needs to pass in this information explicitely.
+    # assumption. Torchrec needs to pass in this information explicitly.
     # shape/stride are global tensor's shape and stride
     dtensor = DTensor.from_local(
         local_shards_wrapper, # a torch.Tensor subclass
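The comment being fixed explains why uneven (torchrec-style) sharding passes the global shape and stride to ``DTensor.from_local`` instead of letting DTensor infer them. A minimal sketch of that call, assuming two ranks holding 3 and 2 rows of a 5x4 global tensor; the ``LocalShardsWrapper`` used by the real example is replaced by a plain local tensor here:

import torch
import torch.distributed as dist
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import DTensor, Shard

mesh = init_device_mesh("cpu", (2,))
rank = dist.get_rank()

# rank 0 holds rows 0..2, rank 1 holds rows 3..4 of a 5x4 global tensor
local_rows = 3 if rank == 0 else 2
local_shard = torch.randn(local_rows, 4)

# from_local would otherwise assume even sharding and compute a wrong global
# shape/stride, so uneven sharding passes them in explicitly
dtensor = DTensor.from_local(
    local_shard,
    mesh,
    [Shard(0)],
    shape=torch.Size([5, 4]),
    stride=(4, 1),
)
assert dtensor.shape == torch.Size([5, 4])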
@@ -324,7 +324,7 @@ def run_torchrec_table_wise_sharding_example(rank, world_size):
         # create a DTensor from the local shard for the current table
         # note: for uneven sharding, we need to specify the shape and stride because
         # DTensor would assume even sharding and compute shape/stride based on the
-        # assumption. Torchrec needs to pass in this information explicitely.
+        # assumption. Torchrec needs to pass in this information explicitly.
         dtensor = DTensor.from_local(
             local_shards,
             device_submesh,
@@ -239,7 +239,7 @@ class _AllToAllRotater(_RingRotater):

 class _AllGatherRotater(_RingRotater):
     """
-    Allgather the kv and return the only the requried kv.
+    Allgather the kv and return the only the required kv.
     Only one communication will be done.
     """
@@ -277,7 +277,7 @@ def _create_rotater(
     elif method == _RotateMethod.ALL_GATHER:
         return _AllGatherRotater(pg, seq_dim)
     else:
-        raise NotImplementedError(f"Unkonwn method {method}")
+        raise NotImplementedError(f"Unknown method {method}")


 def _templated_ring_attention(
@@ -339,12 +339,12 @@ def _templated_ring_attention(

     First Iteration: Both ranks perform SDPA with their local qkv pairs, similar to the
     no-load-balance case. This iteration corresponds to the `if` of the
-    (`if, `elif`, `else`) in the implemementation.
+    (`if, `elif`, `else`) in the implementation.

     Second Iteration: Rank0 now has (q0, q3) and (k1, k2); rank1 has (q1, q2) and
     (k0, k3). For rank0, no computation is needed for q0. However, computations for
     q3k1 and q3k2 are required, so only q3 is used for SDPA. This corresponds to the
-    `else` of the (`if`, `elif`, `else`) in the implemementation.
+    `else` of the (`if`, `elif`, `else`) in the implementation.
     For rank1, k0 is not needed for q1 and q2, so only k3 is used for SDPA. This
     corresponds to the `elif` of (`if`, `elif`, `else`) in the implementation.
@@ -916,7 +916,7 @@ def _distribute_function(
     the inputs and outputs of a function. Similar to ``distribute_module``, this API
     installs hooks to the ``fn`` to convert the inputs and outputs. There are two
     major differences between ``distribute_function`` and ``distribute_module``.
-    First, a function does not have parammeters and buffers, as a result,
+    First, a function does not have parameters and buffers, as a result,
     ``distribute_function`` itself won't convert any parameters/buffers but simply
     install the input and output hooks. The tensor conversion will happen in the hooks.
     Another difference is an nn.Module subclass can have several instances and each
@@ -932,9 +932,9 @@ def _distribute_function(
         ``fn_module`` is ``torch.nn.functional``.
         device_mesh (:class:`DeviceMesh`): the device mesh that will be used by the
             input and output hooks to distribute the tensors.
-        input_fn (Optioinal[Callable]): the hook to distribute or convert the input
+        input_fn (Optional[Callable]): the hook to distribute or convert the input
             arguments of ``fn``.
-        output_fn (Optioinal[Callable]): the hook to distribute or convert the output
+        output_fn (Optional[Callable]): the hook to distribute or convert the output
             arguments of ``fn``.
     """
@@ -989,7 +989,7 @@ class _AttentionContextParallel(ParallelStyle):
     Applies context parallel optimizations to the attention layer.

     This will work for nn.MultiHeadedAttention and custom attention layers that
-    call F.scaled_dotproduct_attention with a simliar signature.
+    call F.scaled_dotproduct_attention with a similar signature.

     This expects the `forward` method consumes either:
@@ -112,7 +112,7 @@ def local_map(
         >>> row_wise = [Shard(0)] # row-wise sharding placements on 1-d mesh
         >>> col_wise = [Shard(1)] # col-wise sharding placements on 1-d mesh
         >>>
-        >>> # local_mm_allreduce_forward is the function wrapped with DTensor/Tensor convertion
+        >>> # local_mm_allreduce_forward is the function wrapped with DTensor/Tensor conversion
         >>> local_mm_allreduce_forward = local_map(
         >>>     mm_allreduce_forward,
         >>>     out_placements=[Replicate()],
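The doctest fragment above wraps a plain-tensor function with ``local_map`` so it can be called on DTensors. A simplified, hedged sketch with an elementwise function; the placements and mesh are illustrative, and the experimental API may still change:

import torch
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import Shard, distribute_tensor
from torch.distributed.tensor.experimental import local_map

def local_add(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    # written against plain torch.Tensor; local_map feeds it the local shards
    return x + y

mesh = init_device_mesh("cpu", (4,))
row_wise = [Shard(0)]

sharded_add = local_map(
    local_add,
    out_placements=row_wise,             # how to re-wrap the local output
    in_placements=(row_wise, row_wise),  # expected placements of each DTensor arg
    device_mesh=mesh,
)

a = distribute_tensor(torch.randn(8, 4), mesh, row_wise)
b = distribute_tensor(torch.randn(8, 4), mesh, row_wise)
c = sharded_add(a, b)  # a DTensor sharded on dim 0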
@@ -41,7 +41,7 @@ def register_sharding(op: Union[OpOverload, list[OpOverload]]):
     as the original op (except that if an arg is a :class:`torch.Tensor`, it will be
     replaced by a tensor-like object that DTensor uses internally). The function should
     return a sequence of 2-tuples, each specifying acceptable output placements and its
-    corresponding intput placements.
+    corresponding input placements.

     Example:
         >>> # xdoctest: +SKIP("distributed")
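register_sharding, whose docstring is touched above, lets users supply (output placements, input placements) pairs for an op. The sketch below is adapted from the documented softmax example; the op overload (``aten._softmax.default``) and its three-argument schema are assumptions carried over from that example:

import torch
from torch.distributed.tensor import Replicate, Shard
from torch.distributed.tensor.experimental import register_sharding

aten = torch.ops.aten

@register_sharding(aten._softmax.default)
def custom_softmax_sharding(x, dim, half_to_float):
    softmax_dim = dim if dim >= 0 else dim + x.ndim

    acceptable_shardings = []
    # fully replicated output <- fully replicated input (non-tensor args are None)
    acceptable_shardings.append(([Replicate()], [Replicate(), None, None]))
    # sharding any dim other than the softmax dim needs no communication
    for sharding_dim in range(x.ndim):
        if sharding_dim != softmax_dim:
            acceptable_shardings.append(
                ([Shard(sharding_dim)], [Shard(sharding_dim), None, None])
            )
    return acceptable_shardings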
@@ -30,7 +30,7 @@ def _flatten_tensor(

 @no_type_check
 def _unflatten_tensor(tensor, spec, *, device_handle=None, compute_stream=None):
-    # unflatten would mainly be called everytime FSDP allgather parameters.
+    # unflatten would mainly be called every time FSDP allgather parameters.
     result = DTensor.from_local(
         tensor,
         spec.mesh,
@@ -36,7 +36,7 @@ def _update_module_param(param_list: list[tuple[nn.Module, str, nn.Parameter]]):

 def _reconstruct_dtensor(module: nn.Module, _input: Any):
     """
-    Recontruct DTensor parameters from local tensors
+    Reconstruct DTensor parameters from local tensors
     """
     param_list = []
     # TODO: To add perf optimizations to this iterations
@@ -326,7 +326,7 @@ class DTensorExtensions(FSDPExtensions):
         super().__init__()
         self.compute_stream = None
         self.device_handle = device_handle
-        # we have to use the dynamo disable this way to disable dynamo as the decorater way would
+        # we have to use the dynamo disable this way to disable dynamo as the decorator way would
         # trigger build failure with torch deploy...
         self.post_unflatten_transform = torch._dynamo.disable( # type: ignore[method-assign]
             self.post_unflatten_transform
@@ -701,7 +701,7 @@ class Partial(Placement):
     # _partition_value: partition the value of a replicated tensor on the mesh dimension

     # _partition_value is the conjugate operation of _reduce_value
-    # - i.e. _partition_value on a sum reduce op is just a divison operation
+    # - i.e. _partition_value on a sum reduce op is just a division operation
     # - the _reduce_value on a sum reduce op would just be a sum(allreduce) operation
     # TODO: if the reduce_op is min/max, etc. the _partition_value should be a
     # different operation
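The comment block above describes ``Partial`` in terms of two conjugate operations: partitioning a replicated value (a division, for a sum reduction) and reducing partial values (an all-reduce). A hedged sketch of the user-visible side of that, using ``from_local`` to declare per-rank partial values; mesh setup as in the earlier examples, and the exact reduction timing is an implementation detail:

import torch
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import DTensor, Partial

mesh = init_device_mesh("cpu", (4,))

# each rank holds a partial sum; the placement records the pending reduction
local_partial = torch.ones(4)
dt = DTensor.from_local(local_partial, mesh, [Partial()])

# materializing the full tensor goes through the _reduce_value path (an
# all-reduce with sum), so every element becomes 4.0 on a 4-rank mesh
full = dt.full_tensor()
assert torch.equal(full, torch.full((4,), 4.0))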