From b51f92ebdab28fda309d4b21635add72b7381015 Mon Sep 17 00:00:00 2001
From: zhouzaida
Date: Fri, 28 Apr 2023 01:10:03 +0000
Subject: [PATCH] [Docs] Fix docstring format (#99396)

Fixes #ISSUE_NUMBER
Pull Request resolved: https://github.com/pytorch/pytorch/pull/99396
Approved by: https://github.com/awgu
---
 torch/distributed/fsdp/api.py                        | 12 ++++++------
 .../distributed/fsdp/fully_sharded_data_parallel.py  | 10 +++++-----
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/torch/distributed/fsdp/api.py b/torch/distributed/fsdp/api.py
index b9deb7aa606..32981834e55 100644
--- a/torch/distributed/fsdp/api.py
+++ b/torch/distributed/fsdp/api.py
@@ -52,13 +52,13 @@ class ShardingStrategy(Enum):
       synchronizes them (via all-reduce) after the backward computation. The
       unsharded optimizer states are updated locally per rank.
     - ``HYBRID_SHARD``: Apply ``FULL_SHARD`` within a node, and replicate parameters across
-        nodes. This results in reduced communication volume as expensive all-gathers and
-        reduce-scatters are only done within a node, which can be more performant for medium
-        -sized models.
+      nodes. This results in reduced communication volume as expensive all-gathers and
+      reduce-scatters are only done within a node, which can be more performant for medium
+      -sized models.
     - ``_HYBRID_SHARD_ZERO2``: Apply ``SHARD_GRAD_OP`` within a node, and replicate parameters across
-        nodes. This is like ``HYBRID_SHARD``, except this may provide even higher throughput
-        since the unsharded parameters are not freed after the forward pass, saving the
-        all-gathers in the pre-backward.
+      nodes. This is like ``HYBRID_SHARD``, except this may provide even higher throughput
+      since the unsharded parameters are not freed after the forward pass, saving the
+      all-gathers in the pre-backward.
     """

     FULL_SHARD = auto()
diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py
index cc6ed0c0f9a..4558425c5f3 100644
--- a/torch/distributed/fsdp/fully_sharded_data_parallel.py
+++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py
@@ -216,7 +216,7 @@ class FullyShardedDataParallel(nn.Module, _FSDPState):
     Args:
         module (nn.Module):
             This is the module to be wrapped with FSDP.
-        process_group: Optional[Union[ProcessGroup, Tuple[ProcessGroup, ProcessGroup]]]
+        process_group (Optional[Union[ProcessGroup, Tuple[ProcessGroup, ProcessGroup]]]):
             This is the process group used for collective communications and
             the one over which the model is sharded. For hybrid sharding strategies such as
             ``ShardingStrategy.HYBRID_SHARD`` users can
@@ -1458,9 +1458,9 @@ class FullyShardedDataParallel(nn.Module, _FSDPState):
                 corresponding to the unflattened parameters and holding the
                 sharded optimizer state.
             model (torch.nn.Module):
-                Refer to :meth:``shard_full_optim_state_dict``.
+                Refer to :meth:`shard_full_optim_state_dict`.
             optim (torch.optim.Optimizer): Optimizer for ``model`` 's
-            parameters.
+                parameters.

         Returns:
             Refer to :meth:`shard_full_optim_state_dict`.
@@ -1785,7 +1785,7 @@ class FullyShardedDataParallel(nn.Module, _FSDPState):
     ) -> Dict[str, Any]:
         """
         This hook is intended be used by ``torch.distributed.NamedOptimizer``.
-        The functionality is identical to ``:meth:optim_state_dict`` except
+        The functionality is identical to :meth:`optim_state_dict` except
         for the different arguments.

         Args:
@@ -1916,7 +1916,7 @@ class FullyShardedDataParallel(nn.Module, _FSDPState):
     ) -> Dict[str, Any]:
         """
         This hook is intended be used by ``torch.distributed.NamedOptimizer``.
-        The functionality is identical to ``:meth:optim_state_dict_to_load``
+        The functionality is identical to :meth:`optim_state_dict_to_load`
         except for the different arguments.

         Args:
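
Note (not part of the patch): the ``ShardingStrategy`` docstring touched above describes ``HYBRID_SHARD`` as ``FULL_SHARD`` within a node plus replication across nodes, with ``process_group`` optionally being a tuple of process groups. A minimal usage sketch follows; it assumes ``torch.distributed`` has already been initialized (e.g. via torchrun) and that each rank owns one CUDA device, and the model is a placeholder.

import torch.nn as nn
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, ShardingStrategy

# Placeholder model; in practice this is the user's module.
model = nn.Linear(1024, 1024).cuda()

# HYBRID_SHARD: FULL_SHARD within a node, replication across nodes.
# Leaving process_group unset lets FSDP construct default intra-node (shard)
# and inter-node (replicate) groups; alternatively a
# (shard_group, replicate_group) tuple may be passed, as the docstring notes.
fsdp_model = FSDP(model, sharding_strategy=ShardingStrategy.HYBRID_SHARD)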