From 3e6bb5233f9ca2c5aa55d9cda22a7ee85439aa6e Mon Sep 17 00:00:00 2001
From: Michael Carilli
Date: Wed, 16 Sep 2020 11:29:55 -0700
Subject: [PATCH] Reference amp tutorial (recipe) from core amp docs (#44725)

Summary:
https://pytorch.org/tutorials/recipes/recipes/amp_recipe.html is live. Core amp docs should reference it.

Also I fixed some typos in the `zero_grad` docs we ignored when git was behaving weirdly during ngimel's merge of https://github.com/pytorch/pytorch/pull/44423.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/44725

Reviewed By: mruberry

Differential Revision: D23723807

Pulled By: ngimel

fbshipit-source-id: ca0b76365f8ca908bd978e3b38bf81857fa6c2a3
---
 docs/source/amp.rst                |  3 ++-
 docs/source/notes/amp_examples.rst |  4 ++++
 torch/nn/modules/module.py         |  6 +++---
 torch/optim/optimizer.py           | 14 +++++++-------
 4 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/docs/source/amp.rst b/docs/source/amp.rst
index 553a9b6e539..986e7fe3529 100644
--- a/docs/source/amp.rst
+++ b/docs/source/amp.rst
@@ -14,7 +14,8 @@ are much faster in ``float16``. Other ops, like reductions, often require the dy
 range of ``float32``. Mixed precision tries to match each op to its appropriate datatype.
 
 Ordinarily, "automatic mixed precision training" uses :class:`torch.cuda.amp.autocast` and
-:class:`torch.cuda.amp.GradScaler` together, as shown in the :ref:`Automatic Mixed Precision examples<amp-examples>`.
+:class:`torch.cuda.amp.GradScaler` together, as shown in the :ref:`Automatic Mixed Precision examples<amp-examples>`
+and `Automatic Mixed Precision recipe <https://pytorch.org/tutorials/recipes/recipes/amp_recipe.html>`_.
 However, :class:`autocast` and :class:`GradScaler` are modular, and may be used separately if desired.
 
 .. contents:: :local:
diff --git a/docs/source/notes/amp_examples.rst b/docs/source/notes/amp_examples.rst
index 3e2000cb5ff..b5614e02001 100644
--- a/docs/source/notes/amp_examples.rst
+++ b/docs/source/notes/amp_examples.rst
@@ -19,6 +19,10 @@ gradients by minimizing gradient underflow, as explained :ref:`here<gradient-scaling>`.
 
+(Samples here are illustrative.  See the `Automatic Mixed Precision recipe
+<https://pytorch.org/tutorials/recipes/recipes/amp_recipe.html>`_
+for a runnable walkthrough.)
+
 .. contents:: :local:
 
 Typical Mixed Precision Training
diff --git a/torch/nn/modules/module.py b/torch/nn/modules/module.py
index cdd9e1cf1ac..30e732e6d85 100644
--- a/torch/nn/modules/module.py
+++ b/torch/nn/modules/module.py
@@ -1315,11 +1315,11 @@ class Module:
     def zero_grad(self, set_to_none: bool = False) -> None:
         r"""Sets gradients of all model parameters to zero. See similar function
-        under `torch.optimizer` for more contexts.
+        under :class:`torch.optim.Optimizer` for more context.
 
         Arguments:
-            set_to_none (bool): instead of setting to zero, set the grad to None.
-                See :meth:`torch.optim.optimizer.zero_grad` for details.
+            set_to_none (bool): instead of setting to zero, set the grads to None.
+                See :meth:`torch.optim.Optimizer.zero_grad` for details.
         """
         if getattr(self, '_is_replica', False):
             warnings.warn(
diff --git a/torch/optim/optimizer.py b/torch/optim/optimizer.py
index f44116c9170..7d413b95941 100644
--- a/torch/optim/optimizer.py
+++ b/torch/optim/optimizer.py
@@ -165,18 +165,18 @@ class Optimizer(object):
         self.__setstate__({'state': state, 'param_groups': param_groups})
 
     def zero_grad(self, set_to_none: bool = False):
-        r"""Set the gradients of all optimized :class:`torch.Tensor` s to zero.
+        r"""Sets the gradients of all optimized :class:`torch.Tensor` s to zero.
 
         Arguments:
-            set_to_none (bool): instead of setting to zero, set the grad to None.
+            set_to_none (bool): instead of setting to zero, set the grads to None.
                 This is will in general have lower memory footprint, and can modestly improve performance.
                 However, it changes certain behaviors. For example:
-                1. When user tries to access the gradient value and perform manual ops on it.
-                A None attribute or a Tensor full of 0s will be different.
-                2. If the user requests `zero_grad(set_to_none=True)` followed by a backward pass, `.grad` s
+                1. When the user tries to access a gradient and perform manual ops on it,
+                a None attribute or a Tensor full of 0s will behave differently.
+                2. If the user requests ``zero_grad(set_to_none=True)`` followed by a backward pass, ``.grad``\ s
                 are guaranteed to be None for params that did not receive a gradient.
-                3. `torch.optim` optimizers have a different behavior if the gradient is 0 or None
-                (in one case it does the step with a gradient of 0 and in the other it skip
+                3. ``torch.optim`` optimizers have a different behavior if the gradient is 0 or None
+                (in one case it does the step with a gradient of 0 and in the other it skips
                 the step altogether).
         """
         for group in self.param_groups:
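
---
Supplementary note (not part of the patch): a minimal sketch of the training loop the edited docs describe, combining `torch.cuda.amp.autocast` and `torch.cuda.amp.GradScaler` with `zero_grad(set_to_none=True)`. It assumes a CUDA device and a PyTorch build that already has the `set_to_none` argument; the model, optimizer, loss, and data below are illustrative placeholders, not anything taken from the PR.

import torch

device = "cuda"  # autocast/GradScaler as documented here target CUDA ops
model = torch.nn.Linear(64, 8).to(device)                # placeholder model
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)  # placeholder optimizer
loss_fn = torch.nn.MSELoss()
scaler = torch.cuda.amp.GradScaler()

for step in range(10):
    inp = torch.randn(32, 64, device=device)             # placeholder batch
    target = torch.randn(32, 8, device=device)

    # set_to_none=True releases the gradient tensors instead of zeroing them;
    # until the next backward pass, p.grad is None rather than a tensor of 0s.
    optimizer.zero_grad(set_to_none=True)

    with torch.cuda.amp.autocast():    # run forward ops in float16 where safe
        loss = loss_fn(model(inp), target)

    scaler.scale(loss).backward()      # scale the loss to reduce fp16 gradient underflow
    scaler.step(optimizer)             # unscales gradients, then calls optimizer.step()
    scaler.update()                    # adjust the scale factor for the next iteration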