mirror of https://github.com/zebrajr/pytorch.git
synced 2025-12-06 12:20:52 +01:00

Enable xdoctest runner in CI for real this time (#83816)

Builds on #83317 and enables running the doctests. Just need to figure out what is causing the failures.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/83816
Approved by: https://github.com/ezyang, https://github.com/malfet

This commit is contained in:
parent fb4fc0dabe
commit ad782ff7df
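Most of the diff below does two things: it bumps the pinned xdoctest version from 1.0.2 to 1.1.0, and it annotates docstring examples with xdoctest directives so the CI runner can skip or gate them. As a hedged illustration (the directive names are real xdoctest syntax; the snippet itself is not from the diff), a gated doctest looks like this:

    import torch

    def halve(x):
        """
        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_EXAMPLE)
        >>> halve(torch.ones(4) * 2).sum()
        tensor(4.)
        """
        return x / 2

The example only executes when the named environment variable is set to 1 (the run_doctests changes below set those variables); `# xdoctest: +SKIP` disables a snippet unconditionally.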
@@ -179,9 +179,9 @@ pytest-rerunfailures
 #Pinned versions:
 #test that import:
 
-xdoctest==1.0.2
+xdoctest==1.1.0
 #Description: runs doctests in pytest
-#Pinned versions: 1.0.2
+#Pinned versions: 1.1.0
 #test that import:
 
 pygments==2.12.0
@@ -19,4 +19,4 @@ pytest-shard==0.1.2
 scipy==1.9.0
 sympy==1.11.1
 unittest-xml-reporting<=3.2.0,>=2.0.0
-xdoctest==1.0.2
+xdoctest==1.1.0
@@ -4,8 +4,11 @@ This script simply runs the torch doctests via the xdoctest runner.
 
 This must be run from the root of the torch repo, as it needs the path to the
 torch source code.
+
+This script is provided as a developer convenience. On the CI the doctests are
+invoked in 'run_test.py'
 "
 # To simply list tests
 # xdoctest -m torch --style=google list
 
 # Reference: https://stackoverflow.com/questions/59895/bash-script-dir
@@ -16,14 +19,10 @@ echo "TORCH_MODPATH = $TORCH_MODPATH"
 if [[ ! -d "$TORCH_MODPATH" ]] ; then
     echo "Could not find the path to the torch module"
 else
-
-    # Next version of xdoctest will support environment variables that overlo
-
-
     export XDOCTEST_GLOBAL_EXEC="from torch import nn\nimport torch.nn.functional as F\nimport torch"
     export XDOCTEST_OPTIONS="+IGNORE_WHITESPACE"
     # Note: google wont catch numpy style docstrings (a few exist) but it also wont fail
    # on things not intended to be doctests.
    export XDOCTEST_STYLE="google"
-    xdoctest "$TORCH_MODPATH" --style="$XDOCTEST_STYLE" --global-exec "$XDOCTEST_GLOBAL_EXEC" --options="$XDOCTEST_OPTIONS"
+    xdoctest torch "$TORCH_MODPATH" --style="$XDOCTEST_STYLE" --global-exec "$XDOCTEST_GLOBAL_EXEC" --options="$XDOCTEST_OPTIONS"
 fi
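For reference, a roughly equivalent invocation through xdoctest's Python API (a sketch, assuming xdoctest >= 1.1.0; on the CI the doctests instead go through run_test.py as the docstring above notes):

    import os
    import torch
    import xdoctest

    modpath = os.path.dirname(torch.__file__)
    xdoctest.doctest_module(
        modpath,
        command='list',  # parse and syntax-check only; use 'all' to execute
        style='google',
        argv=[],
    )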
@@ -659,10 +659,9 @@ def run_doctests(test_module, test_directory, options):
     import pathlib
-    pkgpath = pathlib.Path(torch.__file__).parent
 
     #
     enabled = {
         # TODO: expose these options to the user
-        # Temporary disable all feature-conditional tests
+        # For now disable all feature-conditional tests
         # 'lapack': 'auto',
         # 'cuda': 'auto',
         # 'cuda1': 'auto',
@@ -671,6 +670,9 @@ def run_doctests(test_module, test_directory, options):
         'cuda': 0,
         'cuda1': 0,
         'qengine': 0,
+        'autograd_profiler': 0,
+        'cpp_ext': 0,
+        'monitor': 0,
     }
 
     # Resolve "auto" based on a test to determine if the feature is available.
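The "Resolve 'auto'" step referenced in the comment amounts to probing each feature at runtime. A minimal sketch of what such a probe could look like (the dict literal here is illustrative, not the diff's):

    import torch

    enabled = {'cuda': 'auto', 'lapack': 'auto'}
    if enabled['cuda'] == 'auto':
        enabled['cuda'] = int(torch.cuda.is_available())
    if enabled['lapack'] == 'auto':
        enabled['lapack'] = int(torch._C.has_lapack)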
@@ -707,13 +709,34 @@ def run_doctests(test_module, test_directory, options):
     if enabled['qengine']:
         os.environ['TORCH_DOCTEST_QENGINE'] = '1'
 
+    if enabled['autograd_profiler']:
+        os.environ['TORCH_DOCTEST_AUTOGRAD_PROFILER'] = '1'
+
+    if enabled['cpp_ext']:
+        os.environ['TORCH_DOCTEST_CPP_EXT'] = '1'
+
+    if enabled['monitor']:
+        os.environ['TORCH_DOCTEST_MONITOR'] = '1'
+
     if 0:
         # TODO: could try to enable some of these
         os.environ['TORCH_DOCTEST_QUANTIZED_DYNAMIC'] = '1'
         os.environ['TORCH_DOCTEST_ANOMOLY'] = '1'
         os.environ['TORCH_DOCTEST_AUTOGRAD'] = '1'
         os.environ['TORCH_DOCTEST_HUB'] = '1'
         os.environ['TORCH_DOCTEST_DATALOADER'] = '1'
         os.environ['TORCH_DOCTEST_ONNX'] = '1'
         os.environ['TORCH_DOCTEST_FUTURES'] = '1'
 
+    pkgpath = os.path.dirname(torch.__file__)
+
+    xdoctest_config = {
+        'global_exec': r'\n'.join([
+            'from torch import nn',
+            'import torch.nn.functional as F',
+            'import torch',
+        ]),
+        'analysis': 'static',  # set to "auto" to test doctests in compiled modules
+        'style': 'google',
+        'options': '+IGNORE_WHITESPACE',
+    }
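A sketch of how a config dict like this can be handed to the runner (`config`, `command`, and `argv` are keyword arguments of xdoctest's public `doctest_module`; the exact call in run_test.py is not shown in this hunk):

    import xdoctest

    run_summary = xdoctest.runner.doctest_module(
        pkgpath, command='list', argv=[], config=xdoctest_config)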
@@ -1016,7 +1039,7 @@ def parse_args():
     )
     parser.add_argument(
         "--xdoctest-command",
-        default='list',
+        default='all',
         help=(
             "Control the specific doctest action. "
             "Use 'list' to simply parse doctests and check syntax. "
@@ -627,10 +627,10 @@ def use_deterministic_algorithms(mode, *, warn_only=False):
 
     Example::
 
+        >>> # xdoctest: +SKIP
         >>> torch.use_deterministic_algorithms(True)
 
         # Forward mode nondeterministic error
-        >>> # xdoctest: +SKIP
         >>> torch.randn(10, device='cuda').kthvalue(0)
         ...
         RuntimeError: kthvalue CUDA does not have a deterministic implementation...
@@ -251,6 +251,7 @@ def vjp(func: Callable, *primals, has_aux: bool = False):
 
     Case 2: Using ``vjp`` inside ``torch.no_grad`` context manager:
 
+        >>> # xdoctest: +SKIP(failing)
        >>> with torch.no_grad():
        >>>     vjp(f)(x)
@@ -1286,6 +1287,7 @@ def grad(func: Callable, argnums: argnums_t = 0, has_aux: bool = False) -> Calla
 
     Example of using ``grad``:
 
+        >>> # xdoctest: +SKIP
        >>> from torch.func import grad
        >>> x = torch.randn([])
        >>> cos_x = grad(lambda x: torch.sin(x))(x)
@@ -1297,6 +1299,7 @@ def grad(func: Callable, argnums: argnums_t = 0, has_aux: bool = False) -> Calla
 
     When composed with ``vmap``, ``grad`` can be used to compute per-sample-gradients:
 
+        >>> # xdoctest: +SKIP
        >>> from torch.func import grad, vmap
        >>> batch_size, feature_size = 3, 5
        >>>
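The composition the docstring describes can be run end to end; here is a self-contained sketch (the model and sizes are illustrative, not from the diff):

    import torch
    from torch.func import grad, vmap

    batch_size, feature_size = 3, 5
    weights = torch.randn(feature_size)

    def loss_fn(weights, example, target):
        pred = example @ weights        # simple linear model
        return ((pred - target) ** 2).mean()

    examples = torch.randn(batch_size, feature_size)
    targets = torch.randn(batch_size)
    # grad differentiates w.r.t. weights; vmap maps over the batch axis
    per_sample_grads = vmap(grad(loss_fn), in_dims=(None, 0, 0))(
        weights, examples, targets)
    print(per_sample_grads.shape)       # torch.Size([3, 5])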
@@ -1317,6 +1320,7 @@ def grad(func: Callable, argnums: argnums_t = 0, has_aux: bool = False) -> Calla
 
     Example of using ``grad`` with ``has_aux`` and ``argnums``:
 
+        >>> # xdoctest: +SKIP
        >>> from torch.func import grad
        >>> def my_loss_func(y, y_pred):
        >>>     loss_per_sample = (0.5 * y_pred - y) ** 2
@@ -1327,13 +1331,14 @@ def grad(func: Callable, argnums: argnums_t = 0, has_aux: bool = False) -> Calla
        >>> y_true = torch.rand(4)
        >>> y_preds = torch.rand(4, requires_grad=True)
        >>> out = fn(y_true, y_preds)
-        >>> > output is ((grads w.r.t y_true, grads w.r.t y_preds), (y_pred, loss_per_sample))
+        >>> # > output is ((grads w.r.t y_true, grads w.r.t y_preds), (y_pred, loss_per_sample))
 
     .. note::
         Using PyTorch ``torch.no_grad`` together with ``grad``.
 
         Case 1: Using ``torch.no_grad`` inside a function:
 
+            >>> # xdoctest: +SKIP
            >>> def f(x):
            >>>     with torch.no_grad():
            >>>         c = x ** 2
@@ -1343,6 +1348,7 @@ def grad(func: Callable, argnums: argnums_t = 0, has_aux: bool = False) -> Calla
 
         Case 2: Using ``grad`` inside ``torch.no_grad`` context manager:
 
+            >>> # xdoctest: +SKIP
            >>> with torch.no_grad():
            >>>     grad(f)(x)
@@ -1433,11 +1439,12 @@ def functionalize(func: Callable, *, remove: str = 'mutations') -> Callable:
 
     Example::
 
+        >>> # xdoctest: +SKIP
        >>> import torch
        >>> from torch.fx.experimental.proxy_tensor import make_fx
        >>> from torch.func import functionalize
        >>>
-        >>> A function that uses mutations and views, but only on intermediate tensors.
+        >>> # A function that uses mutations and views, but only on intermediate tensors.
        >>> def f(a):
        ...     b = a + 1
        ...     c = b.view(-1)
@@ -1490,17 +1497,17 @@ def functionalize(func: Callable, *, remove: str = 'mutations') -> Callable:
        return view_copy_1
 
 
-        >>> A function that mutates its input tensor
+        >>> # A function that mutates its input tensor
        >>> def f(a):
        ...     b = a.view(-1)
        ...     b.add_(1)
        ...     return a
        ...
        >>> f_no_mutations_and_views_traced = make_fx(functionalize(f, remove='mutations_and_views'))(inpt)
        >>>
-        >>> All mutations and views have been removed,
-        >>> but there is an extra copy_ in the graph to correctly apply the mutation to the input
-        >>> after the function has completed.
+        >>> #
+        >>> # All mutations and views have been removed,
+        >>> # but there is an extra copy_ in the graph to correctly apply the mutation to the input
+        >>> # after the function has completed.
        >>> print(f_no_mutations_and_views_traced.code)
@@ -69,6 +69,7 @@ def minifier(fail_f: fx.GraphModule, inps, module_fails, dump_state: Callable =
     2. Delta Debugging: Tries replacing half of the graph with inputs. If fails,
        tries replacing quarter of the graph, etc.
 
+    >>> # xdoctest: +SKIP(failing)
    >>> failing_function = fx.symbolic_trace(f)
    >>> minimize(failing_function, [torch.randn(5)], lambda fx_g, inps: fx_g(*inps))
@@ -122,10 +122,12 @@ def update_names(tensor, names, rename_map, inplace):
 
    For example,
    ```
+    >>> # xdoctest: +SKIP
    >>> x = torch.empty(2, 3, 5, 7, names=('N', 'C', 'H', 'W'))
    >>> x.rename('...', 'height', 'width').names
    ('N', 'C', 'height', 'width')
 
+    >>> # xdoctest: +SKIP
    >>> x.rename('batch', '...', 'width').names
    ('batch', 'C', 'H', 'width')
@@ -136,6 +138,7 @@ def update_names(tensor, names, rename_map, inplace):
 
    For example,
    ```
+    >>> # xdoctest: +SKIP
    >>> x = torch.empty(2, 3, 5, 7, names=('N', 'C', 'H', 'W'))
    >>> x.rename(W='width', H='height').names
    ('N', 'C', 'height', 'width')
@@ -1496,6 +1496,7 @@ def compute_required_storage_length(
    >>> compute_required_storage_length(t.shape, t.stride(), t.storage_offset())
    200
 
+    >>> # xdoctest: +SKIP(failing)
    >>> t2 = torch.empty_strided((1, 2, 3), (5, 7, 11))
    >>> size = compute_required_storage_length(t2.shape, t2.stride(), t2.storage_offset())
    >>> size == t.storage().size()
@@ -215,7 +215,6 @@ def _vector_str(self, indent, summarize, formatter1, formatter2=None):
    elements_per_line = max(
        1, int(math.floor((PRINT_OPTS.linewidth - indent) / (element_length)))
    )
-    # char_per_line = element_length * elements_per_line  # unused
 
    def _val_formatter(val, formatter1=formatter1, formatter2=formatter2):
        if formatter2 is not None:
@@ -9,6 +9,7 @@ from torch.utils._pytree import _broadcast_to_and_flatten, tree_flatten, tree_unflatten
 in_dims_t = Union[int, Tuple]
 out_dims_t = Union[int, Tuple[int, ...]]
 
+
 # Checks that all args-to-be-batched have the same batch dim size
 def _validate_and_get_batch_size(
     flat_in_dims: List[Optional[int]], flat_args: List
@@ -19,9 +19,9 @@ class LinearReLU(nnqd.Linear):
 
    Examples::
 
+        >>> # xdoctest: +SKIP
        >>> m = nn.intrinsic.quantized.dynamic.LinearReLU(20, 30)
        >>> input = torch.randn(128, 20)
-        >>> # xdoctest: +SKIP
        >>> output = m(input)
        >>> print(output.size())
        torch.Size([128, 30])
@@ -56,6 +56,7 @@ class LinearLeakyReLU(nnq.Linear):
        Same as torch.nn.quantized.Linear
        + negative_slope
    Examples::
+        >>> # xdoctest: +SKIP
        >>> m = nn.intrinsic.LinearLeakyReLU(20, 30, 0.01)
        >>> input = torch.randn(128, 20)
        >>> output = m(input)
@@ -15,6 +15,7 @@ import warnings
 
 __all__ = ['Conv1d', 'Conv2d', 'Conv3d', 'ConvTranspose1d', 'ConvTranspose2d', 'ConvTranspose3d']
 
+
 class Conv1d(nnq.Conv1d):
     r"""A dynamically quantized conv module with floating point tensors as inputs and outputs.
 
@@ -31,9 +32,9 @@ class Conv1d(nnq.Conv1d):
 
    Examples::
 
+        >>> # xdoctest: +SKIP
        >>> m = nn.quantized.dynamic.Conv1d(16, 33, 3, stride=2)
        >>> input = torch.randn(20, 16, 100)
-        >>> # xdoctest: +SKIP
        >>> output = m(input)
 
    """
@@ -102,6 +103,7 @@ class Conv2d(nnq.Conv2d):
 
    Examples::
 
+        >>> # xdoctest: +SKIP
        >>> # With square kernels and equal stride
        >>> m = nn.quantized.dynamic.Conv2d(16, 33, 3, stride=2)
        >>> # non-square kernels and unequal stride and with padding
@@ -109,7 +111,6 @@ class Conv2d(nnq.Conv2d):
        >>> # non-square kernels and unequal stride and with padding and dilation
        >>> m = nn.quantized.dynamic.Conv2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2), dilation=(3, 1))
        >>> input = torch.randn(20, 16, 50, 100)
-        >>> # xdoctest: +SKIP
        >>> output = m(input)
 
    """
@@ -167,6 +168,7 @@ class Conv3d(nnq.Conv3d):
 
    Examples::
 
+        >>> # xdoctest: +SKIP
        >>> # With square kernels and equal stride
        >>> m = nn.quantized.dynamic.Conv3d(16, 33, 3, stride=2)
        >>> # non-square kernels and unequal stride and with padding
@@ -174,7 +176,6 @@ class Conv3d(nnq.Conv3d):
        >>> # non-square kernels and unequal stride and with padding and dilation
        >>> m = nn.quantized.dynamic.Conv3d(16, 33, (3, 5, 5), stride=(1, 2, 2), padding=(1, 2, 2), dilation=(1, 2, 2))
        >>> input = torch.randn(20, 16, 56, 56, 56)
-        >>> # xdoctest: +SKIP
        >>> output = m(input)
 
    """
@@ -233,8 +234,8 @@ class ConvTranspose1d(nnq.ConvTranspose1d):
 
    Examples::
 
+        >>> # xdoctest: +SKIP
        >>> # With square kernels and equal stride
        >>> m = nndq.ConvTranspose1d(16, 33, 3, stride=2)
        >>> # non-square kernels and unequal stride and with padding
        >>> m = nndq.ConvTranspose1d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2))
@@ -294,11 +295,11 @@ class ConvTranspose2d(nnq.ConvTranspose2d):
 
    Examples::
 
+        >>> # xdoctest: +SKIP
        >>> # With square kernels and equal stride
        >>> m = nnq.ConvTranspose2d(16, 33, 3, stride=2)
        >>> # non-square kernels and unequal stride and with padding
        >>> m = nnq.ConvTranspose2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2))
-        >>> # xdoctest: +SKIP
        >>> output = m(input)
        >>> # exact output size can be also specified as an argument
        >>> downsample = nnq.Conv2d(16, 16, 3, stride=2, padding=1)
@@ -355,11 +356,11 @@ class ConvTranspose3d(nnq.ConvTranspose3d):
 
    Examples::
 
+        >>> # xdoctest: +SKIP
        >>> # With cubic kernels and equal stride
        >>> m = nnq.ConvTranspose3d(16, 33, 3, stride=2)
        >>> # non-cubic kernels and unequal stride and with padding
        >>> m = nnq.ConvTranspose3d(16, 33, (3, 3, 5), stride=(2, 1, 1), padding=(4, 2, 2))
-        >>> # xdoctest: +SKIP
        >>> output = m(input)
        >>> # exact output size can be also specified as an argument
        >>> downsample = nnq.Conv3d(16, 16, 3, stride=2, padding=1)
@@ -7,6 +7,7 @@ __all__ = [
     "Linear",
 ]
 
+
 class Linear(nnq.Linear):
     r"""
     A dynamic quantized linear module with floating point tensor as inputs and outputs.
@@ -25,9 +26,9 @@ class Linear(nnq.Linear):
 
    Examples::
 
+        >>> # xdoctest: +SKIP
        >>> m = nn.quantized.dynamic.Linear(20, 30)
        >>> input = torch.randn(128, 20)
-        >>> # xdoctest: +SKIP
        >>> output = m(input)
        >>> print(output.size())
        torch.Size([128, 30])
@@ -11,13 +11,16 @@ from torch.ao.nn.quantized.modules.utils import _quantize_weight
 __all__ = ['pack_weight_bias', 'PackedParameter', 'RNNBase', 'LSTM', 'GRU', 'RNNCellBase', 'RNNCell', 'LSTMCell',
            'GRUCell', "apply_permutation"]
 
 
+def _apply_permutation(tensor: Tensor, permutation: Tensor, dim: int = 1) -> Tensor:
+    return tensor.index_select(dim, permutation)
+
+
 def apply_permutation(tensor: Tensor, permutation: Tensor, dim: int = 1) -> Tensor:
-    return tensor.index_select(dim, permutation)
+    warnings.warn("apply_permutation is deprecated, please use tensor.index_select(dim, permutation) instead")
+    return _apply_permutation(tensor, permutation, dim)
 
 
 def pack_weight_bias(qweight, bias, dtype):
 
     if dtype == torch.qint8:
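The deprecation shim forwards to `_apply_permutation`; per the warning text, callers can switch to `Tensor.index_select` directly, e.g.:

    import torch

    x = torch.arange(6).reshape(2, 3)
    perm = torch.tensor([2, 0, 1])
    # equivalent ways to permute along dim 1
    assert torch.equal(x.index_select(1, perm), x[:, perm])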
@@ -39,6 +42,7 @@ def pack_weight_bias(qweight, bias, dtype):
 
     return packed_weight
 
+
 class PackedParameter(torch.nn.Module):
     def __init__(self, param):
         super(PackedParameter, self).__init__()
@@ -54,6 +58,7 @@ class PackedParameter(torch.nn.Module):
         super(PackedParameter, self)._load_from_state_dict(state_dict, prefix, local_metadata, False,
                                                            missing_keys, unexpected_keys, error_msgs)
 
+
 class RNNBase(torch.nn.Module):
 
     _FLOAT_MODULE = nn.RNNBase
@@ -347,7 +352,6 @@ class RNNBase(torch.nn.Module):
 
         return qRNNBase
 
-
     def _weight_bias(self):
         # Returns a dict of weights and biases
         weight_bias_dict: Dict[str, Dict] = {'weight' : {}, 'bias' : {}}
@@ -376,6 +380,7 @@ class RNNBase(torch.nn.Module):
     def get_bias(self):
         return self._weight_bias()['bias']
 
+
 class LSTM(RNNBase):
     r"""
     A dynamic quantized LSTM module with floating point tensor as inputs and outputs.
@@ -384,6 +389,7 @@ class LSTM(RNNBase):
 
    Examples::
 
+        >>> # xdoctest: +SKIP
        >>> rnn = nn.LSTM(10, 20, 2)
        >>> input = torch.randn(5, 3, 10)
        >>> h0 = torch.randn(2, 3, 20)
@@ -610,6 +616,7 @@ class GRU(RNNBase):
 
    Examples::
 
+        >>> # xdoctest: +SKIP
        >>> rnn = nn.GRU(10, 20, 2)
        >>> input = torch.randn(5, 3, 10)
        >>> h0 = torch.randn(2, 3, 20)
@@ -922,6 +929,7 @@ class RNNCellBase(torch.nn.Module):
         super(RNNCellBase, self)._load_from_state_dict(state_dict, prefix, local_metadata, False,
                                                        missing_keys, unexpected_keys, error_msgs)
 
+
 class RNNCell(RNNCellBase):
     r"""An Elman RNN cell with tanh or ReLU non-linearity.
     A dynamic quantized RNNCell module with floating point tensor as inputs and outputs.
@@ -930,6 +938,7 @@ class RNNCell(RNNCellBase):
 
    Examples::
 
+        >>> # xdoctest: +SKIP
        >>> rnn = nn.RNNCell(10, 20)
        >>> input = torch.randn(6, 3, 10)
        >>> hx = torch.randn(3, 20)
@@ -982,6 +991,7 @@ class LSTMCell(RNNCellBase):
 
    Examples::
 
+        >>> # xdoctest: +SKIP
        >>> rnn = nn.LSTMCell(10, 20)
        >>> input = torch.randn(6, 3, 10)
        >>> hx = torch.randn(3, 20)
@@ -1014,6 +1024,7 @@ class LSTMCell(RNNCellBase):
     def from_float(cls, mod):
         return super(LSTMCell, cls).from_float(mod)
 
+
 class GRUCell(RNNCellBase):
     r"""A gated recurrent unit (GRU) cell
 
@@ -1023,6 +1034,7 @@ class GRUCell(RNNCellBase):
 
    Examples::
 
+        >>> # xdoctest: +SKIP
        >>> rnn = nn.GRUCell(10, 20)
        >>> input = torch.randn(6, 3, 10)
        >>> hx = torch.randn(3, 20)
@@ -164,6 +164,7 @@ def conv1d(input, weight, bias,
 
    Examples::
 
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_QENGINE)
        >>> from torch.ao.nn.quantized import functional as qF
        >>> filters = torch.randn(33, 16, 3, dtype=torch.float)
        >>> inputs = torch.randn(20, 16, 50, dtype=torch.float)
@@ -223,6 +224,7 @@ def conv2d(input, weight, bias,
 
    Examples::
 
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_QENGINE)
        >>> from torch.ao.nn.quantized import functional as qF
        >>> filters = torch.randn(8, 4, 3, 3, dtype=torch.float)
        >>> inputs = torch.randn(1, 4, 5, 5, dtype=torch.float)
@@ -283,6 +285,7 @@ def conv3d(input, weight, bias, stride=1, padding=0, dilation=1, groups=1,
 
    Examples::
 
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_QENGINE)
        >>> from torch.ao.nn.quantized import functional as qF
        >>> filters = torch.randn(8, 4, 3, 3, 3, dtype=torch.float)
        >>> inputs = torch.randn(1, 4, 5, 5, 5, dtype=torch.float)
@@ -293,6 +293,7 @@ class Conv1d(_ConvNd):
 
    Examples::
 
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_QENGINE)
        >>> m = nn.quantized.Conv1d(16, 33, 3, stride=2)
        >>> input = torch.randn(20, 16, 100)
        >>> # quantize input to quint8
@@ -400,6 +401,7 @@ class Conv2d(_ConvNd):
 
    Examples::
 
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_QENGINE)
        >>> # With square kernels and equal stride
        >>> m = nn.quantized.Conv2d(16, 33, 3, stride=2)
        >>> # non-square kernels and unequal stride and with padding
@@ -498,6 +500,7 @@ class Conv3d(_ConvNd):
 
    Examples::
 
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_QENGINE)
        >>> # With square kernels and equal stride
        >>> m = nn.quantized.Conv3d(16, 33, 3, stride=2)
        >>> # non-square kernels and unequal stride and with padding
@@ -115,6 +115,7 @@ class Linear(WeightedQuantizedModule):
 
    Examples::
 
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_QENGINE)
        >>> m = nn.quantized.Linear(20, 30)
        >>> input = torch.randn(128, 20)
        >>> # xdoctest: +SKIP
@@ -88,6 +88,7 @@ class DTypeConfig:
 
    Example usage::
 
+        >>> # xdoctest: +SKIP(failing)
        >>> dtype_config1 = DTypeConfig(
        ...     input_dtype=torch.quint8,
        ...     output_dtype=torch.quint8,
@@ -77,6 +77,7 @@ def _fuse_linear_bn_leaky_relu(is_qat, linear, bn, leaky_relu):
        bn: BatchNorm1d instance that needs to be fused with the linear layer
        leaky_relu: LeakyReLU instance that needs to be fused with the linear layer
    Examples::
+        >>> # xdoctest: +SKIP(failing)
        >>> m1 = nn.Linear(20, 10)
        >>> b1 = nn.BatchNorm1d(10)
        >>> lr = nn.LeakyReLU(0.01)
@@ -5,6 +5,7 @@ from typing import Any
 
 __all__ = ["detect_anomaly", "set_detect_anomaly"]
 
+
 class detect_anomaly(object):
     r"""Context-manager that enable anomaly detection for the autograd engine.
 
@@ -22,6 +23,7 @@ class detect_anomaly(object):
 
    Example:
 
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_ANOMOLY)
        >>> import torch
        >>> from torch import autograd
        >>> class MyFunc(autograd.Function):
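A smaller self-contained sketch of what the gated example exercises: under `detect_anomaly`, a backward pass that produces NaNs raises instead of propagating them silently (the op used here is illustrative, not the docstring's MyFunc):

    import torch

    x = torch.tensor(-1.0, requires_grad=True)
    with torch.autograd.detect_anomaly():
        y = torch.sqrt(x)       # NaN in the forward pass
        try:
            y.backward()        # NaN in backward triggers a RuntimeError
        except RuntimeError as err:
            print(type(err).__name__)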
@ -11,6 +11,7 @@ __all__ = ["UnpackedDualTensor", "enter_dual_level", "exit_dual_level", "make_du
|
|||
# Global variable used to make the python API simpler to use
|
||||
_current_level = -1
|
||||
|
||||
|
||||
def enter_dual_level():
|
||||
r"""Function that can be used to enter a new forward grad level.
|
||||
This level can be used to make and unpack dual Tensors to compute
|
||||
|
|
@@ -27,6 +28,7 @@ def enter_dual_level():
     _current_level = new_level
     return new_level
 
+
 def exit_dual_level(*, level=None):
     r"""Function that can be used to exit a forward grad level.
     This function deletes all the gradients associated with this
@@ -44,6 +46,7 @@ def exit_dual_level(*, level=None):
     torch._C._exit_dual_level(level=level)
     _current_level = level - 1
 
+
 def make_dual(tensor, tangent, *, level=None):
     r"""Associates a tensor value with a forward gradient, the tangent, to create a
     "dual tensor", which is used to compute forward AD gradients.
@@ -104,11 +107,13 @@ def make_dual(tensor, tangent, *, level=None):
 
 _UnpackedDualTensor = namedtuple('_UnpackedDualTensor', ['primal', 'tangent'])
 
+
 class UnpackedDualTensor(_UnpackedDualTensor):
     r"""Namedtuple returned by :func:`unpack_dual` containing the primal and tangent components of the dual tensor.
     See :func:`unpack_dual` for more details."""
     pass
 
+
 def unpack_dual(tensor, *, level=None):
     r"""Unpacks a "dual tensor" to get both its Tensor value and its forward AD gradient.
     The result is a namedtuple ``(primal, tangent)`` where ``primal`` is a view of
@@ -139,6 +144,7 @@ def unpack_dual(tensor, *, level=None):
 
     return UnpackedDualTensor(primal, dual)
 
+
 class dual_level(_DecoratorContextManager):
     r"""Context-manager that enables forward AD. All forward AD computation must
     be performed in a ``dual_level`` context.
@@ -48,6 +48,7 @@ class FunctionCtx(object):
        See :ref:`extending-autograd` for more details on how to use this method.
 
        Example::
+            >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_AUTOGRAD)
            >>> class Func(Function):
            >>>     @staticmethod
            >>>     def forward(ctx, x: torch.Tensor, y: torch.Tensor, z: int):
@@ -139,6 +140,7 @@ class FunctionCtx(object):
            modification.
 
        Examples::
+            >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_AUTOGRAD)
            >>> class Inplace(Function):
            >>>     @staticmethod
            >>>     def forward(ctx, x):
@@ -210,6 +212,7 @@ class FunctionCtx(object):
            prior to calling the :func:`backward` and :func:`jvp` methods.
 
        Example::
+            >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_AUTOGRAD)
            >>> class SimpleFunc(Function):
            >>>     @staticmethod
            >>>     def forward(ctx, x):
@@ -382,6 +385,7 @@ class Function(_SingleLevelFunction):
 
    Examples::
 
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_AUTOGRAD)
        >>> class Exp(Function):
        >>>     @staticmethod
        >>>     def forward(ctx, i):
@ -7,6 +7,7 @@ __all__ = ["vjp", "jvp", "jacobian", "hessian", "hvp", "vhp"]
|
|||
|
||||
# Utility functions
|
||||
|
||||
|
||||
def _as_tuple_nocheck(x):
|
||||
if isinstance(x, tuple):
|
||||
return x
|
||||
|
|
@@ -15,6 +16,7 @@ def _as_tuple_nocheck(x):
     else:
         return x,
 
+
 def _as_tuple(inp, arg_name=None, fn_name=None):
     # Ensures that inp is a tuple of Tensors
     # Returns whether or not the original inp was a tuple and the tupled version of the input
@@ -37,6 +39,7 @@ def _as_tuple(inp, arg_name=None, fn_name=None):
 
     return is_inp_tuple, inp
 
+
 def _tuple_postprocess(res, to_unpack):
     # Unpacks a potentially nested tuple of Tensors
     # to_unpack should be a single boolean or a tuple of two booleans.
@@ -54,6 +57,7 @@ def _tuple_postprocess(res, to_unpack):
         res = res[0]
     return res
 
+
 def _grad_preprocess(inputs, create_graph, need_graph):
     # Preprocess the inputs to make sure they require gradient
     # inputs is a tuple of Tensors to preprocess
@@ -88,6 +92,7 @@ def _grad_postprocess(inputs, create_graph):
     else:
         return tuple(_grad_postprocess(inp, create_graph) for inp in inputs)
 
+
 def _validate_v(v, other, is_other_tuple):
     # This assumes that other is the correct shape, and v should match
     # Both are assumed to be tuples of Tensors
@@ -138,6 +143,7 @@ def _check_requires_grad(inputs, input_type, strict):
                " The outputs must be computed in a differentiable manner from the input"
                " when running in strict mode.".format(i))
 
+
 def _autograd_grad(outputs, inputs, grad_outputs=None, create_graph=False, retain_graph=None, is_grads_batched=False):
     # Version of autograd.grad that accepts `None` in outputs and do not compute gradients for them.
     # This has the extra constraint that inputs has to be a tuple
@@ -162,6 +168,7 @@ def _autograd_grad(outputs, inputs, grad_outputs=None, create_graph=False, retai
                        create_graph=create_graph, retain_graph=retain_graph,
                        is_grads_batched=is_grads_batched)
 
+
 def _fill_in_zeros(grads, refs, strict, create_graph, stage):
     # Used to detect None in the grads and depending on the flags, either replace them
     # with Tensors full of 0s of the appropriate size based on the refs or raise an error.
@@ -204,6 +211,7 @@ def _fill_in_zeros(grads, refs, strict, create_graph, stage):
 
     return res
 
+
 # Public API
 
 def vjp(func, inputs, v=None, create_graph=False, strict=False):
@@ -238,6 +246,7 @@ def vjp(func, inputs, v=None, create_graph=False, strict=False):
 
    Example:
 
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_AUTOGRAD)
        >>> def exp_reducer(x):
        ...     return x.exp().sum(dim=1)
        >>> inputs = torch.rand(4, 4)
@@ -335,6 +344,7 @@ def jvp(func, inputs, v=None, create_graph=False, strict=False):
 
    Example:
 
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_AUTOGRAD)
        >>> def exp_reducer(x):
        ...     return x.exp().sum(dim=1)
        >>> inputs = torch.rand(4, 4)
@@ -536,6 +546,7 @@ def jacobian(func, inputs, create_graph=False, strict=False, vectorize=False, st
 
    Example:
 
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_AUTOGRAD)
        >>> def exp_reducer(x):
        ...     return x.exp().sum(dim=1)
        >>> inputs = torch.rand(2, 2)
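The gated example runs as-is once TORCH_DOCTEST_AUTOGRAD is set; the same call also works directly (output shape (2,) against input shape (2, 2) gives a (2, 2, 2) Jacobian):

    import torch
    from torch.autograd.functional import jacobian

    def exp_reducer(x):
        return x.exp().sum(dim=1)

    inputs = torch.rand(2, 2)
    print(jacobian(exp_reducer, inputs).shape)  # torch.Size([2, 2, 2])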
@@ -698,6 +709,7 @@ def jacobian(func, inputs, create_graph=False, strict=False, vectorize=False, st
 
     return _tuple_postprocess(jacobian, (is_outputs_tuple, is_inputs_tuple))
 
+
 def hessian(func, inputs, create_graph=False, strict=False, vectorize=False, outer_jacobian_strategy="reverse-mode"):
     r"""Function that computes the Hessian of a given scalar function.
 
@@ -746,6 +758,7 @@ def hessian(func, inputs, create_graph=False, strict=False, vectorize=False, out
 
    Example:
 
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_AUTOGRAD)
        >>> def pow_reducer(x):
        ...     return x.pow(3).sum()
        >>> inputs = torch.rand(2, 2)
@@ -849,6 +862,7 @@ def vhp(func, inputs, v=None, create_graph=False, strict=False):
 
    Example:
 
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_AUTOGRAD)
        >>> def pow_reducer(x):
        ...     return x.pow(3).sum()
        >>> inputs = torch.rand(2, 2)
@@ -939,6 +953,7 @@ def hvp(func, inputs, v=None, create_graph=False, strict=False):
 
    Example:
 
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_AUTOGRAD)
        >>> def pow_reducer(x):
        ...     return x.pow(3).sum()
        >>> inputs = torch.rand(2, 2)
@@ -270,6 +270,7 @@ class inference_mode(_DecoratorContextManager):
        mode (bool): Flag whether to enable or disable inference mode
 
    Example::
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_AUTOGRAD)
        >>> import torch
        >>> x = torch.ones(1, 2, 3, requires_grad=True)
        >>> with torch.inference_mode():
@@ -48,6 +48,7 @@ class saved_tensors_hooks():
 
    Example::
 
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_AUTOGRAD)
        >>> def pack_hook(x):
        ...     print("Packing", x)
        ...     return x
@@ -107,6 +108,7 @@ class save_on_cpu(saved_tensors_hooks):
    Example::
 
        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA)
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_AUTOGRAD)
        >>> a = torch.randn(5, requires_grad=True, device="cuda")
        >>> b = torch.randn(5, requires_grad=True, device="cuda")
        >>> c = torch.randn(5, requires_grad=True, device="cuda")
@@ -160,6 +162,7 @@ def disable_saved_tensors_hooks(error_message):
 
    Example::
 
+        >>> # xdoctest: +SKIP(failing)
        >>> message = "saved tensors default hooks are disabled"
        >>> with torch.autograd.graph.disable_saved_tensors_hooks(message):
        ...     # Raises RuntimeError: saved tensors default hooks are disabled
@@ -121,6 +121,7 @@ class profile(object):
 
    Example:
-        >>> # xdoctest: +SKIP
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_AUTOGRAD_PROFILER)
        >>> x = torch.randn((1, 1), requires_grad=True)
        >>> with torch.autograd.profiler.profile() as prof:
        >>>     for _ in range(100):  # any normal python code, really!
@@ -453,6 +454,7 @@ class record_function(_ContextDecorator):
        non-distributed cases.
 
    Example:
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_AUTOGRAD_PROFILER)
        >>> x = torch.randn((1, 1), requires_grad=True)
        >>> with torch.autograd.profiler.profile() as prof:
        ...     y = x ** 2
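The profiler snippets themselves are runnable on CPU; a self-contained variant of the example above:

    import torch

    x = torch.randn((1, 1), requires_grad=True)
    with torch.autograd.profiler.profile() as prof:
        for _ in range(100):
            y = x ** 2
    print(prof.key_averages().table(sort_by="self_cpu_time_total", row_limit=5))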
@@ -578,6 +580,7 @@ class emit_itt(object):
 
    Example:
        >>> # xdoctest: +SKIP("Undefined variables")
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_AUTOGRAD_PROFILER)
        >>> with torch.autograd.profiler.emit_itt():
        ...     model(x)
 
@@ -646,6 +649,7 @@ class emit_nvtx(object):
 
    Example:
        >>> # xdoctest: +SKIP("undefined variables")
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_AUTOGRAD_PROFILER)
        >>> with torch.cuda.profiler.profile():
        ...     model(x)  # Warmup CUDA memory allocator and profiler
        ...     with torch.autograd.profiler.emit_nvtx():
@@ -6,6 +6,7 @@ import re
 
 __all__ : List[str] = []
 
+
 class _CodeParser:
     def __init__(self, code_string: str):
         optional_ws = r"\s*"
@@ -37,6 +38,7 @@ class _CodeParser:
         self.function_params = result["function_params"]
         self.function_body = result["function_body"]
 
+
 class _JittedFunction:
     def __init__(self, code_string: str, return_by_ref: bool, num_outputs: int, **kwargs):
         self.code_string = code_string
@@ -135,6 +137,7 @@ def _create_jit_fn(code_string: str, **kwargs) -> Callable:
 
     return _JittedFunction(code_string, return_by_ref=False, num_outputs=1, **kwargs)
 
+
 def _create_multi_output_jit_fn(code_string: str, num_outputs: int, **kwargs) -> Callable:
     """
     Create a jiterator-generated cuda kernel for an elementwise op that supports returning one or more outputs.
@@ -825,6 +825,7 @@ class DistributedDataParallel(Module):
        Example::
            Below is an example of a noop hook that returns the same tensor.
 
+            >>> # xdoctest: +REQUIRES(module:torch._C._distributed_c10d)
            >>> def noop(state: object, bucket: dist.GradBucket) -> torch.futures.Future[torch.Tensor]:
            >>>     fut = torch.futures.Future()
            >>>     fut.set_result(bucket.buffer())
@@ -837,6 +838,7 @@ class DistributedDataParallel(Module):
            Below is an example of a Parallel SGD algorithm where gradients are encoded before
            allreduce, and then decoded after allreduce.
 
+            >>> # xdoctest: +REQUIRES(module:torch._C._distributed_c10d)
            >>> def encode_and_decode(state: object, bucket: dist.GradBucket) -> torch.futures.Future[torch.Tensor]:
            >>>     encoded_tensor = encode(bucket.buffer())  # encode gradients
            >>>     fut = torch.distributed.all_reduce(encoded_tensor).get_future()
@@ -195,6 +195,7 @@ def checkpoint(module: nn.Module, *, use_reentrant: bool = True) -> nn.Module:
        autograd.
 
    Example::
+        >>> # xdoctest: +SKIP
        >>> import torch.nn as nn
        >>>
        >>> class MyModel(nn.Module):
@@ -41,6 +41,7 @@ def contract(state_cls: Type[_State] = _State):
        ``func.state(module)``.
 
    Example::
+        >>> # xdoctest: +SKIP
        >>> import torch.nn as nn
        >>>
        >>> class MyModel(nn.Module):
@@ -18,6 +18,7 @@ def replicate(
        module (torch.nn.Module): module to replicate
 
    Example::
+        >>> # xdoctest: +REQUIRES(module:torch._C._distributed_c10d)
        >>> module = nn.Linear(3, 3)
        >>> replicate(module)
    """
@@ -427,6 +427,7 @@ def custom_sharded_op_impl(func):
        parameters, the function provided will be invoked for that operator.
 
    Example::
+        >>> # xdoctest: +SKIP
        >>> @custom_sharded_op_impl(torch.nn.functional.linear)
        >>> def my_custom_sharded_linear(types, args, kwargs, process_group):
        >>>     ...
@@ -805,9 +805,9 @@ class ShardedTensor(ShardedTensorBase):
            tensor stored in the current rank.
 
        Examples:
+            >>> # xdoctest: +SKIP
            >>> # All tensors below are of torch.int64 type.
            >>> # We have 2 process groups, 2 ranks.
-            >>> # xdoctest: +SKIP
            >>> tensor = torch.arange(2, dtype=torch.int64) + 1 + 2 * rank
            >>> local_tensor = torch.unsqueeze(torch.cat([tensor, tensor + 2]))
            >>> local_tensor
@@ -955,8 +955,8 @@ class ShardedTensor(ShardedTensorBase):
            A :class:`ShardedTensor` object whose local shards are resharded.
 
        Examples:
-            >>> # We have 2 process groups, 2 ranks.
            >>> # xdoctest: +SKIP
+            >>> # We have 2 process groups, 2 ranks.
            >>> tensor = torch.arange(4, dtype=torch.int64) + 1 + 2 * rank
            >>> tensor = torch.stack([tensor, tensor])
            >>> tensor
@@ -36,6 +36,7 @@ class ShardingPlan(object):
        Suppose we want to shard a module with two linear layers and then run it with DDP, we also
        want to convert the output of the second linear layer back to DDP, we can do it as follows:
 
+        >>> # xdoctest: +REQUIRES(module:torch._C._distributed_c10d)
        >>> class MyModule(nn.Module):
        >>>     def __init__(self):
        >>>         super().__init__()
@@ -54,6 +54,7 @@ class MemoryTracker:
 
    Example usage:
 
+        >>> # xdoctest: +SKIP(failing)
        >>> net.cuda()
        >>> input = input.cuda()
 
@@ -25,6 +25,7 @@ if is_available():
         DistAutogradContext,
     )
 
+
 class context(object):
     '''
     Context object to wrap forward and backward passes when using
@@ -35,8 +36,8 @@ class context(object):
    autograd pass.
 
    Example::
-        >>> import torch.distributed.autograd as dist_autograd
+        >>> # xdoctest: +SKIP
+        >>> import torch.distributed.autograd as dist_autograd
        >>> with dist_autograd.context() as context_id:
        >>>     t1 = torch.rand((3, 3), requires_grad=True)
        >>>     t2 = torch.rand((3, 3), requires_grad=True)
@@ -202,6 +202,7 @@ def load_sharded_optimizer_state_dict(
    """
    Loads a state_dict to be used in conjuntion with FSDP sharded optimizer state.
    This is the current recommended way to checkpoint is FSDP
+    >>> # xdoctest: +SKIP
    >>> import torch.distributed.checkpoint as dist_cp
    >>> import spmd.checkpoint as sp_cp
    >>> # Save
@@ -224,7 +225,7 @@ def load_sharded_optimizer_state_dict(
    >>> with FSDP.state_dict_type(model_tp, StateDictType.SHARDED_STATE_DICT):
    >>>     model_state_dict = model_tp.state_dict()
    >>>     checkpoint = {
-    >>>         "model" = model_state_dict
+    >>>         "model": model_state_dict
    >>>     }
    >>>     dist_cp.load_state_dict(
    >>>         state_dict=checkpoint,
@@ -1940,6 +1940,7 @@ def _tensor_to_object(tensor, tensor_size):
     buf = tensor.numpy().tobytes()[:tensor_size]
     return _unpickler(io.BytesIO(buf)).load()
 
+
 def _check_for_nccl_backend(group):
     pg = group or _get_default_group()
     # Gate PG wrapper check on Gloo availability.
@@ -1954,6 +1955,7 @@ def _check_for_nccl_backend(group):
         pg.name() == Backend.NCCL
     )
 
+
 @exception_handler
 def all_gather_object(object_list, obj, group=None):
     """
@@ -323,11 +323,10 @@ class ZeroRedundancyOptimizer(Optimizer, Joinable):
 
    Example::
 
+        >>> # xdoctest: +SKIP
        >>> import torch.nn as nn
        >>> from torch.distributed.optim import ZeroRedundancyOptimizer
        >>> from torch.nn.parallel import DistributedDataParallel as DDP
-
-        >>> # xdoctest: +SKIP
        >>> model = nn.Sequential(*[nn.Linear(2000, 2000).to(rank) for _ in range(20)])
        >>> ddp = DDP(model, device_ids=[rank])
        >>> opt = ZeroRedundancyOptimizer(
@@ -30,10 +30,12 @@ def _prepare_input_validate(
        func (Callable): Same input function with validation logic added.
 
    Example::
+        >>> # xdoctest: +SKIP(failing)
        >>> @_prepare_input_validate
        >>> def make_input_shard_1d(args, kwargs):
        >>>     ...
        >>>
+        >>> # xdoctest: +SKIP(failing)
        >>> input = torch.rand(...)
        >>> dtensor = make_input_shard_1d(input, device_mesh, 1)
        >>> # This will call '_prepare_input_validate' first
@@ -71,14 +73,18 @@ def _prepare_output_validate(
    Inject common validation logics for _prepare_output funcs via this
    decorator, including verifying that output needs to be a DTensor
    and only 1D Device Mesh is passed in.
 
    Example::
+        >>> # xdoctest: +SKIP(failing)
        >>> @_prepare_output_validate
        >>> def make_output_shard_1d(args, kwargs):
        >>>     ...
        >>>
+        >>> # xdoctest: +SKIP(failing)
        >>> dt = distribute(tensor, device_mesh, [Shard(0)])
        >>> make_output_shard_1d(dt, device_mesh, 1)
        >>> # This will call '_prepare_output_validate' first
 
    Args:
        _prepare_output_func (Callable): The func we want to inject the
            validation into.
@@ -61,7 +61,7 @@ def parallelize_module(  # type: ignore[return]
 
    Example::
        >>> # xdoctest: +SKIP("distributed")
-        >>> from from torch.distributed._tensor.parallel import parallelize_module, PairwiseParallel
+        >>> from torch.distributed._tensor.parallel import parallelize_module, PairwiseParallel
        >>>
        >>> # Define the module.
        >>> m = Model(...)
@@ -8,6 +8,7 @@ from torch.distributions.utils import broadcast_all, lazy_property
 
 __all__ = ['VonMises']
 
+
 def _eval_poly(y, coef):
     coef = list(coef)
     result = coef.pop()
@@ -267,18 +267,18 @@ def einsum(*args: Any) -> Tensor:
 
    Examples::
 
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
        >>> # trace
        >>> torch.einsum('ii', torch.randn(4, 4))
        tensor(-1.2104)
 
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
        >>> # diagonal
        >>> torch.einsum('ii->i', torch.randn(4, 4))
        tensor([-0.1034, 0.7952, -0.2433, 0.4545])
 
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
        >>> # outer product
        >>> x = torch.randn(5)
        >>> y = torch.randn(4)
        >>> torch.einsum('i,j->ij', x, y)
@@ -288,8 +288,8 @@ def einsum(*args: Any) -> Tensor:
            [ 0.1713, -0.4291, -0.5802, 0.7350],
            [ 0.5704, -1.4290, -1.9323, 2.4480]])
 
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
        >>> # batch matrix multiplication
        >>> As = torch.randn(3, 2, 5)
        >>> Bs = torch.randn(3, 5, 4)
        >>> torch.einsum('bij,bjk->bik', As, Bs)
@@ -302,8 +302,8 @@ def einsum(*args: Any) -> Tensor:
            [[ 2.8153, 1.8787, -4.3839, -1.2112],
             [ 0.3728, -2.1131, 0.0921, 0.8305]]])
 
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
        >>> # with sublist format and ellipsis
        >>> torch.einsum(As, [..., 0, 1], Bs, [..., 1, 2], [..., 0, 2])
        tensor([[[-1.0564, -1.5904, 3.2023, 3.1271],
                [-1.6706, -0.8097, -0.8025, -2.1183]],
@@ -1604,6 +1604,7 @@ def chain_matmul(*matrices, out=None):
 
    Example::
 
+        >>> # xdoctest: +SKIP
        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
        >>> a = torch.randn(3, 4)
        >>> b = torch.randn(4, 5)
@@ -144,6 +144,7 @@ class Future(torch._C.Future, Generic[T], metaclass=_PyFutureMeta):
        on those futures independently.
 
    Example::
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_FUTURES)
        >>> def callback(fut):
        ...     print(f"RPC return value is {fut.wait()}.")
        >>> fut = torch.futures.Future()
@@ -191,8 +192,9 @@ class Future(torch._C.Future, Generic[T], metaclass=_PyFutureMeta):
        for handling completion/waiting on those futures independently.
 
    Example::
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_FUTURES)
        >>> def callback(fut):
-        ...     print(f"This will run after the future has finished.")
+        ...     print("This will run after the future has finished.")
        ...     print(fut.wait())
        >>> fut = torch.futures.Future()
        >>> fut.add_done_callback(callback)
@@ -223,6 +225,7 @@ class Future(torch._C.Future, Generic[T], metaclass=_PyFutureMeta):
        result (object): the result object of this ``Future``.
 
    Example::
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_FUTURES)
        >>> import threading
        >>> import time
        >>> def slow_set_future(fut, value):
@@ -251,6 +254,7 @@ class Future(torch._C.Future, Generic[T], metaclass=_PyFutureMeta):
        result (BaseException): the exception for this ``Future``.
 
    Example::
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_FUTURES)
        >>> fut = torch.futures.Future()
        >>> fut.set_exception(ValueError("foo"))
        >>> fut.wait()
@@ -281,6 +285,7 @@ def collect_all(futures: List[Future]) -> Future[List[Future]]:
        in Futures.
 
    Example::
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_FUTURES)
        >>> fut0 = torch.futures.Future()
        >>> fut1 = torch.futures.Future()
        >>> fut = torch.futures.collect_all([fut0, fut1])
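These Future examples run without any distributed setup; a condensed, runnable version of the callback pattern:

    import torch

    fut = torch.futures.Future()
    fut.add_done_callback(lambda f: print("done:", f.wait()))
    fut.set_result(42)          # fires the callback: prints "done: 42"
    assert fut.value() == 42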
@@ -36,10 +36,11 @@ class Dispatcher(object):
            return self
        return _
 
+
 class VarDispatcher(Dispatcher):
    """ A dispatcher that calls functions with variable names
 
+    >>> # xdoctest: +SKIP
    >>> d = VarDispatcher('d')
    >>> x = var('x')
    >>> @d.register('inc', x)
    ... def f(x):
@@ -58,8 +59,6 @@ class VarDispatcher(Dispatcher):
        return func(**d)
 
 
-
-
 global_namespace = {}  # type: ignore[var-annotated]
 
 
@@ -7,11 +7,11 @@ def unifiable(cls):
    This uses the type and __dict__ or __slots__ attributes to define the
    nature of the term
    See Also:
+        >>> # xdoctest: +SKIP
        >>> class A(object):
        ...     def __init__(self, a, b):
        ...         self.a = a
        ...         self.b = b
-        >>> # xdoctest: +SKIP
        >>> unifiable(A)
        <class 'unification.more.A'>
        >>> x = var('x')
@@ -33,13 +33,13 @@ def unifiable(cls):
 
 def reify_object(o, s):
    """ Reify a Python object with a substitution
+    >>> # xdoctest: +SKIP
    >>> class Foo(object):
    ...     def __init__(self, a, b):
    ...         self.a = a
    ...         self.b = b
    ...     def __str__(self):
    ...         return "Foo(%s, %s)"%(str(self.a), str(self.b))
-    >>> # xdoctest: +SKIP
    >>> x = var('x')
    >>> f = Foo(1, x)
    >>> print(f)
@@ -88,13 +88,13 @@ def _reify(o, s):
 def unify_object(u, v, s):
    """ Unify two Python objects
    Unifies their type and ``__dict__`` attributes
+    >>> # xdoctest: +SKIP
    >>> class Foo(object):
    ...     def __init__(self, a, b):
    ...         self.a = a
    ...         self.b = b
    ...     def __str__(self):
    ...         return "Foo(%s, %s)"%(str(self.a), str(self.b))
-    >>> # xdoctest: +SKIP
    >>> x = var('x')
    >>> f = Foo(1, x)
    >>> g = Foo(1, 2)
@@ -110,6 +110,7 @@ def unify_object(u, v, s):
    else:
        return unify(u.__dict__, v.__dict__, s)
 
+
 @dispatch(slice, slice, dict)
 def _unify(u, v, s):
    """ Unify a Python ``slice`` object """
@@ -13,14 +13,16 @@ def dispatch(*types, **kwargs):
    Collects implementations based on the function name. Ignores namespaces.
    If ambiguous type signatures occur a warning is raised when the function is
    defined suggesting the additional method to break the ambiguity.
-    Examples
-    --------
 
+    Example:
+        >>> # xdoctest: +SKIP
        >>> @dispatch(int)
        ... def f(x):
        ...     return x + 1
        >>> @dispatch(float)
        ... def f(x):
        ...     return x - 1
-        >>> # xdoctest: +SKIP
        >>> f(3)
        4
        >>> f(3.0)
@@ -121,6 +121,7 @@ class Dispatcher(object):
 
    def register(self, *types, **kwargs):
        """ register dispatcher with new implementation
+        >>> # xdoctest: +SKIP
        >>> f = Dispatcher('f')
        >>> @f.register(int)
        ... def inc(x):
@@ -172,6 +173,7 @@ class Dispatcher(object):
 
    def add(self, signature, func):
        """ Add new types/method pair to dispatcher
+        >>> # xdoctest: +SKIP
        >>> D = Dispatcher('add')
        >>> D.add((int, int), lambda x, y: x + y)
        >>> D.add((float, float), lambda x, y: x + y)
@@ -44,6 +44,7 @@ def isvariadic(obj):
        Whether or not `obj` is variadic
    Examples
    --------
+    >>> # xdoctest: +SKIP
    >>> isvariadic(int)
    False
    >>> isvariadic(Variadic[int])
@@ -76,8 +77,8 @@ class Variadic(six.with_metaclass(VariadicSignatureMeta)):
    representing a specific variadic signature.
    Examples
    --------
+    >>> # xdoctest: +SKIP
    >>> Variadic[int]  # any number of int arguments
    <class 'multipledispatch.variadic.Variadic[int]'>
    >>> Variadic[(int, str)]  # any number of one of int or str arguments
    <class 'multipledispatch.variadic.Variadic[(int, str)]'>
@@ -7,6 +7,7 @@ __all__ = ('merge', 'merge_with', 'valmap', 'keymap', 'itemmap',
           'valfilter', 'keyfilter', 'itemfilter',
           'assoc', 'dissoc', 'assoc_in', 'update_in', 'get_in')
 
+
 def _get_factory(f, kwargs):
    factory = kwargs.pop('factory', dict)
    if kwargs:
@@ -336,6 +337,7 @@ def get_in(keys, coll, default=None, no_default=False):
        raise
    return default
 
+
 def getter(index):
    if isinstance(index, list):
        if len(index) == 1:
@@ -348,6 +350,7 @@ def getter(index):
    else:
        return operator.itemgetter(index)
 
+
 def groupby(key, seq):
    """ Group a collection by a key function
 
@@ -383,6 +386,7 @@ def groupby(key, seq):
        rv[k] = v.__self__  # type: ignore[var-annotated, attr-defined]
    return rv
 
+
 def first(seq):
    """ The first element in a sequence
 
@@ -36,8 +36,8 @@ def _toposort(edges):
    edges - a dict of the form {a: {b, c}} where b and c depend on a
    outputs:
    L - an ordered list of nodes that satisfy the dependencies of edges
+    >>> # xdoctest: +SKIP
    >>> _toposort({1: (2, 3), 2: (3, )})
    [1, 2, 3]
    Closely follows the wikipedia page [2]
    [1] Kahn, Arthur B. (1962), "Topological sorting of large networks",
@@ -36,6 +36,7 @@ class Var(object):
 def var():
    return lambda *args: Var(*args)
 
+
 def vars():
    return lambda n: [var() for i in range(n)]
 
@@ -46,6 +47,7 @@ def isvar(v):
 
 isvar
 
+
 @dispatch(object)  # type: ignore[no-redef]
 def isvar(o):
    return not not _glv and hashable(o) and o in _glv
@@ -53,14 +55,17 @@ def isvar(o):
 
 @contextmanager
 def variables(*variables):
-    """ Context manager for logic variables
+    """
+    Context manager for logic variables
+
+    Example:
+        >>> # xdoctest: +SKIP("undefined vars")
        >>> from __future__ import with_statement
        >>> with variables(1):
        ...     print(isvar(1))
        True
        >>> print(isvar(1))
        False
-        >>> # xdoctest: +SKIP("undefined vars")
        >>> # Normal approach
        >>> from unification import unify
        >>> x = var('x')
@@ -388,6 +388,7 @@ def list(github, force_reload=False, skip_validation=False, trust_repo=None):
        list: The available callables entrypoint
 
    Example:
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_HUB)
        >>> entrypoints = torch.hub.list('pytorch/vision', force_reload=True)
    """
    repo_dir = _get_cache_or_reload(github, force_reload, trust_repo, "list", verbose=True,
@@ -440,6 +441,7 @@ def help(github, model, force_reload=False, skip_validation=False, trust_repo=No
 
        Default is ``None`` and will eventually change to ``"check"`` in v1.14.
    Example:
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_HUB)
        >>> print(torch.hub.help('pytorch/vision', 'resnet18', force_reload=True))
    """
    repo_dir = _get_cache_or_reload(github, force_reload, trust_repo, "help", verbose=True,
@@ -519,6 +521,7 @@ def load(repo_or_dir, model, *args, source='github', trust_repo=None, force_relo
        ``*args`` and ``**kwargs``.
 
    Example:
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_HUB)
        >>> # from a github repo
        >>> repo = 'pytorch/vision'
        >>> model = torch.hub.load(repo, 'resnet50', weights='ResNet50_Weights.IMAGENET1K_V1')
@@ -586,6 +589,7 @@ def download_url_to_file(url, dst, hash_prefix=None, progress=True):
            Default: True
 
    Example:
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_HUB)
        >>> # xdoctest: +REQUIRES(POSIX)
        >>> torch.hub.download_url_to_file('https://s3.amazonaws.com/pytorch/models/resnet18-5c106cde.pth', '/tmp/temporary_file')
 
@@ -694,6 +698,7 @@ def load_state_dict_from_url(
        file_name (str, optional): name for the downloaded file. Filename from ``url`` will be used if not set.
 
    Example:
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_HUB)
        >>> state_dict = torch.hub.load_state_dict_from_url('https://s3.amazonaws.com/pytorch/models/resnet18-5c106cde.pth')
 
    """
@@ -14,6 +14,7 @@ _impls: Set[str] = set()
 # prim is reserved by TorchScript interpreter
 _reserved_namespaces = ['prim']
 
+
 class Library:
    """
    A class to create libraries that can be used to register new operators or
@@ -57,6 +58,7 @@ class Library:
            name of the operator as inferred from the schema.
 
        Example::
+            >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_LIBRARY)
            >>> my_lib = Library("foo", "DEF")
            >>> my_lib.define("sum(Tensor self) -> Tensor")
        '''
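The Library example is gated because defining an operator mutates global dispatcher state; run once per process it works as written. A sketch, assuming a torch version where `torch.library.Library` is available (>= 1.13) and using a hypothetical throwaway namespace:

    import torch
    from torch.library import Library

    my_lib = Library("my_sketch_ns", "DEF")   # hypothetical namespace
    my_lib.define("sum(Tensor self) -> Tensor")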
@@ -105,7 +107,6 @@ class Library:
                            "'s behavior for {} dispatch key and {} namespace.".
                            format(name.split("::")[-1], dispatch_key, self.ns))
 
-
        if dispatch_key == "Meta":
            dispatcher_op_name = name
            if '::' not in dispatcher_op_name:
@@ -135,6 +136,7 @@ class Library:
        _impls.remove(key)
        del self.m
 
+
 # decorator to register python functions for library ops
 # Note: this decorator API should remain consistent with `Library.impl` API
 def impl(lib, name, dispatch_key=""):
@ -143,6 +145,7 @@ def impl(lib, name, dispatch_key=""):
|
|||
return f
|
||||
return wrap
|
||||
|
||||
|
||||
def define(lib, schema, alias_analysis=""):
|
||||
def wrap(f):
|
||||
name = lib.define(schema, alias_analysis)
|
||||
|
|
|
|||
|
|
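Putting the gated ``Library`` example together, the intended flow is roughly the following; the ``foo`` namespace and the CPU kernel are hypothetical, and this sketch assumes the ``Library.define``/``Library.impl`` API shown in the hunks above:

import torch
from torch.library import Library

my_lib = Library("foo", "DEF")  # "DEF" permits defining new schemas in "foo"
my_lib.define("sum(Tensor self) -> Tensor")

def sum_impl(self):
    # Hypothetical CPU kernel for the schema defined above.
    return self.flatten().sum()

my_lib.impl("sum", sum_impl, "CPU")
out = torch.ops.foo.sum(torch.ones(3))  # dispatches to sum_impl
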
@ -8,6 +8,7 @@ if TYPE_CHECKING:
STAT_EVENT = "torch.monitor.Stat"

class TensorboardEventHandler:
"""
TensorboardEventHandler is an event handler that will write known events to

@ -16,6 +17,8 @@ class TensorboardEventHandler:
This currently only supports ``torch.monitor.Stat`` events which are logged
as scalars.

Example:
>>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_MONITOR)
>>> # xdoctest: +REQUIRES(module:tensorboard)
>>> from torch.utils.tensorboard import SummaryWriter
>>> from torch.monitor import TensorboardEventHandler, register_event_handler

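The hunk cuts the example off after the imports; it continues by wiring the handler up, roughly as below (a sketch assuming the ``register_event_handler`` API imported above, with a hypothetical log directory):

>>> writer = SummaryWriter("log_dir")
>>> register_event_handler(TensorboardEventHandler(writer))
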
@ -14,6 +14,7 @@ __all__ = ['Threshold', 'ReLU', 'RReLU', 'Hardtanh', 'ReLU6', 'Sigmoid', 'Hardsi
'LogSigmoid', 'Softplus', 'Softshrink', 'MultiheadAttention', 'PReLU', 'Softsign', 'Tanhshrink',
'Softmin', 'Softmax', 'Softmax2d', 'LogSoftmax']

class Threshold(Module):
r"""Thresholds each element of the input Tensor.

@ -625,6 +625,7 @@ class SyncBatchNorm(_BatchNorm):
Examples::

>>> # xdoctest: +SKIP
>>> # With Learnable Parameters
>>> m = nn.SyncBatchNorm(100)
>>> # creating process group (optional)

@ -634,7 +635,6 @@ class SyncBatchNorm(_BatchNorm):
>>> # Note: every rank calls into new_group for every
>>> # process group created, even if that rank is not
>>> # part of the group.
>>> # xdoctest: +SKIP
>>> process_groups = [torch.distributed.new_group(pids) for pids in [r1, r2]]
>>> process_group = process_groups[0 if dist.get_rank() <= 3 else 1]
>>> # Without Learnable Parameters

@ -23,10 +23,12 @@ _rnn_impls = {
def _apply_permutation(tensor: Tensor, permutation: Tensor, dim: int = 1) -> Tensor:
return tensor.index_select(dim, permutation)

def apply_permutation(tensor: Tensor, permutation: Tensor, dim: int = 1) -> Tensor:
warnings.warn("apply_permutation is deprecated, please use tensor.index_select(dim, permutation) instead")
return _apply_permutation(tensor, permutation, dim)

class RNNBase(Module):
__constants__ = ['mode', 'input_size', 'hidden_size', 'num_layers', 'bias',
'batch_first', 'dropout', 'bidirectional', 'proj_size']

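For callers, the deprecation above is a one-line migration; a sketch with hypothetical tensors:

import torch

weights = torch.randn(4, 3)
perm = torch.tensor([2, 0, 1])

# Deprecated: apply_permutation(weights, perm, dim=0) now warns.
# Preferred replacement, as the warning text suggests:
out = weights.index_select(0, perm)
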
@ -4,6 +4,7 @@ from typing import List, Dict, Any
__all__ = ['consume_prefix_in_state_dict_if_present']

def _ntuple(n, name="parse"):
def parse(x):
if isinstance(x, collections.abc.Iterable):

@ -1375,6 +1375,7 @@ class DistributedDataParallel(Module, Joinable):
Example::

>>> # xdoctest: +SKIP("Distributed")
>>> import torch
>>> import torch.distributed as dist
>>> import os

@ -1548,18 +1549,18 @@ class DistributedDataParallel(Module, Joinable):
Example::
Below is an example of a noop hook that returns the same tensor.

>>> # xdoctest: +SKIP('undefined name')
>>> def noop(state: object, bucket: dist.GradBucket) -> torch.futures.Future[torch.Tensor]:
>>> fut = torch.futures.Future()
>>> fut.set_result(bucket.buffer())
>>> return fut

>>> # xdoctest: +SKIP('undefined name')
>>> ddp.register_comm_hook(state=None, hook=noop)

Example::
Below is an example of a Parallel SGD algorithm where gradients are encoded before
allreduce, and then decoded after allreduce.

>>> # xdoctest: +SKIP('undefined name')
>>> def encode_and_decode(state: object, bucket: dist.GradBucket) -> torch.futures.Future[torch.Tensor]:
>>> encoded_tensor = encode(bucket.buffer()) # encode gradients
>>> fut = torch.distributed.all_reduce(encoded_tensor).get_future()

@ -1568,8 +1569,6 @@ class DistributedDataParallel(Module, Joinable):
>>> decoded_tensor = decode(fut.value()[0]) # decode gradients
>>> return decoded_tensor
>>> return fut.then(decode)

>>> # xdoctest: +SKIP('undefined name')
>>> ddp.register_comm_hook(state=None, hook=encode_and_decode)
"""
self._check_comm_hook(hook)

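The ``+SKIP('undefined name')`` directives exist because these snippets reference names (``ddp``, ``encode``, ``decode``) that the docstring never defines. A self-contained sketch of the noop hook, assuming an initialized process group and an already-wrapped ``ddp`` model:

import torch

def noop(state, bucket):
    # bucket.buffer() is the flattened gradient tensor for this bucket;
    # returning it unchanged makes this hook a no-op allreduce replacement.
    fut = torch.futures.Future()
    fut.set_result(bucket.buffer())
    return fut

# ddp.register_comm_hook(state=None, hook=noop)  # `ddp` assumed from context
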
@ -9,6 +9,7 @@ from .expanded_weights_utils import \
THRESHOLD = 32

def conv_picker(func, conv1dOpt, conv2dOpt, conv3dOpt):
if func == F.conv1d:
return conv1dOpt

@ -18,6 +19,7 @@ def conv_picker(func, conv1dOpt, conv2dOpt, conv3dOpt):
assert func == F.conv3d
return conv3dOpt

def conv_args_and_kwargs(kwarg_names, expanded_args_and_kwargs):
args = expanded_args_and_kwargs[:len(expanded_args_and_kwargs) - len(kwarg_names)]
kwargs = expanded_args_and_kwargs[len(expanded_args_and_kwargs) - len(kwarg_names):]

@ -25,6 +27,7 @@ def conv_args_and_kwargs(kwarg_names, expanded_args_and_kwargs):
return conv_normalizer(*args, **kwargs)

def conv_normalizer(input, weight, bias=None, stride=1, padding=0, dilation=1, groups=1):
return (input, weight), {'bias': bias, 'stride': stride, 'padding': padding, 'dilation': dilation, 'groups': groups}

@ -124,6 +127,7 @@ def conv_backward(func, ctx, grad_output):
set_grad_sample_if_exists(ctx.bias, lambda _: grad_output.reshape(*grad_output.shape[:2], -1).sum(dim=2))
return tuple(results)

def conv_unfold_weight_grad_sample(input, grad_output, weight_shape, kernel_size, stride, padding, dilation, groups, func):
n = input.shape[0]
in_channels = input.shape[1]

@ -158,6 +162,7 @@ def conv_unfold_weight_grad_sample(input, grad_output, weight_shape, kernel_size
weight_grad_sample = weight_grad_sample.view(shape)
return weight_grad_sample

def conv_group_weight_grad_sample(input, grad_output, weight_shape, stride, padding, dilation, batch_size, func):
I = input.shape[1]
O = grad_output.shape[1]

@ -195,9 +200,9 @@ def unfold3d(
A tensor of shape ``(B, C * np.product(kernel_size), L)``, where L - output spatial dimensions.
See :class:`torch.nn.Unfold` for more details
Example:
>>> # xdoctest: +SKIP
>>> B, C, D, H, W = 3, 4, 5, 6, 7
>>> tensor = torch.arange(1, B * C * D * H * W + 1.).view(B, C, D, H, W)
>>> # xdoctest: +SKIP
>>> unfold3d(tensor, kernel_size=2, padding=0, stride=1).shape
torch.Size([3, 32, 120])
"""

@ -6,6 +6,7 @@ from torch.nn.utils._expanded_weights.expanded_weights_impl import ExpandedWeigh
from torch.utils._pytree import tree_flatten

# dependency on `functional_call` means that this can't be exposed in utils
# without creating circular dependency
def call_for_per_sample_grads(module, *, batch_size=None, loss_reduction="sum"):

@ -28,17 +29,17 @@ def call_for_per_sample_grads(module, *, batch_size=None, loss_reduction="sum"):
running mean across a batch. Must be "mean" or "sum". Default: "sum"

Examples::
>>> # xdoctest: +SKIP
>>> model = nn.Linear(4, 3)
>>> batched_input = torch.randn(5, 4) # batch size of 5
>>> # xdoctest: +SKIP
>>> res = call_for_per_sample_grads(model)(batched_input).sum()
>>> res.backward()
>>> assert model.weight.shape == (3, 4)
>>> assert model.weight.grad_sample.shape == (5, 3, 4)
>>> assert model.weight.grad == None
>>> assert model.weight.grad is None
>>> assert model.bias.shape == (3,)
>>> assert model.bias.grad_sample.shape == (5, 3)
>>> assert model.bias.grad == None
>>> assert model.bias.grad is None

An example using "mean" loss reduction. The grad_sample fields will be scaled by batch_size from what they would be
if we ran the same code with loss_reduction="sum". This is because the mean at the end will scale all

@ -28,8 +28,8 @@ def skip_init(module_cls, *args, **kwargs):
Example::

>>> import torch
>>> # xdoctest: +IGNORE_WANT("non-deterministic")
>>> import torch
>>> m = torch.nn.utils.skip_init(torch.nn.Linear, 5, 1)
>>> m.weight
Parameter containing:

@ -1,5 +1,6 @@
import torch

def convert_conv2d_weight_memory_format(module, memory_format):
r"""Convert ``memory_format`` of ``nn.Conv2d.weight`` to ``memory_format``
The conversion recursively applies to nested ``nn.Module``, including ``module``.

@ -50,6 +51,7 @@ def convert_conv2d_weight_memory_format(module, memory_format):
The original module with updated ``nn.Conv2d``

Example:
>>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA)
>>> # xdoctest: +REQUIRES(env:CUBLAS_WORKSPACE_CONFIG)
>>> input = torch.randint(1, 10, (2, 8, 4, 4), dtype=torch.float16, device="cuda")
>>> model = nn.Sequential(

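The hunk truncates the example after ``nn.Sequential(``; the flow it demonstrates continues roughly as in the sketch below (CUDA availability and the module shapes are assumptions):

>>> model = nn.Sequential(
...     nn.Conv2d(8, 4, 3)).cuda().half()
>>> model = convert_conv2d_weight_memory_format(model, torch.channels_last)
>>> out = model(input)
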
@ -1002,7 +1002,7 @@ def ln_structured(module, name, amount, n, dim, importance_scores=None):
module (nn.Module): modified (i.e. pruned) version of the input module

Examples:
>>> # xdoctest: +SKIP
>>> from torch.nn.utils import prune
>>> m = prune.ln_structured(
... nn.Conv2d(5, 3, 2), 'weight', amount=0.3, dim=1, n=float('-inf')
... )

@ -1055,7 +1055,8 @@ def global_unstructured(parameters, pruning_method, importance_scores=None, **kw
scope of global pruning to unstructured methods.

Examples:
>>> # xdoctest: +SKIP
>>> from torch.nn.utils import prune
>>> from collections import OrderedDict
>>> net = nn.Sequential(OrderedDict([
... ('first', nn.Linear(10, 4)),
... ('second', nn.Linear(4, 1)),

@ -1070,7 +1071,7 @@ def global_unstructured(parameters, pruning_method, importance_scores=None, **kw
... amount=10,
... )
>>> print(sum(torch.nn.utils.parameters_to_vector(net.buffers()) == 0))
tensor(10, dtype=torch.uint8)
tensor(10)

"""
# ensure parameters is a list or generator of tuples

@ -1156,7 +1157,7 @@ def custom_from_mask(module, name, mask):
module (nn.Module): modified (i.e. pruned) version of the input module

Examples:
>>> # xdoctest: +SKIP
>>> from torch.nn.utils import prune
>>> m = prune.custom_from_mask(
... nn.Linear(5, 3), name='bias', mask=torch.tensor([0, 1, 0])
... )

@ -1211,8 +1212,8 @@ def is_pruned(module):
binary answer to whether ``module`` is pruned.

Examples:
>>> from torch.nn.utils import prune
>>> m = nn.Linear(5, 7)
>>> # xdoctest: +SKIP
>>> print(prune.is_pruned(m))
False
>>> prune.random_unstructured(m, name='weight', amount=0.2)

@ -20,6 +20,7 @@ PackedSequence_.__annotations__ = {'data': torch.Tensor, 'batch_sizes': torch.Te
'sorted_indices': Optional[torch.Tensor],
'unsorted_indices': Optional[torch.Tensor]}

def bind(optional, fn):
if optional is None:
return None

@ -18,7 +18,8 @@ class DiagnosticEngine:
Examples:
Step 1: Create a set of rules.
>>> rules = infra.RuleCollection.from_list(
>>> # xdoctest: +REQUIRES(module:torch._C._distributed_c10d)
>>> rules = infra.RuleCollection.custom_collection_from_list(
... "CustomRuleCollection",
... [
... infra.Rule(

@ -34,6 +35,7 @@ class DiagnosticEngine:
Step 3: Start a new diagnostic context.
>>> with engine.create_diagnostic_context("torch.onnx.export", version="1.0") as context:
... ...

Step 4: Add diagnostics in your code.
... context.diagnose(rules.rule1, infra.Level.ERROR)

@ -63,6 +63,8 @@ class JitScalarType(enum.IntEnum):
Use ``JitScalarType`` to convert from torch and JIT scalar types to ONNX scalar types.

Examples:
>>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_ONNX)
>>> # xdoctest: +IGNORE_WANT("win32 has different output")
>>> JitScalarType.from_value(torch.ones(1, 2)).onnx_type()
TensorProtoDataType.FLOAT

@ -22,6 +22,7 @@ EPOCH_DEPRECATION_WARNING = (
"https://github.com/pytorch/pytorch/issues/new/choose."
)

class LRScheduler(object):

def __init__(self, optimizer, last_epoch=-1, verbose=False):

@ -196,10 +197,10 @@ class LambdaLR(LRScheduler):
each update. Default: ``False``.

Example:
>>> # xdoctest: +SKIP
>>> # Assuming optimizer has two groups.
>>> lambda1 = lambda epoch: epoch // 30
>>> lambda2 = lambda epoch: 0.95 ** epoch
>>> # xdoctest: +SKIP
>>> scheduler = LambdaLR(optimizer, lr_lambda=[lambda1, lambda2])
>>> for epoch in range(100):
>>> train(...)

@ -282,8 +283,8 @@ class MultiplicativeLR(LRScheduler):
each update. Default: ``False``.

Example:
>>> lmbda = lambda epoch: 0.95
>>> # xdoctest: +SKIP
>>> lmbda = lambda epoch: 0.95
>>> scheduler = MultiplicativeLR(optimizer, lr_lambda=lmbda)
>>> for epoch in range(100):
>>> train(...)

@ -365,12 +366,12 @@ class StepLR(LRScheduler):
each update. Default: ``False``.

Example:
>>> # xdoctest: +SKIP
>>> # Assuming optimizer uses lr = 0.05 for all groups
>>> # lr = 0.05 if epoch < 30
>>> # lr = 0.005 if 30 <= epoch < 60
>>> # lr = 0.0005 if 60 <= epoch < 90
>>> # ...
>>> # xdoctest: +SKIP
>>> scheduler = StepLR(optimizer, step_size=30, gamma=0.1)
>>> for epoch in range(100):
>>> train(...)

@ -414,11 +415,11 @@ class MultiStepLR(LRScheduler):
each update. Default: ``False``.

Example:
>>> # xdoctest: +SKIP
>>> # Assuming optimizer uses lr = 0.05 for all groups
>>> # lr = 0.05 if epoch < 30
>>> # lr = 0.005 if 30 <= epoch < 80
>>> # lr = 0.0005 if epoch >= 80
>>> # xdoctest: +SKIP
>>> scheduler = MultiStepLR(optimizer, milestones=[30,80], gamma=0.1)
>>> for epoch in range(100):
>>> train(...)

@ -463,13 +464,13 @@ class ConstantLR(LRScheduler):
each update. Default: ``False``.

Example:
>>> # xdoctest: +SKIP
>>> # Assuming optimizer uses lr = 0.05 for all groups
>>> # lr = 0.025 if epoch == 0
>>> # lr = 0.025 if epoch == 1
>>> # lr = 0.025 if epoch == 2
>>> # lr = 0.025 if epoch == 3
>>> # lr = 0.05 if epoch >= 4
>>> # xdoctest: +SKIP
>>> scheduler = ConstantLR(self.opt, factor=0.5, total_iters=4)
>>> for epoch in range(100):
>>> train(...)

@ -525,13 +526,13 @@ class LinearLR(LRScheduler):
each update. Default: ``False``.

Example:
>>> # xdoctest: +SKIP
>>> # Assuming optimizer uses lr = 0.05 for all groups
>>> # lr = 0.025 if epoch == 0
>>> # lr = 0.03125 if epoch == 1
>>> # lr = 0.0375 if epoch == 2
>>> # lr = 0.04375 if epoch == 3
>>> # lr = 0.05 if epoch >= 4
>>> # xdoctest: +SKIP
>>> scheduler = LinearLR(self.opt, start_factor=0.5, total_iters=4)
>>> for epoch in range(100):
>>> train(...)

@ -617,13 +618,13 @@ class SequentialLR(LRScheduler):
verbose (bool): Does nothing.

Example:
>>> # xdoctest: +SKIP
>>> # Assuming optimizer uses lr = 1. for all groups
>>> # lr = 0.1 if epoch == 0
>>> # lr = 0.1 if epoch == 1
>>> # lr = 0.9 if epoch == 2
>>> # lr = 0.81 if epoch == 3
>>> # lr = 0.729 if epoch == 4
>>> # xdoctest: +SKIP
>>> scheduler1 = ConstantLR(self.opt, factor=0.1, total_iters=2)
>>> scheduler2 = ExponentialLR(self.opt, gamma=0.9)
>>> scheduler = SequentialLR(self.opt, schedulers=[scheduler1, scheduler2], milestones=[2])

@ -670,7 +671,6 @@ class SequentialLR(LRScheduler):
self._last_lr = schedulers[0].get_last_lr()

def step(self):
self.last_epoch += 1
idx = bisect_right(self._milestones, self.last_epoch)

@ -726,13 +726,13 @@ class PolynomialLR(LRScheduler):
each update. Default: ``False``.

Example:
>>> # xdoctest: +SKIP("undefined vars")
>>> # Assuming optimizer uses lr = 0.001 for all groups
>>> # lr = 0.001 if epoch == 0
>>> # lr = 0.00075 if epoch == 1
>>> # lr = 0.00050 if epoch == 2
>>> # lr = 0.00025 if epoch == 3
>>> # lr = 0.0 if epoch >= 4
>>> # xdoctest: +SKIP("undefined vars")
>>> scheduler = PolynomialLR(self.opt, total_iters=4, power=1.0)
>>> for epoch in range(100):
>>> train(...)

@ -846,13 +846,13 @@ class ChainedScheduler(LRScheduler):
schedulers (list): List of chained schedulers.

Example:
>>> # xdoctest: +SKIP
>>> # Assuming optimizer uses lr = 1. for all groups
>>> # lr = 0.09 if epoch == 0
>>> # lr = 0.081 if epoch == 1
>>> # lr = 0.729 if epoch == 2
>>> # lr = 0.6561 if epoch == 3
>>> # lr = 0.59049 if epoch >= 4
>>> # xdoctest: +SKIP
>>> scheduler1 = ConstantLR(self.opt, factor=0.1, total_iters=2)
>>> scheduler2 = ExponentialLR(self.opt, gamma=0.9)
>>> scheduler = ChainedScheduler([scheduler1, scheduler2])

@ -1544,8 +1544,8 @@ class OneCycleLR(LRScheduler):
each update. Default: ``False``.

Example:
>>> data_loader = torch.utils.data.DataLoader(...)
>>> # xdoctest: +SKIP
>>> data_loader = torch.utils.data.DataLoader(...)
>>> optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
>>> scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.01, steps_per_epoch=len(data_loader), epochs=10)
>>> for epoch in range(10):

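Every scheduler example above ends in the same skipped ``train(...)`` loop. For reference, a minimal runnable version of that pattern might look like the sketch below; the model, data, and hyperparameters are hypothetical:

import torch

model = torch.nn.Linear(10, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.05)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)

for epoch in range(100):
    optimizer.zero_grad()
    loss = model(torch.randn(8, 10)).sum()  # stand-in for a real training loss
    loss.backward()
    optimizer.step()
    scheduler.step()  # advance the schedule once per epoch
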
@ -9,6 +9,7 @@ from torch.optim.lr_scheduler import LRScheduler
__all__ = ['AveragedModel', 'update_bn', 'SWALR']

class AveragedModel(Module):
r"""Implements averaged model for Stochastic Weight Averaging (SWA).

@ -49,6 +49,7 @@ __all__ = [
'StorageType',
]

class SourceChangeWarning(Warning):
pass

@ -186,10 +187,12 @@ def _cuda_deserialize(obj, location):
else:
return obj.cuda(device)

def _mps_deserialize(obj, location):
if location == 'mps':
return obj.mps()

def _meta_deserialize(obj, location):
if location == 'meta':
return torch.UntypedStorage(obj.nbytes(), device='meta')

@ -356,6 +359,7 @@ def _check_seekable(f) -> bool:
raise_err_msg(["seek", "tell"], e)
return False

def _check_dill_version(pickle_module) -> None:
'''Checks if using dill as the pickle module, and if so, checks if it is the correct version.
If dill version is lower than 0.3.1, a ValueError is raised.

@ -375,12 +379,14 @@ def _check_dill_version(pickle_module) -> None:
pickle_module.__version__
))

def _check_save_filelike(f):
if not isinstance(f, (str, os.PathLike)) and not hasattr(f, 'write'):
raise AttributeError((
"expected 'f' to be string, path, or a file-like object with "
"a 'write' attribute"))

def save(
obj: object,
f: FILE_LIKE,

@ -420,6 +426,7 @@ def save(
to use the old format, pass the kwarg ``_use_new_zipfile_serialization=False``.

Example:
>>> # xdoctest: +SKIP("makes cwd dirty")
>>> # Save to file
>>> x = torch.tensor([0, 1, 2, 3, 4])
>>> torch.save(x, 'tensor.pt')

@ -1087,6 +1094,7 @@ def _get_restore_location(map_location):
return result
return restore_location

class StorageType():
def __init__(self, name):
self.dtype = _get_dtype_from_pickle_storage_type(name)

@ -1094,6 +1102,7 @@ class StorageType():
def __str__(self):
return f'StorageType(dtype={self.dtype})'

def _load(zip_file, map_location, pickle_module, pickle_file='data.pkl', **pickle_load_args):
restore_location = _get_restore_location(map_location)

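With the new ``_mps_deserialize`` hook above, restoring a checkpoint onto an Apple-silicon device becomes a ``map_location`` choice on the user side. A sketch, assuming MPS is available and using a hypothetical scratch file:

import torch

x = torch.tensor([0, 1, 2, 3, 4])
torch.save(x, 'tensor.pt')  # hypothetical scratch file

# 'mps' becomes a valid restore location via the hook registered above.
y = torch.load('tensor.pt', map_location='mps')
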
@ -90,6 +90,8 @@ def make_tensor(
TypeError: If :attr:`dtype` isn't supported by this function.

Examples:
>>> # xdoctest: +SKIP
>>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA)
>>> from torch.testing import make_tensor
>>> # Creates a float tensor with values in [-1, 1)
>>> make_tensor((3,), device='cpu', dtype=torch.float32, low=-1, high=1)

@ -54,6 +54,7 @@ def skip_unless_torch_gpu(method: T) -> T:
"""
Test decorator which skips the test unless there's a GPU available to torch.

>>> # xdoctest: +SKIP
>>> @skip_unless_torch_gpu
>>> def test_some_method(self) -> None:
>>> ...

@ -22,6 +22,7 @@ def rename_privateuse1_backend(backend_name: str) -> None:
Example::

>>> # xdoctest: +SKIP("failing")
>>> torch.register_privateuse1_backend("foo")
# This will work, assuming that you've implemented the right C++ kernels
# to implement torch.ones.

@ -912,6 +912,7 @@ def CppExtension(name, sources, *args, **kwargs):

Example:
>>> # xdoctest: +SKIP
>>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CPP_EXT)
>>> from setuptools import setup
>>> from torch.utils.cpp_extension import BuildExtension, CppExtension
>>> setup(

@ -959,6 +960,7 @@ def CUDAExtension(name, sources, *args, **kwargs):

Example:
>>> # xdoctest: +SKIP
>>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CPP_EXT)
>>> from setuptools import setup
>>> from torch.utils.cpp_extension import BuildExtension, CUDAExtension
>>> setup(

@ -1006,14 +1008,12 @@ def CUDAExtension(name, sources, *args, **kwargs):
To workaround the issue, move python binding logic to pure C++ file.

Example use:
>>> # xdoctest: +SKIP
>>> #include <ATen/ATen.h>
>>> at::Tensor SigmoidAlphaBlendForwardCuda(....)
#include <ATen/ATen.h>
at::Tensor SigmoidAlphaBlendForwardCuda(....)

Instead of:
>>> # xdoctest: +SKIP
>>> #include <torch/extension.h>
>>> torch::Tensor SigmoidAlphaBlendForwardCuda(...)
#include <torch/extension.h>
torch::Tensor SigmoidAlphaBlendForwardCuda(...)

Currently open issue for nvcc bug: https://github.com/pytorch/pytorch/issues/69460
Complete workaround code example: https://github.com/facebookresearch/pytorch3d/commit/cb170ac024a949f1f9614ffe6af1c38d972f7d48

@ -1037,6 +1037,7 @@ def CUDAExtension(name, sources, *args, **kwargs):

Example:
>>> # xdoctest: +SKIP
>>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CPP_EXT)
>>> CUDAExtension(
... name='cuda_extension',
... sources=['extension.cpp', 'extension_kernel.cu'],

@ -1362,6 +1363,7 @@ def load_inline(name,
causes issues.

Example:
>>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CPP_EXT)
>>> from torch.utils.cpp_extension import load_inline
>>> source = """
at::Tensor sin_add(at::Tensor x, at::Tensor y) {

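The ``load_inline`` example is cut off mid-source; its intended continuation is roughly the sketch below, assuming a working compiler toolchain (in CI this is gated by ``TORCH_DOCTEST_CPP_EXT``):

>>> source = """
at::Tensor sin_add(at::Tensor x, at::Tensor y) {
  return x.sin() + y.sin();
}
"""
>>> module = load_inline(name='inline_extension',
...                      cpp_sources=[source],
...                      functions=['sin_add'])
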
@ -33,11 +33,11 @@ def default_convert(data):
data: a single data point to be converted

Examples:
>>> # xdoctest: +SKIP
>>> # Example with `int`
>>> default_convert(0)
0
>>> # Example with NumPy array
>>> # xdoctest: +SKIP
>>> default_convert(np.array([0, 1]))
tensor([0, 1])
>>> # Example with NamedTuple

@ -228,6 +228,7 @@ def default_collate(batch):
batch: a single batch to be collated

Examples:
>>> # xdoctest: +SKIP
>>> # Example with a batch of `int`s:
>>> default_collate([0, 1, 2, 3])
tensor([0, 1, 2, 3])

@ -238,7 +239,6 @@ def default_collate(batch):
>>> default_collate([{'A': 0, 'B': 1}, {'A': 100, 'B': 100}])
{'A': tensor([ 0, 100]), 'B': tensor([ 1, 100])}
>>> # Example with `NamedTuple` inside the batch:
>>> # xdoctest: +SKIP
>>> Point = namedtuple('Point', ['x', 'y'])
>>> default_collate([Point(0, 0), Point(1, 1)])
Point(x=tensor([0, 1]), y=tensor([0, 1]))

@ -183,7 +183,9 @@ class CollatorIterDataPipe(MapperIterDataPipe):
collate_fn: Customized collate function to collect and combine data or a batch of data.
Default function collates to Tensor(s) based on data type.

Example: Convert integer data to float Tensor
Example:
>>> # xdoctest: +SKIP
>>> # Convert integer data to float Tensor
>>> class MyIterDataPipe(torch.utils.data.IterDataPipe):
... def __init__(self, start, end):
... super(MyIterDataPipe).__init__()

@ -203,7 +205,6 @@ class CollatorIterDataPipe(MapperIterDataPipe):
>>> def collate_fn(batch):
... return torch.tensor(batch, dtype=torch.float)
...
>>> # xdoctest: +SKIP
>>> collated_ds = CollateIterDataPipe(ds, collate_fn=collate_fn)
>>> print(list(collated_ds))
[tensor(3.), tensor(4.), tensor(5.), tensor(6.)]

@ -30,6 +30,7 @@ def validate_input_col(fn: Callable, input_col: Optional[Union[int, tuple, list]
keyword-only arguments.

Examples:
>>> # xdoctest: +SKIP("Failing on some CI machines")
>>> def f(a, b, *, c=1):
>>> return a + b + c
>>> def f_def(a, b=1, *, c=1):

@ -117,6 +118,7 @@ def _is_local_fn(fn):
return "<locals>" in fn_type.__qualname__
return False

def _check_unpickable_fn(fn: Callable):
"""
Checks function is pickable or not. If it is a lambda or local function, a UserWarning

@ -81,6 +81,8 @@ class IterableDataset(Dataset[T_co]):
Example 1: splitting workload across all workers in :meth:`__iter__`::

>>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_DATALOADER)
>>> # xdoctest: +SKIP("Fails on MacOS12")
>>> class MyIterableDataset(torch.utils.data.IterableDataset):
... def __init__(self, start, end):
... super(MyIterableDataset).__init__()

@ -122,6 +124,7 @@ class IterableDataset(Dataset[T_co]):
Example 2: splitting workload across all workers using :attr:`worker_init_fn`::

>>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_DATALOADER)
>>> class MyIterableDataset(torch.utils.data.IterableDataset):
... def __init__(self, start, end):
... super(MyIterableDataset).__init__()

@ -313,9 +316,12 @@ def random_split(dataset: Dataset[T], lengths: Sequence[Union[int, float]],
Optionally fix the generator for reproducible results, e.g.:

>>> random_split(range(10), [3, 7], generator=torch.Generator().manual_seed(42))
>>> random_split(range(30), [0.3, 0.3, 0.4], generator=torch.Generator(
... ).manual_seed(42))
Example:
>>> # xdoctest: +SKIP
>>> generator1 = torch.Generator().manual_seed(42)
>>> generator2 = torch.Generator().manual_seed(42)
>>> random_split(range(10), [3, 7], generator=generator1)
>>> random_split(range(30), [0.3, 0.3, 0.4], generator=generator2)

Args:
dataset (Dataset): Dataset to be split

@ -53,6 +53,7 @@ __all__ = ['InputError', 'openf', 'bcolors', 'GeneratedFileCleaner', 'match_exte
'is_caffe2_gpu_file', 'Trie', 'preprocessor', 'file_specific_replacement', 'file_add_header',
'fix_static_global_kernels', 'extract_arguments', 'str2bool', 'hipify']

class InputError(Exception):
# Exception raised for errors in the input.

@ -79,6 +80,7 @@ class bcolors:
BOLD = '\033[1m'
UNDERLINE = '\033[4m'

# To the programmer, the output of hipify most likely are intermediates.
# This class allows users of hipify to ask for a cleanup by running the
# hipify and compilation in a with instantiating this context manager class

@ -119,13 +121,16 @@ class GeneratedFileCleaner:
for d in self.dirs_to_clean[::-1]:
os.rmdir(d)

def match_extensions(filename: str, extensions: Iterable) -> bool:
"""Helper method to see if filename ends with certain extension"""
return any(filename.endswith(e) for e in extensions)

def _fnmatch(filepath, patterns):
return any(fnmatch.fnmatch(filepath, pattern) for pattern in patterns)

def matched_files_iter(
root_path: str,
includes: Iterable = (),

@ -407,10 +412,8 @@ def find_closure_group(input_string, start, group):
find_closure_group returns the positions of group[0] and group[1] as a tuple.

Example:
find_closure_group("(hi)", 0, ["(", ")"])

Returns:
0, 3
>>> find_closure_group("(hi)", 0, ["(", ")"])
(0, 3)
"""

inside_parenthesis = False

@ -522,7 +525,7 @@ def get_hip_file_path(rel_filepath, is_pytorch_extension=False):
"""
# At the moment, some PyTorch source files are HIPified in place. The predicate
# is_out_of_place tells us if this is the case or not.
assert(not os.path.isabs(rel_filepath))
assert not os.path.isabs(rel_filepath)
if not is_pytorch_extension and not is_out_of_place(rel_filepath):
return rel_filepath

@ -589,7 +592,7 @@ def get_hip_file_path(rel_filepath, is_pytorch_extension=False):

def is_out_of_place(rel_filepath):
assert(not os.path.isabs(rel_filepath))
assert not os.path.isabs(rel_filepath)
if rel_filepath.startswith("torch/"):
return False
if rel_filepath.startswith("tools/autograd/templates/"):

@ -599,7 +602,7 @@ def is_out_of_place(rel_filepath):

# Keep this synchronized with includes/ignores in build_amd.py
def is_pytorch_file(rel_filepath):
assert(not os.path.isabs(rel_filepath))
assert not os.path.isabs(rel_filepath)
if rel_filepath.startswith("aten/"):
if rel_filepath.startswith("aten/src/ATen/core/"):
return False

@ -616,8 +619,9 @@ def is_cusparse_file(rel_filepath):
return "sparse" in rel_filepath.lower()
return False

def is_caffe2_gpu_file(rel_filepath):
assert(not os.path.isabs(rel_filepath))
assert not os.path.isabs(rel_filepath)
if rel_filepath.startswith("c10/cuda"):
return True
filename = os.path.basename(rel_filepath)

|
|||
"skipped" if an identical hipified file already existed or hipified file couldn't be written out
|
||||
"ignored" if the source file was a hipified file itself or not meant to be hipified
|
||||
"""
|
||||
|
||||
|
||||
def preprocessor(
|
||||
output_directory: str,
|
||||
filepath: str,
|
||||
|
|
@ -885,6 +891,7 @@ def preprocessor(
|
|||
else:
|
||||
return {"hipified_path": fout_path, "status": "[skipped, already hipified]"}
|
||||
|
||||
|
||||
def file_specific_replacement(filepath, search_string, replace_string, strict=False):
|
||||
with openf(filepath, "r+") as f:
|
||||
contents = f.read()
|
||||
|
|