From 3e46d6c9e4b8692e087b4dd1af232f98b2e6e45e Mon Sep 17 00:00:00 2001 From: Luca Wehrstedt Date: Tue, 11 May 2021 08:25:25 -0700 Subject: [PATCH] Update docs to mention CUDA support for Future (#50048) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/50048 To reflect the many changes introduced recently. In my mind, CUDAFuture should be considered a "private" subclass, which in practice should always be returned as a downcast pointer to an ivalue::Future. Hence, we should document the CUDA behavior in the superclass, even if it's CUDA-agnostic, since that's the interface the users will see also for CUDA-enabled futures. ghstack-source-id: 128640983 Test Plan: Built locally and looked at them. Reviewed By: mrshenli Differential Revision: D25757474 fbshipit-source-id: c6f66ba88fa6c4fc33601f31136422d6cf147203 --- docs/source/futures.rst | 4 -- torch/_C/__init__.pyi.in | 1 + torch/futures/__init__.py | 104 ++++++++++++++++++++++++++++++++------ 3 files changed, 89 insertions(+), 20 deletions(-) diff --git a/docs/source/futures.rst b/docs/source/futures.rst index e2d68d7bd53..82925138934 100644 --- a/docs/source/futures.rst +++ b/docs/source/futures.rst @@ -5,10 +5,6 @@ torch.futures ============= -.. warning:: - The ``torch.futures`` package is experimental and subject to change. - - This package provides a :class:`~torch.futures.Future` type that encapsulates an asynchronous execution and a set of utility functions to simplify operations on :class:`~torch.futures.Future` objects. Currently, the diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in index 95aef12c932..2164ae167ca 100644 --- a/torch/_C/__init__.pyi.in +++ b/torch/_C/__init__.pyi.in @@ -147,6 +147,7 @@ class JITException: ... class Future(object): def __init__(self, devices: List[device]) -> None: ... def done(self) -> _bool: ... + def value(self) -> Any: ... def wait(self) -> Any: ... def add_done_callback(self, callback: Callable) -> None: ... def then(self, callback: Callable) -> Future: ... diff --git a/torch/futures/__init__.py b/torch/futures/__init__.py index 5230eb40c81..c25c92f139d 100644 --- a/torch/futures/__init__.py +++ b/torch/futures/__init__.py @@ -21,6 +21,8 @@ class Future(torch._C.Future, Generic[T], metaclass=_PyFutureMeta): Wrapper around a ``torch._C.Future`` which encapsulates an asynchronous execution of a callable, e.g. :meth:`~torch.distributed.rpc.rpc_async`. It also exposes a set of APIs to add callback functions and set results. + + .. warning:: GPU support is a beta feature, subject to changes. """ def __init__(self, *, devices: Optional[List[Union[int, str, torch.device]]] = None): @@ -45,6 +47,12 @@ class Future(torch._C.Future, Generic[T], metaclass=_PyFutureMeta): r""" Return ``True`` if this ``Future`` is done. A ``Future`` is done if it has a result or an exception. + + If the value contains tensors that reside on GPUs, ``Future.done()`` + will return ``True`` even if the asynchronous kernels that are + populating those tensors haven't yet completed running on the device, + because at such stage the result is already usable, provided one + performs the appropriate synchronizations (see :meth:`wait`). """ return super().done() @@ -52,6 +60,16 @@ class Future(torch._C.Future, Generic[T], metaclass=_PyFutureMeta): r""" Block until the value of this ``Future`` is ready. + If the value contains tensors that reside on GPUs, then an additional + synchronization is performed with the kernels (executing on the device) + which may be asynchronously populating those tensors. Such sync is + non-blocking, which means that ``wait()`` will insert the necessary + instructions in the current streams to ensure that further operations + enqueued on those streams will be properly scheduled after the async + kernels but, once that is done, ``wait()`` will return, even if those + kernels are still running. No further synchronization is required when + accessing and using the values, as long as one doesn't change streams. + Returns: The value held by this ``Future``. If the function (callback or RPC) creating the value has thrown an error, this ``wait`` method will @@ -59,16 +77,56 @@ class Future(torch._C.Future, Generic[T], metaclass=_PyFutureMeta): """ return super().wait() + def value(self) -> T: + r""" + Obtain the value of an already-completed future. + + This method should only be called after a call to :meth:`wait` has + completed, or inside a callback function passed to :meth:`then`. In + other cases this ``Future`` may not yet hold a value and calling + ``value()`` could fail. + + If the value contains tensors that reside on GPUs, then this method will + *not* perform any additional synchronization. This should be done + beforehand, separately, through a call to :meth:`wait` (except within + callbacks, for which it's already being taken care of by :meth:`then`). + + Returns: + The value held by this ``Future``. If the function (callback or RPC) + creating the value has thrown an error, this ``value()`` method will + also throw an error. + """ + return super().value() + # Have to use string annotations because PEP-0563 is not available in 3.6 def then(self, callback): # type: (Callable[[Future[T]], S]) -> Future[S] r""" Append the given callback function to this ``Future``, which will be run when the ``Future`` is completed. Multiple callbacks can be added to - the same ``Future``, and will be invoked in the same order as they were - added. The callback must take one argument, which is the reference to - this ``Future``. The callback function can use the ``Future.wait()`` API - to get the value. Note that if this ``Future`` is already completed, the - given callback will be run immediately inline. + the same ``Future``, but the order in which they will be executed cannot + be guaranteed (to enforce a certain order consider chaining: + ``fut.then(cb1).then(cb2)``). The callback must take one argument, which + is the reference to this ``Future``. The callback function can use the + :meth:`value` method to get the value. Note that if this ``Future`` is + already completed, the given callback will be run immediately inline. + + If the ``Future``'s value contains tensors that reside on GPUs, the + callback might be invoked while the async kernels that are populating + those tensors haven't yet finished executing on the device. However, the + callback will be invoked with some dedicated streams set as current + (fetched from a global pool) which will be synchronized with those + kernels. Hence any operation performed by the callback on these tensors + will be scheduled on the device after the kernels complete. In other + words, as long as the callback doesn't switch streams, it can safely + manipulate the result without any additional synchronization. This is + similar to the non-blocking behavior of :meth:`wait`. + + Similarly, if the callback returns a value that contains tensors that + reside on a GPU, it can do so even if the kernels that are producing + these tensors are still running on the device, as long as the callback + didn't change streams during its execution. If one wants to change + streams, one must be careful to re-synchronize them with the original + streams, that is, those that were current when the callback was invoked. Args: callback(``Callable``): a ``Callable`` that takes this ``Future`` as @@ -110,21 +168,24 @@ class Future(torch._C.Future, Generic[T], metaclass=_PyFutureMeta): return cast(Future[S], super().then(callback)) # Have to use string annotations because PEP-0563 is not available in 3.6 - def _add_done_callback(self, callback): # type: (Callable[[Future[T]], None]) -> None + def add_done_callback(self, callback): # type: (Callable[[Future[T]], None]) -> None r""" Append the given callback function to this ``Future``, which will be run when the ``Future`` is completed. Multiple callbacks can be added to - the same ``Future``, and will be invoked in the same order as they were - added. The callback must take one argument, which is the reference to - this ``Future``. The callback function can use the ``Future.wait()`` API - to get the value. Note that if this ``Future`` is already completed, the - given callback will be run inline. + the same ``Future``, but the order in which they will be executed cannot + be guaranteed. The callback must take one argument, which is the + reference to this ``Future``. The callback function can use the + :meth:`value` method to get the value. Note that if this ``Future`` is + already completed, the given callback will be run inline. - We recommend that you use the ``then`` API as it provides a way to synchronize - after your callback has completed. ``add_done_callback`` can be cheaper if your - callback does not return anything. But both ``then`` and ``add_done_callback`` - use the same callback registration API under the hood, and thus the order of - their callbacks will be maintained even if their calls are interleaved. + We recommend that you use the :meth:`then` method as it provides a way + to synchronize after your callback has completed. ``add_done_callback`` + can be cheaper if your callback does not return anything. But both + :meth:`then` and ``add_done_callback`` use the same callback + registration API under the hood. + + With respect to GPU tensors, this method behaves in the same way as + :meth:`then`. Args: callback(``Future``): a ``Callable`` that takes in one argument, @@ -161,6 +222,17 @@ class Future(torch._C.Future, Generic[T], metaclass=_PyFutureMeta): completed and trigger all attached callbacks. Note that a ``Future`` cannot be marked completed twice. + If the result contains tensors that reside on GPUs, this method can be + called even if the asynchronous kernels that are populating those + tensors haven't yet completed running on the device, provided that the + streams on which those kernels were enqueued are set as the current ones + when this method is called. Put simply, it's safe to call this method + immediately after launching those kernels, without any additional + synchronization, as long as one doesn't change streams in between. This + method will record events on all the relevant current streams and will + use them to ensure proper scheduling for all the consumers of this + ``Future``. + Args: result (object): the result object of this ``Future``.