[distributed] Make rref_proxy._invoke_rpc trully async when needed. (#70206)

Summary: From https://github.com/pytorch/pytorch/issues/67626: RRefProxy (rref.rpc_async, rref.rpc_sync, rref.remote) currently uses a blocking RPC call to the owner This is done by chaining async calls. In the sync case we wait on the resulting Future. Pull Request resolved: https://github.com/pytorch/pytorch/pull/70206 Test Plan: I ran rpc_tests using tensorpipe_rpc_agent_test_fixture.py and had to adjust test_rref_proxy_timeout to the new behavior. I ran into test_tensorpipe_set_default_timeout failing due to the timeout being too small. Doesn't look related to this change. mrshenli Fixes https://github.com/pytorch/pytorch/issues/67626 cc pietern mrshenli pritamdamania87 zhaojuanmao satgera rohan-varma gqchen aazzolini osalpekar jiayisuse SciPioneer H-Huang Reviewed By: pritamdamania87 Differential Revision: D33243348 Pulled By: kumpera fbshipit-source-id: e1e8c34bb3d170407c0a793e2e585357f905d3c6 (cherry picked from commit 1ad5a7ceea)
2025-12-06 12:20:52 +01:00 · 2022-01-19 15:17:38 -08:00 · 2022-01-19 15:17:38 -08:00 · ef4bc3fa2f
commit ef4bc3fa2f
parent 70c9146c40
2 changed files with 51 additions and 20 deletions
--- a/torch/distributed/rpc/rref_proxy.py
+++ b/torch/distributed/rpc/rref_proxy.py
@ -1,9 +1,11 @@
 from functools import partial

 from . import functions
+from . import rpc_async

 import torch
 from .constants import UNSET_RPC_TIMEOUT
+from torch.futures import Future

 def _local_invoke(rref, func_name, args, kwargs):
    return getattr(rref.local_value(), func_name)(*args, **kwargs)
@ -13,26 +15,52 @@ def _local_invoke_async_execution(rref, func_name, args, kwargs):
    return getattr(rref.local_value(), func_name)(*args, **kwargs)

 def _invoke_rpc(rref, rpc_api, func_name, timeout, *args, **kwargs):
-    # Since rref._get_type can potentially issue an RPC, it should respect the
-    # passed in timeout here.
-    rref_type = rref._get_type(timeout=timeout)
+    def _rref_type_cont(rref_fut):
+        rref_type = rref_fut.value()

-    _invoke_func = _local_invoke
-    # Bypass ScriptModules when checking for async function attribute.
-    bypass_type = issubclass(rref_type, torch.jit.ScriptModule) or issubclass(
-        rref_type, torch._C.ScriptModule
-    )
-    if not bypass_type:
-        func = getattr(rref_type, func_name)
-        if hasattr(func, "_wrapped_async_rpc_function"):
-            _invoke_func = _local_invoke_async_execution
+        _invoke_func = _local_invoke
+        # Bypass ScriptModules when checking for async function attribute.
+        bypass_type = issubclass(rref_type, torch.jit.ScriptModule) or issubclass(
+            rref_type, torch._C.ScriptModule
+        )
+        if not bypass_type:
+            func = getattr(rref_type, func_name)
+            if hasattr(func, "_wrapped_async_rpc_function"):
+                _invoke_func = _local_invoke_async_execution

-    return rpc_api(
-        rref.owner(),
-        _invoke_func,
-        args=(rref, func_name, args, kwargs),
-        timeout=timeout
-    )
+        return rpc_api(
+            rref.owner(),
+            _invoke_func,
+            args=(rref, func_name, args, kwargs),
+            timeout=timeout
+        )
+
+    rref_fut = rref._get_type(timeout=timeout, blocking=False)
+
+    if rpc_api != rpc_async:
+        rref_fut.wait()
+        return _rref_type_cont(rref_fut)
+    else:
+        # A little explanation on this.
+        # rpc_async returns a Future pointing to the return value of `func_name`, it returns a `Future[T]`
+        # Calling _rref_type_cont from the `then` lambda causes Future wrapping. IOW, `then` returns a `Future[Future[T]]`
+        # To address that, we return a Future that is completed with the result of the async call.
+        result: Future = Future()
+
+        def _wrap_rref_type_cont(fut):
+            try:
+                _rref_type_cont(fut).then(_complete_op)
+            except BaseException as ex:
+                result.set_exception(ex)
+
+        def _complete_op(fut):
+            try:
+                result.set_result(fut.value())
+            except BaseException as ex:
+                result.set_exception(ex)
+
+        rref_fut.then(lambda fut: _wrap_rref_type_cont(fut))
+        return result

 # This class manages proxied RPC API calls for RRefs. It is entirely used from
 # C++ (see python_rpc_handler.cpp).
--- a/torch/testing/_internal/distributed/rpc/rpc_test.py
+++ b/torch/testing/_internal/distributed/rpc/rpc_test.py
@ -1144,7 +1144,7 @@ class RpcTest(RpcAgentTestFixture, RpcTestCommon):
            rref.rpc_sync().non_exist()

        with self.assertRaisesRegex(AttributeError, msg):
-            rref.rpc_async().non_exist()
+            rref.rpc_async().non_exist().wait()

        with self.assertRaisesRegex(AttributeError, msg):
            rref.remote().non_exist()
@ -4956,7 +4956,10 @@ class TensorPipeAgentRpcTest(RpcAgentTestFixture, RpcTestCommon):
        # which blocks on the RRef being created on owner node, until the
        # specified timeout.
        with self.assertRaisesRegex(RuntimeError, expected_error):
-            rref_api(timeout=timeout).my_instance_method(torch.ones(2, 2))
+            result = rref_api(timeout=timeout).my_instance_method(torch.ones(2, 2))
+            # rpc_async returns immediately and surface a timeout through wait()
+            if rref_api == slow_rref.rpc_async:
+                result.wait()

        # FIXME We wait until the remote completed creating the OwnerRRef
        # because there's currently a race if we shut down RPC before that.