[distributed] Make rref_proxy._invoke_rpc truly async when needed. (#70206)

Summary:
From https://github.com/pytorch/pytorch/issues/67626: RRefProxy (rref.rpc_async, rref.rpc_sync, rref.remote) currently uses a blocking RPC call to the owner.

This change removes that blocking call by chaining async calls. In the sync case we wait on the
resulting Future.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/70206

Test Plan:
I ran rpc_tests using tensorpipe_rpc_agent_test_fixture.py and had to
adjust test_rref_proxy_timeout to the new behavior.

I ran into test_tensorpipe_set_default_timeout failing due to the
timeout being too small. Doesn't look related to this change.
mrshenli
Fixes https://github.com/pytorch/pytorch/issues/67626

cc pietern mrshenli pritamdamania87 zhaojuanmao satgera rohan-varma gqchen aazzolini osalpekar jiayisuse SciPioneer H-Huang

Reviewed By: pritamdamania87

Differential Revision: D33243348

Pulled By: kumpera

fbshipit-source-id: e1e8c34bb3d170407c0a793e2e585357f905d3c6
(cherry picked from commit 1ad5a7ceea)
This commit is contained in:
Rodrigo Kumpera 2022-01-19 15:17:38 -08:00 committed by PyTorch MergeBot
parent 70c9146c40
commit ef4bc3fa2f
2 changed files with 51 additions and 20 deletions

View File

@ -1,9 +1,11 @@
from functools import partial
from . import functions
from . import rpc_async
import torch
from .constants import UNSET_RPC_TIMEOUT
from torch.futures import Future
def _local_invoke(rref, func_name, args, kwargs):
return getattr(rref.local_value(), func_name)(*args, **kwargs)
@ -13,26 +15,52 @@ def _local_invoke_async_execution(rref, func_name, args, kwargs):
return getattr(rref.local_value(), func_name)(*args, **kwargs)
def _invoke_rpc(rref, rpc_api, func_name, timeout, *args, **kwargs):
# Since rref._get_type can potentially issue an RPC, it should respect the
# passed in timeout here.
rref_type = rref._get_type(timeout=timeout)
def _rref_type_cont(rref_fut):
rref_type = rref_fut.value()
_invoke_func = _local_invoke
# Bypass ScriptModules when checking for async function attribute.
bypass_type = issubclass(rref_type, torch.jit.ScriptModule) or issubclass(
rref_type, torch._C.ScriptModule
)
if not bypass_type:
func = getattr(rref_type, func_name)
if hasattr(func, "_wrapped_async_rpc_function"):
_invoke_func = _local_invoke_async_execution
_invoke_func = _local_invoke
# Bypass ScriptModules when checking for async function attribute.
bypass_type = issubclass(rref_type, torch.jit.ScriptModule) or issubclass(
rref_type, torch._C.ScriptModule
)
if not bypass_type:
func = getattr(rref_type, func_name)
if hasattr(func, "_wrapped_async_rpc_function"):
_invoke_func = _local_invoke_async_execution
return rpc_api(
rref.owner(),
_invoke_func,
args=(rref, func_name, args, kwargs),
timeout=timeout
)
return rpc_api(
rref.owner(),
_invoke_func,
args=(rref, func_name, args, kwargs),
timeout=timeout
)
rref_fut = rref._get_type(timeout=timeout, blocking=False)
if rpc_api != rpc_async:
rref_fut.wait()
return _rref_type_cont(rref_fut)
else:
# A note on why the result is wrapped in a separate Future.
# rpc_async returns a `Future[T]` that resolves to the return value of `func_name`.
# Calling _rref_type_cont from the `then` callback causes Future wrapping: `then` returns a `Future[Future[T]]`.
# To address that, we return a separate Future that is completed with the result of the async call.
result: Future = Future()
def _wrap_rref_type_cont(fut):
try:
_rref_type_cont(fut).then(_complete_op)
except BaseException as ex:
result.set_exception(ex)
def _complete_op(fut):
try:
result.set_result(fut.value())
except BaseException as ex:
result.set_exception(ex)
rref_fut.then(lambda fut: _wrap_rref_type_cont(fut))
return result
# This class manages proxied RPC API calls for RRefs. It is entirely used from
# C++ (see python_rpc_handler.cpp).

View File

@ -1144,7 +1144,7 @@ class RpcTest(RpcAgentTestFixture, RpcTestCommon):
rref.rpc_sync().non_exist()
with self.assertRaisesRegex(AttributeError, msg):
rref.rpc_async().non_exist()
rref.rpc_async().non_exist().wait()
with self.assertRaisesRegex(AttributeError, msg):
rref.remote().non_exist()
@ -4956,7 +4956,10 @@ class TensorPipeAgentRpcTest(RpcAgentTestFixture, RpcTestCommon):
# which blocks on the RRef being created on owner node, until the
# specified timeout.
with self.assertRaisesRegex(RuntimeError, expected_error):
rref_api(timeout=timeout).my_instance_method(torch.ones(2, 2))
result = rref_api(timeout=timeout).my_instance_method(torch.ones(2, 2))
# rpc_async returns immediately and surfaces a timeout through wait()
if rref_api == slow_rref.rpc_async:
result.wait()
# FIXME We wait until the remote completed creating the OwnerRRef
# because there's currently a race if we shut down RPC before that.