mirror of
https://github.com/zebrajr/pytorch.git
synced 2025-12-07 12:21:27 +01:00
This reverts commit 73f3d6d9aa.
Reapplies #150801
Test plan:
See #150801
submodule
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151031
Approved by: https://github.com/fduwjj
This commit is contained in:
parent
b7c0fda163
commit
df4e5294a6
|
|
@ -284,6 +284,13 @@ The machine with rank 0 will be used to set up all connections.
|
||||||
This is the default method, meaning that ``init_method`` does not have to be specified (or
|
This is the default method, meaning that ``init_method`` does not have to be specified (or
|
||||||
can be ``env://``).
|
can be ``env://``).
|
||||||
|
|
||||||
|
Improving initialization time
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
* ``TORCH_GLOO_LAZY_INIT`` - establishes connections on demand rather than
|
||||||
|
using a full mesh, which can greatly improve initialization time for non-all-to-all
|
||||||
|
operations.
|
||||||
|
|
||||||
Post-Initialization
|
Post-Initialization
|
||||||
-------------------
|
-------------------
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -46,6 +46,7 @@ from torch.testing._internal.common_distributed import (
|
||||||
requires_gloo,
|
requires_gloo,
|
||||||
simple_sparse_reduce_tests,
|
simple_sparse_reduce_tests,
|
||||||
skip_if_lt_x_gpu,
|
skip_if_lt_x_gpu,
|
||||||
|
skip_if_win32,
|
||||||
verify_ddp_error_logged,
|
verify_ddp_error_logged,
|
||||||
)
|
)
|
||||||
from torch.testing._internal.common_utils import (
|
from torch.testing._internal.common_utils import (
|
||||||
|
|
@ -219,6 +220,8 @@ class TimeoutTest(test_c10d_common.AbstractTimeoutTest, TestCase):
|
||||||
|
|
||||||
|
|
||||||
class ProcessGroupGlooTest(MultiProcessTestCase):
|
class ProcessGroupGlooTest(MultiProcessTestCase):
|
||||||
|
lazy_init = False
|
||||||
|
|
||||||
def _create_process_group_gloo(self, store, rank, world_size, opts):
|
def _create_process_group_gloo(self, store, rank, world_size, opts):
|
||||||
pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, opts)
|
pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, opts)
|
||||||
dist.barrier(group=pg)
|
dist.barrier(group=pg)
|
||||||
|
|
@ -231,7 +234,7 @@ class ProcessGroupGlooTest(MultiProcessTestCase):
|
||||||
def opts(self, threads=2):
|
def opts(self, threads=2):
|
||||||
opts = c10d.ProcessGroupGloo._Options()
|
opts = c10d.ProcessGroupGloo._Options()
|
||||||
opts._timeout = 50.0
|
opts._timeout = 50.0
|
||||||
opts._devices = [create_device(interface=LOOPBACK)]
|
opts._devices = [create_device(interface=LOOPBACK, lazy_init=self.lazy_init)]
|
||||||
opts._threads = threads
|
opts._threads = threads
|
||||||
return opts
|
return opts
|
||||||
|
|
||||||
|
|
@ -241,8 +244,8 @@ class ProcessGroupGlooTest(MultiProcessTestCase):
|
||||||
opts = c10d.ProcessGroupGloo._Options()
|
opts = c10d.ProcessGroupGloo._Options()
|
||||||
opts._timeout = 5.0
|
opts._timeout = 5.0
|
||||||
opts._devices = [
|
opts._devices = [
|
||||||
create_device(interface=LOOPBACK),
|
create_device(interface=LOOPBACK, lazy_init=self.lazy_init),
|
||||||
create_device(interface=LOOPBACK),
|
create_device(interface=LOOPBACK, lazy_init=self.lazy_init),
|
||||||
]
|
]
|
||||||
pg = self._create_process_group_gloo(store, self.rank, self.world_size, opts)
|
pg = self._create_process_group_gloo(store, self.rank, self.world_size, opts)
|
||||||
|
|
||||||
|
|
@ -2334,6 +2337,19 @@ class ReducerTest(TestCase):
|
||||||
optimizer.step()
|
optimizer.step()
|
||||||
|
|
||||||
|
|
||||||
|
@skip_if_win32()
|
||||||
|
class ProcessGroupGlooLazyInitTest(ProcessGroupGlooTest):
|
||||||
|
lazy_init = True
|
||||||
|
|
||||||
|
def setUp(self):
|
||||||
|
os.environ["TORCH_GLOO_LAZY_INIT"] = "1"
|
||||||
|
super().setUp()
|
||||||
|
|
||||||
|
def tearDown(self) -> None:
|
||||||
|
del os.environ["TORCH_GLOO_LAZY_INIT"]
|
||||||
|
return super().tearDown()
|
||||||
|
|
||||||
|
|
||||||
class CommTest(test_c10d_common.AbstractCommTest, MultiProcessTestCase):
|
class CommTest(test_c10d_common.AbstractCommTest, MultiProcessTestCase):
|
||||||
@property
|
@property
|
||||||
def device(self):
|
def device(self):
|
||||||
|
|
|
||||||
2
third_party/gloo
vendored
2
third_party/gloo
vendored
|
|
@ -1 +1 @@
|
||||||
Subproject commit e348db90d8677277e926c14c94ee2acfa77173d4
|
Subproject commit c61070427610ccd923efe3e7f8b3eca12bbcc31a
|
||||||
|
|
@ -570,9 +570,9 @@ class ProcessGroupGloo(Backend):
|
||||||
timeout: timedelta,
|
timeout: timedelta,
|
||||||
) -> None: ...
|
) -> None: ...
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def create_device(hostname="", interface="") -> Device: ...
|
def create_device(hostname="", interface="", lazy_init=None) -> Device: ...
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def create_default_device() -> Device: ...
|
def create_default_device(lazy_init=None) -> Device: ...
|
||||||
def _set_default_timeout(self, timeout) -> None: ...
|
def _set_default_timeout(self, timeout) -> None: ...
|
||||||
|
|
||||||
class _ProcessGroupWrapper(Backend):
|
class _ProcessGroupWrapper(Backend):
|
||||||
|
|
|
||||||
|
|
@ -39,12 +39,14 @@ C10_DEFINE_SHARED_REGISTRY_WITHOUT_WARNING(
|
||||||
GlooDeviceRegistry,
|
GlooDeviceRegistry,
|
||||||
::gloo::transport::Device,
|
::gloo::transport::Device,
|
||||||
const std::string& /* interface */,
|
const std::string& /* interface */,
|
||||||
const std::string& /* hostname */)
|
const std::string& /* hostname */,
|
||||||
|
bool /* lazyInit */)
|
||||||
|
|
||||||
#if GLOO_HAVE_TRANSPORT_TCP
|
#if GLOO_HAVE_TRANSPORT_TCP
|
||||||
static std::shared_ptr<::gloo::transport::Device> makeTCPDevice(
|
static std::shared_ptr<::gloo::transport::Device> makeTCPDevice(
|
||||||
const std::string& interfaceName,
|
const std::string& interfaceName,
|
||||||
const std::string& hostname) {
|
const std::string& hostname,
|
||||||
|
bool lazyInit) {
|
||||||
TORCH_CHECK(
|
TORCH_CHECK(
|
||||||
!interfaceName.empty() || !hostname.empty(),
|
!interfaceName.empty() || !hostname.empty(),
|
||||||
"GlooDeviceFactory::makeTCPDevice(): interface or hostname "
|
"GlooDeviceFactory::makeTCPDevice(): interface or hostname "
|
||||||
|
|
@ -56,7 +58,11 @@ static std::shared_ptr<::gloo::transport::Device> makeTCPDevice(
|
||||||
} else {
|
} else {
|
||||||
attr.hostname = hostname;
|
attr.hostname = hostname;
|
||||||
}
|
}
|
||||||
return ::gloo::transport::tcp::CreateDevice(attr);
|
if (lazyInit) {
|
||||||
|
return ::gloo::transport::tcp::CreateLazyDevice(attr);
|
||||||
|
} else {
|
||||||
|
return ::gloo::transport::tcp::CreateDevice(attr);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Registry priority is per key identifier. We register TCP to `LINUX` for
|
// Registry priority is per key identifier. We register TCP to `LINUX` for
|
||||||
|
|
@ -69,12 +75,15 @@ C10_REGISTER_CREATOR(GlooDeviceRegistry, TCP, makeTCPDevice)
|
||||||
#if GLOO_HAVE_TRANSPORT_TCP_TLS
|
#if GLOO_HAVE_TRANSPORT_TCP_TLS
|
||||||
static std::shared_ptr<::gloo::transport::Device> makeTCPTLSDevice(
|
static std::shared_ptr<::gloo::transport::Device> makeTCPTLSDevice(
|
||||||
const std::string& interface,
|
const std::string& interface,
|
||||||
const std::string& hostname) {
|
const std::string& hostname,
|
||||||
|
bool lazyInit) {
|
||||||
TORCH_CHECK(
|
TORCH_CHECK(
|
||||||
!interface.empty() || !hostname.empty(),
|
!interface.empty() || !hostname.empty(),
|
||||||
"GlooDeviceFactory::makeTCPTLSDevice(): interface or hostname "
|
"GlooDeviceFactory::makeTCPTLSDevice(): interface or hostname "
|
||||||
"can't be empty");
|
"can't be empty");
|
||||||
|
|
||||||
|
TORCH_CHECK(!lazyInit, "TCP_TLS transport does not support lazy init");
|
||||||
|
|
||||||
::gloo::transport::tcp::attr attr;
|
::gloo::transport::tcp::attr attr;
|
||||||
if (!interface.empty()) {
|
if (!interface.empty()) {
|
||||||
attr.iface = interface;
|
attr.iface = interface;
|
||||||
|
|
@ -105,12 +114,15 @@ C10_REGISTER_CREATOR(GlooDeviceRegistry, TCP_TLS, makeTCPTLSDevice)
|
||||||
#if GLOO_HAVE_TRANSPORT_UV
|
#if GLOO_HAVE_TRANSPORT_UV
|
||||||
static std::shared_ptr<::gloo::transport::Device> makeUVDevice(
|
static std::shared_ptr<::gloo::transport::Device> makeUVDevice(
|
||||||
const std::string& interfaceName,
|
const std::string& interfaceName,
|
||||||
const std::string& hostname) {
|
const std::string& hostname,
|
||||||
|
bool lazyInit) {
|
||||||
TORCH_CHECK(
|
TORCH_CHECK(
|
||||||
!interfaceName.empty() || !hostname.empty(),
|
!interfaceName.empty() || !hostname.empty(),
|
||||||
"GlooDeviceFactory::makeUVDevice(): interface or hostname "
|
"GlooDeviceFactory::makeUVDevice(): interface or hostname "
|
||||||
"can't be empty");
|
"can't be empty");
|
||||||
|
|
||||||
|
TORCH_CHECK(!lazyInit, "UV transport does not support lazy init");
|
||||||
|
|
||||||
::gloo::transport::uv::attr attr;
|
::gloo::transport::uv::attr attr;
|
||||||
if (!interfaceName.empty()) {
|
if (!interfaceName.empty()) {
|
||||||
attr.iface = interfaceName;
|
attr.iface = interfaceName;
|
||||||
|
|
@ -131,23 +143,27 @@ C10_REGISTER_CREATOR(GlooDeviceRegistry, UV, makeUVDevice)
|
||||||
namespace {
|
namespace {
|
||||||
std::shared_ptr<::gloo::transport::Device> makeGlooDevice(
|
std::shared_ptr<::gloo::transport::Device> makeGlooDevice(
|
||||||
const std::string& interfaceName,
|
const std::string& interfaceName,
|
||||||
const std::string& hostName) {
|
const std::string& hostName,
|
||||||
|
bool lazyInit) {
|
||||||
static auto transportName = c10::utils::get_env("GLOO_DEVICE_TRANSPORT");
|
static auto transportName = c10::utils::get_env("GLOO_DEVICE_TRANSPORT");
|
||||||
if (transportName.has_value()) {
|
if (transportName.has_value()) {
|
||||||
return GlooDeviceRegistry()->Create(
|
return GlooDeviceRegistry()->Create(
|
||||||
transportName.value().c_str(), interfaceName, hostName);
|
transportName.value().c_str(), interfaceName, hostName, lazyInit);
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef __linux__
|
#ifdef __linux__
|
||||||
return GlooDeviceRegistry()->Create("LINUX", interfaceName, hostName);
|
return GlooDeviceRegistry()->Create(
|
||||||
|
"LINUX", interfaceName, hostName, lazyInit);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef __APPLE__
|
#ifdef __APPLE__
|
||||||
return GlooDeviceRegistry()->Create("APPLE", interfaceName, hostName);
|
return GlooDeviceRegistry()->Create(
|
||||||
|
"APPLE", interfaceName, hostName, lazyInit);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef _WIN32
|
#ifdef _WIN32
|
||||||
return GlooDeviceRegistry()->Create("WIN32", interfaceName, hostName);
|
return GlooDeviceRegistry()->Create(
|
||||||
|
"WIN32", interfaceName, hostName, lazyInit);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
return nullptr;
|
return nullptr;
|
||||||
|
|
@ -155,8 +171,8 @@ std::shared_ptr<::gloo::transport::Device> makeGlooDevice(
|
||||||
} // anonymous namespace
|
} // anonymous namespace
|
||||||
|
|
||||||
std::shared_ptr<::gloo::transport::Device> GlooDeviceFactory::
|
std::shared_ptr<::gloo::transport::Device> GlooDeviceFactory::
|
||||||
makeDeviceForInterface(const std::string& interfaceName) {
|
makeDeviceForInterface(const std::string& interfaceName, bool lazyInit) {
|
||||||
auto device = makeGlooDevice(interfaceName, "");
|
auto device = makeGlooDevice(interfaceName, "", lazyInit);
|
||||||
if (!device) {
|
if (!device) {
|
||||||
TORCH_CHECK(false, "makeDeviceForInterface(): unsupported gloo device");
|
TORCH_CHECK(false, "makeDeviceForInterface(): unsupported gloo device");
|
||||||
}
|
}
|
||||||
|
|
@ -164,8 +180,8 @@ std::shared_ptr<::gloo::transport::Device> GlooDeviceFactory::
|
||||||
}
|
}
|
||||||
|
|
||||||
std::shared_ptr<::gloo::transport::Device> GlooDeviceFactory::
|
std::shared_ptr<::gloo::transport::Device> GlooDeviceFactory::
|
||||||
makeDeviceForHostname(const std::string& hostname) {
|
makeDeviceForHostname(const std::string& hostname, bool lazyInit) {
|
||||||
auto device = makeGlooDevice("", hostname);
|
auto device = makeGlooDevice("", hostname, lazyInit);
|
||||||
if (!device) {
|
if (!device) {
|
||||||
TORCH_CHECK(false, "makeDeviceForHostname(): unsupported gloo device");
|
TORCH_CHECK(false, "makeDeviceForHostname(): unsupported gloo device");
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -14,18 +14,21 @@ class TORCH_API GlooDeviceFactory {
|
||||||
public:
|
public:
|
||||||
// Create new device instance for specific interface.
|
// Create new device instance for specific interface.
|
||||||
static std::shared_ptr<::gloo::transport::Device> makeDeviceForInterface(
|
static std::shared_ptr<::gloo::transport::Device> makeDeviceForInterface(
|
||||||
const std::string& interface);
|
const std::string& interface,
|
||||||
|
bool lazyInit);
|
||||||
|
|
||||||
// Create new device instance for specific hostname or address.
|
// Create new device instance for specific hostname or address.
|
||||||
static std::shared_ptr<::gloo::transport::Device> makeDeviceForHostname(
|
static std::shared_ptr<::gloo::transport::Device> makeDeviceForHostname(
|
||||||
const std::string& hostname);
|
const std::string& hostname,
|
||||||
|
bool lazyInit);
|
||||||
};
|
};
|
||||||
|
|
||||||
TORCH_DECLARE_SHARED_REGISTRY(
|
TORCH_DECLARE_SHARED_REGISTRY(
|
||||||
GlooDeviceRegistry,
|
GlooDeviceRegistry,
|
||||||
::gloo::transport::Device,
|
::gloo::transport::Device,
|
||||||
const std::string&, /* interface */
|
const std::string&, /* interface */
|
||||||
const std::string& /* hostname */);
|
const std::string&, /* hostname */
|
||||||
|
bool /* lazyInit */);
|
||||||
|
|
||||||
} // namespace c10d
|
} // namespace c10d
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -415,6 +415,10 @@ const auto kLoopbackAddress = "127.0.0.1";
|
||||||
|
|
||||||
} // namespace
|
} // namespace
|
||||||
|
|
||||||
|
bool getDefaultGlooLazyInit() {
|
||||||
|
return ::c10d::getCvarBool(TORCH_GLOO_LAZY_INIT, false);
|
||||||
|
}
|
||||||
|
|
||||||
// static
|
// static
|
||||||
void ProcessGroupGloo::AsyncWork::execute(
|
void ProcessGroupGloo::AsyncWork::execute(
|
||||||
const c10::intrusive_ptr<AsyncWork>& work) {
|
const c10::intrusive_ptr<AsyncWork>& work) {
|
||||||
|
|
@ -687,23 +691,24 @@ bool doesHostnameResolveToUsableAddress(const std::string& hostname) {
|
||||||
} // namespace
|
} // namespace
|
||||||
|
|
||||||
std::shared_ptr<::gloo::transport::Device> ProcessGroupGloo::
|
std::shared_ptr<::gloo::transport::Device> ProcessGroupGloo::
|
||||||
createDeviceForInterface(const std::string& interface_name) {
|
createDeviceForInterface(const std::string& interface_name, bool lazyInit) {
|
||||||
return ::c10d::GlooDeviceFactory::makeDeviceForInterface(interface_name);
|
return ::c10d::GlooDeviceFactory::makeDeviceForInterface(
|
||||||
|
interface_name, lazyInit);
|
||||||
}
|
}
|
||||||
|
|
||||||
std::shared_ptr<::gloo::transport::Device> ProcessGroupGloo::
|
std::shared_ptr<::gloo::transport::Device> ProcessGroupGloo::
|
||||||
createDeviceForHostname(const std::string& hostname) {
|
createDeviceForHostname(const std::string& hostname, bool lazyInit) {
|
||||||
TORCH_CHECK(
|
TORCH_CHECK(
|
||||||
doesHostnameResolveToUsableAddress(hostname),
|
doesHostnameResolveToUsableAddress(hostname),
|
||||||
"Cannot resolve ",
|
"Cannot resolve ",
|
||||||
hostname,
|
hostname,
|
||||||
" to a (local) address");
|
" to a (local) address");
|
||||||
return ::c10d::GlooDeviceFactory::makeDeviceForHostname(hostname);
|
return ::c10d::GlooDeviceFactory::makeDeviceForHostname(hostname, lazyInit);
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined(__linux__) || defined(_WIN32)
|
#if defined(__linux__) || defined(_WIN32)
|
||||||
std::shared_ptr<::gloo::transport::Device> ProcessGroupGloo::
|
std::shared_ptr<::gloo::transport::Device> ProcessGroupGloo::
|
||||||
createDefaultDevice() {
|
createDefaultDevice(bool lazyInit) {
|
||||||
// Use the hostname to resolve the network address to
|
// Use the hostname to resolve the network address to
|
||||||
// use. Note: if the hostname does not resolve to an address (e.g.
|
// use. Note: if the hostname does not resolve to an address (e.g.
|
||||||
// because of misconfigured /etc/hosts file), this will not work.
|
// because of misconfigured /etc/hosts file), this will not work.
|
||||||
|
|
@ -716,7 +721,8 @@ std::shared_ptr<::gloo::transport::Device> ProcessGroupGloo::
|
||||||
|
|
||||||
// Use this machine's hostname if it resolves to an address.
|
// Use this machine's hostname if it resolves to an address.
|
||||||
if (doesHostnameResolveToUsableAddress(hostname.data())) {
|
if (doesHostnameResolveToUsableAddress(hostname.data())) {
|
||||||
return ::c10d::GlooDeviceFactory::makeDeviceForHostname(hostname.data());
|
return ::c10d::GlooDeviceFactory::makeDeviceForHostname(
|
||||||
|
hostname.data(), lazyInit);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Otherwise, use the loopback address.
|
// Otherwise, use the loopback address.
|
||||||
|
|
@ -724,13 +730,13 @@ std::shared_ptr<::gloo::transport::Device> ProcessGroupGloo::
|
||||||
"Unable to resolve hostname to a (local) address. ",
|
"Unable to resolve hostname to a (local) address. ",
|
||||||
"Using the loopback address as fallback. ",
|
"Using the loopback address as fallback. ",
|
||||||
"Manually set the network interface to bind to with GLOO_SOCKET_IFNAME.");
|
"Manually set the network interface to bind to with GLOO_SOCKET_IFNAME.");
|
||||||
return createDeviceForHostname(kLoopbackAddress);
|
return createDeviceForHostname(kLoopbackAddress, lazyInit);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef __APPLE__
|
#ifdef __APPLE__
|
||||||
std::shared_ptr<::gloo::transport::Device> ProcessGroupGloo::
|
std::shared_ptr<::gloo::transport::Device> ProcessGroupGloo::
|
||||||
createDefaultDevice() {
|
createDefaultDevice(bool lazyInit) {
|
||||||
// Use the hostname to resolve the network address to
|
// Use the hostname to resolve the network address to
|
||||||
// use. Note: if the hostname does not resolve to an address (e.g.
|
// use. Note: if the hostname does not resolve to an address (e.g.
|
||||||
// because of misconfigured /etc/hosts file), this will not work.
|
// because of misconfigured /etc/hosts file), this will not work.
|
||||||
|
|
@ -743,7 +749,8 @@ std::shared_ptr<::gloo::transport::Device> ProcessGroupGloo::
|
||||||
|
|
||||||
// Use this machine's hostname if it resolves to an address.
|
// Use this machine's hostname if it resolves to an address.
|
||||||
if (doesHostnameResolveToUsableAddress(hostname.get())) {
|
if (doesHostnameResolveToUsableAddress(hostname.get())) {
|
||||||
return ::c10d::GlooDeviceFactory::makeDeviceForHostname(hostname.get());
|
return ::c10d::GlooDeviceFactory::makeDeviceForHostname(
|
||||||
|
hostname.get(), lazyInit);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Otherwise, use the loopback address.
|
// Otherwise, use the loopback address.
|
||||||
|
|
@ -751,7 +758,7 @@ std::shared_ptr<::gloo::transport::Device> ProcessGroupGloo::
|
||||||
"Unable to resolve hostname to a (local) address. ",
|
"Unable to resolve hostname to a (local) address. ",
|
||||||
"Using the loopback address as fallback. ",
|
"Using the loopback address as fallback. ",
|
||||||
"Manually set the network interface to bind to with GLOO_SOCKET_IFNAME.");
|
"Manually set the network interface to bind to with GLOO_SOCKET_IFNAME.");
|
||||||
return createDeviceForHostname(kLoopbackAddress);
|
return createDeviceForHostname(kLoopbackAddress, lazyInit);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -28,6 +28,13 @@ namespace c10d {
|
||||||
|
|
||||||
constexpr const char* GLOO_BACKEND_NAME = "gloo";
|
constexpr const char* GLOO_BACKEND_NAME = "gloo";
|
||||||
|
|
||||||
|
// Controls whether connections are established eagerly in a full mesh or lazily
|
||||||
|
// as needed.
|
||||||
|
static std::vector<std::string> TORCH_GLOO_LAZY_INIT = {"TORCH_GLOO_LAZY_INIT"};
|
||||||
|
|
||||||
|
// Returns the default value for lazyInit, read from TORCH_GLOO_LAZY_INIT (false if unset).
|
||||||
|
bool TORCH_API getDefaultGlooLazyInit();
|
||||||
|
|
||||||
// ProcessGroupGloo implements Gloo bindings for c10d.
|
// ProcessGroupGloo implements Gloo bindings for c10d.
|
||||||
//
|
//
|
||||||
// All functions on this class are expected to be called in the same
|
// All functions on this class are expected to be called in the same
|
||||||
|
|
@ -244,24 +251,20 @@ class TORCH_API ProcessGroupGloo : public Backend {
|
||||||
|
|
||||||
// Create new device instance for specific interface.
|
// Create new device instance for specific interface.
|
||||||
static std::shared_ptr<::gloo::transport::Device> createDeviceForInterface(
|
static std::shared_ptr<::gloo::transport::Device> createDeviceForInterface(
|
||||||
const std::string& interface);
|
const std::string& interface,
|
||||||
|
bool lazyInit = false);
|
||||||
|
|
||||||
// Create new device instance for specific hostname or address.
|
// Create new device instance for specific hostname or address.
|
||||||
static std::shared_ptr<::gloo::transport::Device> createDeviceForHostname(
|
static std::shared_ptr<::gloo::transport::Device> createDeviceForHostname(
|
||||||
const std::string& hostname);
|
const std::string& hostname,
|
||||||
|
bool lazyInit = false);
|
||||||
|
|
||||||
// Create new device instance.
|
// Create new device instance.
|
||||||
// It tries to resolve this machine's hostname and bind to that address.
|
// It tries to resolve this machine's hostname and bind to that address.
|
||||||
// If that fails (i.e. the hostname doesn't resolve to an address), it
|
// If that fails (i.e. the hostname doesn't resolve to an address), it
|
||||||
// falls back to binding to the loopback address.
|
// falls back to binding to the loopback address.
|
||||||
static std::shared_ptr<::gloo::transport::Device> createDefaultDevice();
|
static std::shared_ptr<::gloo::transport::Device> createDefaultDevice(
|
||||||
|
bool lazyInit = false);
|
||||||
// Create ProcessGroupGloo instance.
|
|
||||||
static c10::intrusive_ptr<ProcessGroupGloo> createProcessGroupGloo(
|
|
||||||
const c10::intrusive_ptr<Store>& store,
|
|
||||||
int rank,
|
|
||||||
int size,
|
|
||||||
std::chrono::milliseconds timeout);
|
|
||||||
|
|
||||||
explicit ProcessGroupGloo(
|
explicit ProcessGroupGloo(
|
||||||
const c10::intrusive_ptr<Store>& store,
|
const c10::intrusive_ptr<Store>& store,
|
||||||
|
|
|
||||||
|
|
@ -2849,24 +2849,36 @@ options :class:`~torch.distributed.ProcessGroupNCCL.Options`).
|
||||||
processGroupGloo
|
processGroupGloo
|
||||||
.def_static(
|
.def_static(
|
||||||
"create_device",
|
"create_device",
|
||||||
[](const std::string& hostname, const std::string& interface)
|
[](const std::string& hostname,
|
||||||
|
const std::string& interface,
|
||||||
|
std::optional<bool> lazyInit_)
|
||||||
-> std::shared_ptr<::gloo::transport::Device> {
|
-> std::shared_ptr<::gloo::transport::Device> {
|
||||||
|
bool lazyInit =
|
||||||
|
lazyInit_.value_or(::c10d::getDefaultGlooLazyInit());
|
||||||
|
|
||||||
if (!hostname.empty()) {
|
if (!hostname.empty()) {
|
||||||
return ::c10d::ProcessGroupGloo::createDeviceForHostname(
|
return ::c10d::ProcessGroupGloo::createDeviceForHostname(
|
||||||
hostname);
|
hostname, lazyInit);
|
||||||
}
|
}
|
||||||
if (!interface.empty()) {
|
if (!interface.empty()) {
|
||||||
return ::c10d::ProcessGroupGloo::createDeviceForInterface(
|
return ::c10d::ProcessGroupGloo::createDeviceForInterface(
|
||||||
interface);
|
interface, lazyInit);
|
||||||
}
|
}
|
||||||
throw std::invalid_argument(
|
throw std::invalid_argument(
|
||||||
"Specify either `hostname` or `interface` argument.");
|
"Specify either `hostname` or `interface` argument.");
|
||||||
},
|
},
|
||||||
py::arg("hostname") = "",
|
py::arg("hostname") = "",
|
||||||
py::arg("interface") = "")
|
py::arg("interface") = "",
|
||||||
|
py::arg("lazy_init") = std::nullopt)
|
||||||
.def_static(
|
.def_static(
|
||||||
"create_default_device",
|
"create_default_device",
|
||||||
&::c10d::ProcessGroupGloo::createDefaultDevice);
|
[](std::optional<bool> lazyInit_) {
|
||||||
|
bool lazyInit =
|
||||||
|
lazyInit_.value_or(::c10d::getDefaultGlooLazyInit());
|
||||||
|
|
||||||
|
return ::c10d::ProcessGroupGloo::createDefaultDevice(lazyInit);
|
||||||
|
},
|
||||||
|
py::arg("lazy_init") = std::nullopt);
|
||||||
|
|
||||||
processGroupGloo
|
processGroupGloo
|
||||||
.def(
|
.def(
|
||||||
|
|
@ -2898,20 +2910,22 @@ options :class:`~torch.distributed.ProcessGroupNCCL.Options`).
|
||||||
py::gil_scoped_release nogil{};
|
py::gil_scoped_release nogil{};
|
||||||
|
|
||||||
auto options = ::c10d::ProcessGroupGloo::Options::create();
|
auto options = ::c10d::ProcessGroupGloo::Options::create();
|
||||||
|
bool lazyInit = ::c10d::getDefaultGlooLazyInit();
|
||||||
|
|
||||||
// Use interfaces listed in "GLOO_SOCKET_IFNAME", if set.
|
// Use interfaces listed in "GLOO_SOCKET_IFNAME", if set.
|
||||||
char* ifnameEnv = getenv(GLOO_SOCKET_IFNAME_ENV.c_str());
|
char* ifnameEnv = getenv(GLOO_SOCKET_IFNAME_ENV.c_str());
|
||||||
if (ifnameEnv && strlen(ifnameEnv) > 1) {
|
if (ifnameEnv && strlen(ifnameEnv) > 1) {
|
||||||
for (const auto& iface : ::c10d::split(',', ifnameEnv)) {
|
for (const auto& iface : ::c10d::split(',', ifnameEnv)) {
|
||||||
options->devices.push_back(
|
options->devices.push_back(
|
||||||
::c10d::ProcessGroupGloo::createDeviceForInterface(iface));
|
::c10d::ProcessGroupGloo::createDeviceForInterface(
|
||||||
|
iface, lazyInit));
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// If no hostname is specified, this function looks up
|
// If no hostname is specified, this function looks up
|
||||||
// the machine's hostname and returns a device instance
|
// the machine's hostname and returns a device instance
|
||||||
// associated with the address that the hostname resolves to.
|
// associated with the address that the hostname resolves to.
|
||||||
options->devices.push_back(
|
options->devices.push_back(
|
||||||
::c10d::ProcessGroupGloo::createDefaultDevice());
|
::c10d::ProcessGroupGloo::createDefaultDevice(lazyInit));
|
||||||
}
|
}
|
||||||
|
|
||||||
options->timeout = timeout;
|
options->timeout = timeout;
|
||||||
|
|
|
||||||
|
|
@ -442,11 +442,11 @@ if TEST_WITH_ROCM:
|
||||||
TIMEOUT_OVERRIDE["test_join_kwargs"] = 200
|
TIMEOUT_OVERRIDE["test_join_kwargs"] = 200
|
||||||
|
|
||||||
|
|
||||||
def create_device(interface=None):
|
def create_device(interface=None, lazy_init: bool = False):
|
||||||
if sys.platform == "win32" or interface is None:
|
if sys.platform == "win32" or interface is None:
|
||||||
return c10d.ProcessGroupGloo.create_device(hostname="127.0.0.1")
|
return c10d.ProcessGroupGloo.create_device(hostname="127.0.0.1", lazy_init=lazy_init)
|
||||||
else:
|
else:
|
||||||
return c10d.ProcessGroupGloo.create_device(interface=interface)
|
return c10d.ProcessGroupGloo.create_device(interface=interface, lazy_init=lazy_init)
|
||||||
|
|
||||||
|
|
||||||
def get_timeout(test_id) -> int:
|
def get_timeout(test_id) -> int:
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user