mirror of
https://github.com/zebrajr/pytorch.git
synced 2025-12-06 12:20:52 +01:00
This updates the gloo submodule in PyTorch to a version that supports the new ibverbs backend that can be used with PyTorch.
Test plan:
```
sudo dnf install rdma-core-devel
USE_GLOO_IBVERBS=ON python setup.py develop
torchrun --nproc_per_node 2 ~/scripts/gloo_ibverbs_test.py
```
```py
"""
run with:
torchrun --nproc_per_node 2 ~/scripts/gloo_ibverbs_test.py
"""
import os

# Request the IBVERBS gloo transport through the environment.
os.environ["GLOO_DEVICE_TRANSPORT"] = "IBVERBS"

import torch
import torch.distributed as dist

dist.init_process_group("gloo")

rank = dist.get_rank()

# Rank 0 keeps its tensors on the CPU; every other rank uses CUDA.
if rank == 0:
    device = "cpu"
else:
    device = "cuda"

print(device)

# all_reduce: with the two ranks launched above, the contributions are
# 1 (rank 0) and 2 (rank 1), so every element is expected to equal 3.
t = torch.full((10, 100), fill_value=(rank+1), device=device)
target = torch.full((10, 100), fill_value=3, device=device)
dist.all_reduce(t)
torch.testing.assert_close(t, target)

# send/recv: rank 0 sends its tensor of ones to rank 1, so both ranks are
# expected to end up holding ones.
t = torch.full((10, 100), fill_value=(rank+1), device=device)
if rank == 0:
    dist.send(t, dst=1)
else:
    dist.recv(t, src=0)
torch.testing.assert_close(t, torch.full_like(t, 1))
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/153425
Approved by: https://github.com/fduwjj
242 lines
6.7 KiB
C++
242 lines
6.7 KiB
C++
#include <torch/csrc/distributed/c10d/GlooDeviceFactory.hpp>
|
|
|
|
#include <torch/csrc/distributed/c10d/Utils.hpp>
|
|
|
|
#ifdef USE_C10D_GLOO
|
|
|
|
#include <cstdlib>
|
|
|
|
#include <c10/util/Exception.h>
|
|
#include <c10/util/env.h>
|
|
|
|
#if GLOO_HAVE_TRANSPORT_TCP
|
|
#include <gloo/transport/tcp/device.h>
|
|
#endif
|
|
|
|
#if GLOO_HAVE_TRANSPORT_TCP_TLS
|
|
#include <gloo/transport/tcp/tls/device.h>
|
|
#endif
|
|
|
|
#if GLOO_HAVE_TRANSPORT_UV
|
|
#include <gloo/transport/uv/device.h>
|
|
#endif
|
|
|
|
#if GLOO_HAVE_TRANSPORT_IBVERBS
|
|
#include <gloo/transport/ibverbs/device.h>
|
|
#endif
|
|
|
|
// On Linux, check that the tcp transport is available.
|
|
#ifdef __linux__
|
|
#if !GLOO_HAVE_TRANSPORT_TCP
|
|
#error "Expected the tcp transport to be available on Linux."
|
|
#endif
|
|
#endif
|
|
|
|
// On macOS, check that the uv transport is available.
|
|
#ifdef __APPLE__
|
|
#if !GLOO_HAVE_TRANSPORT_UV
|
|
#error "Expected the uv transport to be available on macOS."
|
|
#endif
|
|
#endif
|
|
|
|
namespace c10d {
|
|
|
|
// Registry of gloo transport device creators, looked up by a string key.
// Every registered creator receives (interface, hostname, lazyInit) and
// yields a ::gloo::transport::Device.
C10_DEFINE_SHARED_REGISTRY_WITHOUT_WARNING(
    GlooDeviceRegistry,
    ::gloo::transport::Device,
    const std::string& /* interface */,
    const std::string& /* hostname */,
    bool /* lazyInit */)
|
|
|
|
#if GLOO_HAVE_TRANSPORT_TCP
|
|
static std::shared_ptr<::gloo::transport::Device> makeTCPDevice(
|
|
const std::string& interfaceName,
|
|
const std::string& hostname,
|
|
bool lazyInit) {
|
|
TORCH_CHECK(
|
|
!interfaceName.empty() || !hostname.empty(),
|
|
"GlooDeviceFactory::makeTCPDevice(): interface or hostname "
|
|
"can't be empty");
|
|
|
|
::gloo::transport::tcp::attr attr;
|
|
if (!interfaceName.empty()) {
|
|
attr.iface = interfaceName;
|
|
} else {
|
|
attr.hostname = hostname;
|
|
}
|
|
if (lazyInit) {
|
|
return ::gloo::transport::tcp::CreateLazyDevice(attr);
|
|
} else {
|
|
return ::gloo::transport::tcp::CreateDevice(attr);
|
|
}
|
|
}
|
|
|
|
// Registry priority is per key identifier. We register TCP to `LINUX` for
|
|
// the flexibility of other application to override by priority. Register
|
|
// TCP to `TCP` for env "GLOO_DEVICE_TRANSPORT" override.
|
|
C10_REGISTER_CREATOR(GlooDeviceRegistry, LINUX, makeTCPDevice)
|
|
C10_REGISTER_CREATOR(GlooDeviceRegistry, TCP, makeTCPDevice)
|
|
#endif
|
|
|
|
#if GLOO_HAVE_TRANSPORT_TCP_TLS
|
|
static std::shared_ptr<::gloo::transport::Device> makeTCPTLSDevice(
|
|
const std::string& interface,
|
|
const std::string& hostname,
|
|
bool lazyInit) {
|
|
TORCH_CHECK(
|
|
!interface.empty() || !hostname.empty(),
|
|
"GlooDeviceFactory::makeTCPTLSDevice(): interface or hostname "
|
|
"can't be empty");
|
|
|
|
TORCH_CHECK(!lazyInit, "TCP_TLS transport does not support lazy init");
|
|
|
|
::gloo::transport::tcp::attr attr;
|
|
if (!interface.empty()) {
|
|
attr.iface = interface;
|
|
} else {
|
|
attr.hostname = hostname;
|
|
}
|
|
const auto pkey_env =
|
|
c10::utils::get_env("GLOO_DEVICE_TRANSPORT_TCP_TLS_PKEY");
|
|
const auto pkey = pkey_env.has_value() ? pkey_env.value() : std::string();
|
|
const auto cert_env =
|
|
c10::utils::get_env("GLOO_DEVICE_TRANSPORT_TCP_TLS_CERT");
|
|
const auto cert = cert_env.has_value() ? cert_env.value() : std::string();
|
|
const auto caFile_env =
|
|
c10::utils::get_env("GLOO_DEVICE_TRANSPORT_TCP_TLS_CA_FILE");
|
|
const auto caFile =
|
|
caFile_env.has_value() ? caFile_env.value() : std::string();
|
|
const auto caPath_env =
|
|
c10::utils::get_env("GLOO_DEVICE_TRANSPORT_TCP_TLS_CA_PATH");
|
|
const auto caPath =
|
|
caPath_env.has_value() ? caPath_env.value() : std::string();
|
|
return ::gloo::transport::tcp::tls::CreateDevice(
|
|
attr, pkey, cert, caFile, caPath);
|
|
}
|
|
|
|
C10_REGISTER_CREATOR(GlooDeviceRegistry, TCP_TLS, makeTCPTLSDevice)
|
|
#endif
|
|
|
|
#if GLOO_HAVE_TRANSPORT_UV
|
|
static std::shared_ptr<::gloo::transport::Device> makeUVDevice(
|
|
const std::string& interfaceName,
|
|
const std::string& hostname,
|
|
bool lazyInit) {
|
|
TORCH_CHECK(
|
|
!interfaceName.empty() || !hostname.empty(),
|
|
"GlooDeviceFactory::makeUVDevice(): interface or hostname "
|
|
"can't be empty");
|
|
|
|
TORCH_CHECK(!lazyInit, "UV transport does not support lazy init");
|
|
|
|
::gloo::transport::uv::attr attr;
|
|
if (!interfaceName.empty()) {
|
|
attr.iface = interfaceName;
|
|
} else {
|
|
attr.hostname = hostname;
|
|
}
|
|
return ::gloo::transport::uv::CreateDevice(attr);
|
|
}
|
|
|
|
// Registry priority is per key identifier. We register UV to `APPLE` for
|
|
// the flexibility of other application to override by priority. Register
|
|
// UV to `UV` for env "GLOO_DEVICE_TRANSPORT" override.
|
|
C10_REGISTER_CREATOR(GlooDeviceRegistry, APPLE, makeUVDevice)
|
|
C10_REGISTER_CREATOR(GlooDeviceRegistry, WIN32, makeUVDevice)
|
|
C10_REGISTER_CREATOR(GlooDeviceRegistry, UV, makeUVDevice)
|
|
#endif
|
|
|
|
#if GLOO_HAVE_TRANSPORT_IBVERBS
|
|
static std::shared_ptr<::gloo::transport::Device> makeIBVerbsDevice(
|
|
const std::string& interface,
|
|
const std::string& hostname,
|
|
bool lazyInit) {
|
|
if (!hostname.empty()) {
|
|
TORCH_WARN(
|
|
"ibverbs transport does not support hostname, defaulting to any");
|
|
}
|
|
|
|
TORCH_CHECK(!lazyInit, "transport does not support lazy init");
|
|
|
|
::gloo::transport::ibverbs::attr attr;
|
|
attr.name = getCvarString(
|
|
{
|
|
"TORCH_GLOO_IBV_NAME",
|
|
},
|
|
"");
|
|
attr.port = getCvarInt(
|
|
{
|
|
"TORCH_GLOO_IBV_PORT",
|
|
},
|
|
1);
|
|
attr.index = getCvarInt(
|
|
{
|
|
"TORCH_GLOO_IBV_INDEX",
|
|
},
|
|
0);
|
|
|
|
if (!interface.empty()) {
|
|
attr.name = interface;
|
|
}
|
|
|
|
// use global port
|
|
attr.port = 1;
|
|
|
|
return ::gloo::transport::ibverbs::CreateDevice(attr);
|
|
}
|
|
|
|
C10_REGISTER_CREATOR(GlooDeviceRegistry, IBVERBS, makeIBVerbsDevice)
|
|
#endif
|
|
|
|
namespace {
|
|
std::shared_ptr<::gloo::transport::Device> makeGlooDevice(
|
|
const std::string& interfaceName,
|
|
const std::string& hostName,
|
|
bool lazyInit) {
|
|
static auto transportName = c10::utils::get_env("GLOO_DEVICE_TRANSPORT");
|
|
if (transportName.has_value()) {
|
|
return GlooDeviceRegistry()->Create(
|
|
transportName.value().c_str(), interfaceName, hostName, lazyInit);
|
|
}
|
|
|
|
#ifdef __linux__
|
|
return GlooDeviceRegistry()->Create(
|
|
"LINUX", interfaceName, hostName, lazyInit);
|
|
#endif
|
|
|
|
#ifdef __APPLE__
|
|
return GlooDeviceRegistry()->Create(
|
|
"APPLE", interfaceName, hostName, lazyInit);
|
|
#endif
|
|
|
|
#ifdef _WIN32
|
|
return GlooDeviceRegistry()->Create(
|
|
"WIN32", interfaceName, hostName, lazyInit);
|
|
#endif
|
|
|
|
return nullptr;
|
|
}
|
|
} // anonymous namespace
|
|
|
|
std::shared_ptr<::gloo::transport::Device> GlooDeviceFactory::
|
|
makeDeviceForInterface(const std::string& interfaceName, bool lazyInit) {
|
|
auto device = makeGlooDevice(interfaceName, "", lazyInit);
|
|
if (!device) {
|
|
TORCH_CHECK(false, "makeDeviceForInterface(): unsupported gloo device");
|
|
}
|
|
return device;
|
|
}
|
|
|
|
std::shared_ptr<::gloo::transport::Device> GlooDeviceFactory::
|
|
makeDeviceForHostname(const std::string& hostname, bool lazyInit) {
|
|
auto device = makeGlooDevice("", hostname, lazyInit);
|
|
if (!device) {
|
|
TORCH_CHECK(false, "makeDeviceForHostname(): unsupported gloo device");
|
|
}
|
|
return device;
|
|
}
|
|
|
|
} // namespace c10d
|
|
|
|
#endif // USE_C10D_GLOO
|