mirror of
https://github.com/zebrajr/pytorch.git
synced 2025-12-06 12:20:52 +01:00
c10d/gloo: add ibverbs backend (#153015)
Summary: X-link: https://github.com/pytorch/gloo/pull/437 This provides a new "UnboundBuffer" implementation for the Gloo ibverbs backend so it can be used with PyTorch. This currently passes basic tests such as `reduce_test` and `send_recv_test` but there are a number of failures. Putting this up for review so the follow-up fixes are less of a mega PR and also so we can start doing some initial E2E testing of this with PyTorch. Known issues: * recv from any is not supported * AllreduceBcubeBase2 is failing Test Plan: ``` buck2 run mode/dbgo //gloo/test:send_recv_test_ibverbs buck2 test //gloo/test: GLOO_DEVICE_TRANSPORT=IBVERBS buck2 run @//mode/opt //caffe2/test/distributed:c10d -- -r '.*gloo.*' -f ``` We can't run any of the gloo tests in CI since none of our CI machines have ibverbs, so they're disabled by default and need to be run manually. Differential Revision: D73291471 Pull Request resolved: https://github.com/pytorch/pytorch/pull/153015 Approved by: https://github.com/fduwjj
This commit is contained in:
parent
7cdf5048ea
commit
d900c68ea6
|
|
@ -1,5 +1,7 @@
|
|||
#include <torch/csrc/distributed/c10d/GlooDeviceFactory.hpp>
|
||||
|
||||
#include <torch/csrc/distributed/c10d/Utils.hpp>
|
||||
|
||||
#ifdef USE_C10D_GLOO
|
||||
|
||||
#include <cstdlib>
|
||||
|
|
@ -19,6 +21,10 @@
|
|||
#include <gloo/transport/uv/device.h>
|
||||
#endif
|
||||
|
||||
#if GLOO_HAVE_TRANSPORT_IBVERBS
|
||||
#include <gloo/transport/ibverbs/device.h>
|
||||
#endif
|
||||
|
||||
// On Linux, check that the tcp transport is available.
|
||||
#ifdef __linux__
|
||||
#if !GLOO_HAVE_TRANSPORT_TCP
|
||||
|
|
@ -140,6 +146,45 @@ C10_REGISTER_CREATOR(GlooDeviceRegistry, WIN32, makeUVDevice)
|
|||
C10_REGISTER_CREATOR(GlooDeviceRegistry, UV, makeUVDevice)
|
||||
#endif
|
||||
|
||||
#if GLOO_HAVE_TRANSPORT_IBVERBS
|
||||
static std::shared_ptr<::gloo::transport::Device> makeIBVerbsDevice(
|
||||
const std::string& interface,
|
||||
const std::string& hostname,
|
||||
bool lazyInit) {
|
||||
TORCH_CHECK(hostname.empty(), "ibverbs transport does not support hostname");
|
||||
|
||||
TORCH_CHECK(!lazyInit, "transport does not support lazy init");
|
||||
|
||||
::gloo::transport::ibverbs::attr attr;
|
||||
attr.name = getCvarString(
|
||||
{
|
||||
"TORCH_GLOO_IBV_NAME",
|
||||
},
|
||||
"");
|
||||
attr.port = getCvarInt(
|
||||
{
|
||||
"TORCH_GLOO_IBV_PORT",
|
||||
},
|
||||
1);
|
||||
attr.index = getCvarInt(
|
||||
{
|
||||
"TORCH_GLOO_IBV_INDEX",
|
||||
},
|
||||
0);
|
||||
|
||||
if (!interface.empty()) {
|
||||
attr.name = interface;
|
||||
}
|
||||
|
||||
// use global port
|
||||
attr.port = 1;
|
||||
|
||||
return ::gloo::transport::ibverbs::CreateDevice(attr);
|
||||
}
|
||||
|
||||
// Register makeIBVerbsDevice in GlooDeviceRegistry under the IBVERBS key.
C10_REGISTER_CREATOR(GlooDeviceRegistry, IBVERBS, makeIBVerbsDevice)
|
||||
#endif
|
||||
|
||||
namespace {
|
||||
std::shared_ptr<::gloo::transport::Device> makeGlooDevice(
|
||||
const std::string& interfaceName,
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user