mirror of
https://github.com/zebrajr/pytorch.git
synced 2025-12-06 12:20:52 +01:00
[C10D] Extract some bits of TCPStore into TCPStoreBackend. (#105163)
This moves BackgroundThread to TCPStoreBackend.hpp. This will eventually be the interface shared between the current TCPStore backend and the new libuv one. Pull Request resolved: https://github.com/pytorch/pytorch/pull/105163 Approved by: https://github.com/fduwjj, https://github.com/H-Huang
This commit is contained in:
parent
b65b9e6ff4
commit
fe284b0d97
|
|
@ -528,6 +528,7 @@ libtorch_distributed_base_sources = [
|
||||||
"torch/csrc/distributed/c10d/ProcessGroupWrapper.cpp",
|
"torch/csrc/distributed/c10d/ProcessGroupWrapper.cpp",
|
||||||
"torch/csrc/distributed/c10d/Store.cpp",
|
"torch/csrc/distributed/c10d/Store.cpp",
|
||||||
"torch/csrc/distributed/c10d/TCPStore.cpp",
|
"torch/csrc/distributed/c10d/TCPStore.cpp",
|
||||||
|
"torch/csrc/distributed/c10d/TCPStoreBackend.cpp",
|
||||||
"torch/csrc/distributed/c10d/Utils.cpp",
|
"torch/csrc/distributed/c10d/Utils.cpp",
|
||||||
"torch/csrc/distributed/c10d/comm.cpp",
|
"torch/csrc/distributed/c10d/comm.cpp",
|
||||||
"torch/csrc/distributed/c10d/debug.cpp",
|
"torch/csrc/distributed/c10d/debug.cpp",
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,6 @@
|
||||||
#include <c10/util/irange.h>
|
#include <c10/util/irange.h>
|
||||||
#include <torch/csrc/distributed/c10d/TCPStore.hpp>
|
#include <torch/csrc/distributed/c10d/TCPStore.hpp>
|
||||||
|
#include <torch/csrc/distributed/c10d/TCPStoreBackend.hpp>
|
||||||
#include <torch/csrc/distributed/c10d/logging.h>
|
#include <torch/csrc/distributed/c10d/logging.h>
|
||||||
|
|
||||||
#include <fcntl.h>
|
#include <fcntl.h>
|
||||||
|
|
@ -29,114 +30,6 @@
|
||||||
namespace c10d {
|
namespace c10d {
|
||||||
namespace detail {
|
namespace detail {
|
||||||
namespace {
|
namespace {
|
||||||
|
|
||||||
// Abstract base class to handle thread state for TCPStoreMasterDaemon.
|
|
||||||
// Contains the windows/unix implementations to signal a
|
|
||||||
// shutdown sequence for the thread
|
|
||||||
class BackgroundThread {
|
|
||||||
public:
|
|
||||||
explicit BackgroundThread(Socket&& storeListenSocket);
|
|
||||||
|
|
||||||
virtual ~BackgroundThread() = 0;
|
|
||||||
|
|
||||||
protected:
|
|
||||||
void dispose();
|
|
||||||
|
|
||||||
Socket storeListenSocket_;
|
|
||||||
std::thread daemonThread_{};
|
|
||||||
std::vector<Socket> sockets_{};
|
|
||||||
#ifdef _WIN32
|
|
||||||
const std::chrono::milliseconds checkTimeout_ = std::chrono::milliseconds{10};
|
|
||||||
HANDLE ghStopEvent_{};
|
|
||||||
#else
|
|
||||||
std::array<int, 2> controlPipeFd_{{-1, -1}};
|
|
||||||
#endif
|
|
||||||
|
|
||||||
private:
|
|
||||||
// Initialization for shutdown signal
|
|
||||||
void initStopSignal();
|
|
||||||
// Triggers the shutdown signal
|
|
||||||
void stop();
|
|
||||||
// Joins the thread
|
|
||||||
void join();
|
|
||||||
// Clean up the shutdown signal
|
|
||||||
void closeStopSignal();
|
|
||||||
};
|
|
||||||
|
|
||||||
// Background thread parent class methods
|
|
||||||
BackgroundThread::BackgroundThread(Socket&& storeListenSocket)
|
|
||||||
: storeListenSocket_{std::move(storeListenSocket)} {
|
|
||||||
// Signal instance destruction to the daemon thread.
|
|
||||||
initStopSignal();
|
|
||||||
}
|
|
||||||
|
|
||||||
BackgroundThread::~BackgroundThread() = default;
|
|
||||||
|
|
||||||
// WARNING:
|
|
||||||
// Since we rely on the subclass for the daemon thread clean-up, we cannot
|
|
||||||
// destruct our member variables in the destructor. The subclass must call
|
|
||||||
// dispose() in its own destructor.
|
|
||||||
void BackgroundThread::dispose() {
|
|
||||||
// Stop the run
|
|
||||||
stop();
|
|
||||||
// Join the thread
|
|
||||||
join();
|
|
||||||
// Close unclosed sockets
|
|
||||||
sockets_.clear();
|
|
||||||
// Now close the rest control pipe
|
|
||||||
closeStopSignal();
|
|
||||||
}
|
|
||||||
|
|
||||||
void BackgroundThread::join() {
|
|
||||||
daemonThread_.join();
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifdef _WIN32
|
|
||||||
void BackgroundThread::initStopSignal() {
|
|
||||||
ghStopEvent_ = CreateEvent(NULL, TRUE, FALSE, NULL);
|
|
||||||
if (ghStopEvent_ == NULL) {
|
|
||||||
TORCH_CHECK(
|
|
||||||
false,
|
|
||||||
"Failed to create the control pipe to start the "
|
|
||||||
"BackgroundThread run");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void BackgroundThread::closeStopSignal() {
|
|
||||||
CloseHandle(ghStopEvent_);
|
|
||||||
}
|
|
||||||
|
|
||||||
void BackgroundThread::stop() {
|
|
||||||
SetEvent(ghStopEvent_);
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
void BackgroundThread::initStopSignal() {
|
|
||||||
if (pipe(controlPipeFd_.data()) == -1) {
|
|
||||||
TORCH_CHECK(
|
|
||||||
false,
|
|
||||||
"Failed to create the control pipe to start the "
|
|
||||||
"BackgroundThread run");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void BackgroundThread::closeStopSignal() {
|
|
||||||
for (int fd : controlPipeFd_) {
|
|
||||||
if (fd != -1) {
|
|
||||||
::close(fd);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void BackgroundThread::stop() {
|
|
||||||
if (controlPipeFd_[1] != -1) {
|
|
||||||
::write(controlPipeFd_[1], "\0", 1);
|
|
||||||
// close the write end of the pipe
|
|
||||||
::close(controlPipeFd_[1]);
|
|
||||||
controlPipeFd_[1] = -1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
enum class QueryType : uint8_t {
|
enum class QueryType : uint8_t {
|
||||||
SET,
|
SET,
|
||||||
COMPARE_SET,
|
COMPARE_SET,
|
||||||
|
|
|
||||||
105
torch/csrc/distributed/c10d/TCPStoreBackend.cpp
Normal file
105
torch/csrc/distributed/c10d/TCPStoreBackend.cpp
Normal file
|
|
@ -0,0 +1,105 @@
|
||||||
|
#include <torch/csrc/distributed/c10d/TCPStoreBackend.hpp>
|
||||||
|
|
||||||
|
#include <c10/util/irange.h>
|
||||||
|
#include <fcntl.h>
|
||||||
|
#include <algorithm>
|
||||||
|
#include <array>
|
||||||
|
#include <system_error>
|
||||||
|
#include <unordered_map>
|
||||||
|
#include <utility>
|
||||||
|
|
||||||
|
#ifdef _WIN32
|
||||||
|
#include <io.h>
|
||||||
|
#include <winsock2.h>
|
||||||
|
#else
|
||||||
|
#include <poll.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef _WIN32
|
||||||
|
#include <torch/csrc/distributed/c10d/WinSockUtils.hpp>
|
||||||
|
#else
|
||||||
|
#include <torch/csrc/distributed/c10d/UnixSockUtils.hpp>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include <torch/csrc/distributed/c10d/socket.h>
|
||||||
|
|
||||||
|
namespace c10d {
|
||||||
|
namespace detail {
|
||||||
|
|
||||||
|
// Background thread parent class methods
|
||||||
|
BackgroundThread::BackgroundThread(Socket&& storeListenSocket)
|
||||||
|
: storeListenSocket_{std::move(storeListenSocket)} {
|
||||||
|
// Signal instance destruction to the daemon thread.
|
||||||
|
initStopSignal();
|
||||||
|
}
|
||||||
|
|
||||||
|
BackgroundThread::~BackgroundThread() = default;
|
||||||
|
|
||||||
|
// WARNING:
|
||||||
|
// Since we rely on the subclass for the daemon thread clean-up, we cannot
|
||||||
|
// destruct our member variables in the destructor. The subclass must call
|
||||||
|
// dispose() in its own destructor.
|
||||||
|
void BackgroundThread::dispose() {
|
||||||
|
// Stop the run
|
||||||
|
stop();
|
||||||
|
// Join the thread
|
||||||
|
join();
|
||||||
|
// Close unclosed sockets
|
||||||
|
sockets_.clear();
|
||||||
|
// Now close the rest control pipe
|
||||||
|
closeStopSignal();
|
||||||
|
}
|
||||||
|
|
||||||
|
void BackgroundThread::join() {
|
||||||
|
daemonThread_.join();
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef _WIN32
|
||||||
|
void BackgroundThread::initStopSignal() {
|
||||||
|
ghStopEvent_ = CreateEvent(NULL, TRUE, FALSE, NULL);
|
||||||
|
if (ghStopEvent_ == NULL) {
|
||||||
|
TORCH_CHECK(
|
||||||
|
false,
|
||||||
|
"Failed to create the control pipe to start the "
|
||||||
|
"BackgroundThread run");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void BackgroundThread::closeStopSignal() {
|
||||||
|
CloseHandle(ghStopEvent_);
|
||||||
|
}
|
||||||
|
|
||||||
|
void BackgroundThread::stop() {
|
||||||
|
SetEvent(ghStopEvent_);
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
void BackgroundThread::initStopSignal() {
|
||||||
|
if (pipe(controlPipeFd_.data()) == -1) {
|
||||||
|
TORCH_CHECK(
|
||||||
|
false,
|
||||||
|
"Failed to create the control pipe to start the "
|
||||||
|
"BackgroundThread run");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void BackgroundThread::closeStopSignal() {
|
||||||
|
for (int fd : controlPipeFd_) {
|
||||||
|
if (fd != -1) {
|
||||||
|
::close(fd);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void BackgroundThread::stop() {
|
||||||
|
if (controlPipeFd_[1] != -1) {
|
||||||
|
::write(controlPipeFd_[1], "\0", 1);
|
||||||
|
// close the write end of the pipe
|
||||||
|
::close(controlPipeFd_[1]);
|
||||||
|
controlPipeFd_[1] = -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
} // namespace detail
|
||||||
|
} // namespace c10d
|
||||||
54
torch/csrc/distributed/c10d/TCPStoreBackend.hpp
Normal file
54
torch/csrc/distributed/c10d/TCPStoreBackend.hpp
Normal file
|
|
@ -0,0 +1,54 @@
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <chrono>
|
||||||
|
#include <thread>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include <torch/csrc/distributed/c10d/socket.h>
|
||||||
|
|
||||||
|
#ifdef _WIN32
|
||||||
|
#include <io.h>
|
||||||
|
#include <winsock2.h>
|
||||||
|
#else
|
||||||
|
#include <poll.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
namespace c10d {
|
||||||
|
namespace detail {
|
||||||
|
|
||||||
|
// Abstract base class to handle thread state for TCPStoreMasterDaemon.
|
||||||
|
// Contains the windows/unix implementations to signal a
|
||||||
|
// shutdown sequence for the thread
|
||||||
|
class BackgroundThread {
|
||||||
|
public:
|
||||||
|
explicit BackgroundThread(Socket&& storeListenSocket);
|
||||||
|
|
||||||
|
virtual ~BackgroundThread() = 0;
|
||||||
|
|
||||||
|
protected:
|
||||||
|
void dispose();
|
||||||
|
|
||||||
|
Socket storeListenSocket_;
|
||||||
|
std::thread daemonThread_{};
|
||||||
|
std::vector<Socket> sockets_{};
|
||||||
|
#ifdef _WIN32
|
||||||
|
const std::chrono::milliseconds checkTimeout_ = std::chrono::milliseconds{10};
|
||||||
|
HANDLE ghStopEvent_{};
|
||||||
|
#else
|
||||||
|
std::array<int, 2> controlPipeFd_{{-1, -1}};
|
||||||
|
#endif
|
||||||
|
|
||||||
|
private:
|
||||||
|
// Initialization for shutdown signal
|
||||||
|
void initStopSignal();
|
||||||
|
// Triggers the shutdown signal
|
||||||
|
void stop();
|
||||||
|
// Joins the thread
|
||||||
|
void join();
|
||||||
|
// Clean up the shutdown signal
|
||||||
|
void closeStopSignal();
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace detail
|
||||||
|
} // namespace c10d
|
||||||
Loading…
Reference in New Issue
Block a user