[C10D] Extract some bits of TCPStore into TCPStoreBackend. (#105163)

This moves BackgroundThread to TCPStoreBackend.hpp. This will eventually be the
interface shared between the current TCPStore backend and the new libuv one.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/105163
Approved by: https://github.com/fduwjj, https://github.com/H-Huang
This commit is contained in:
Rodrigo Kumpera 2023-07-24 06:51:51 -07:00 committed by PyTorch MergeBot
parent b65b9e6ff4
commit fe284b0d97
4 changed files with 161 additions and 108 deletions

View File

@ -528,6 +528,7 @@ libtorch_distributed_base_sources = [
"torch/csrc/distributed/c10d/ProcessGroupWrapper.cpp",
"torch/csrc/distributed/c10d/Store.cpp",
"torch/csrc/distributed/c10d/TCPStore.cpp",
"torch/csrc/distributed/c10d/TCPStoreBackend.cpp",
"torch/csrc/distributed/c10d/Utils.cpp",
"torch/csrc/distributed/c10d/comm.cpp",
"torch/csrc/distributed/c10d/debug.cpp",

View File

@ -1,5 +1,6 @@
#include <c10/util/irange.h>
#include <torch/csrc/distributed/c10d/TCPStore.hpp>
#include <torch/csrc/distributed/c10d/TCPStoreBackend.hpp>
#include <torch/csrc/distributed/c10d/logging.h>
#include <fcntl.h>
@ -29,114 +30,6 @@
namespace c10d {
namespace detail {
namespace {
// Abstract base that holds the state of the TCPStoreMasterDaemon background
// thread: the listening socket, the thread object, the sockets it keeps, and
// the platform-specific primitive used to signal shutdown (an event handle on
// Windows, a pipe elsewhere).
class BackgroundThread {
 public:
  // Takes ownership of the listening socket and sets up the shutdown signal.
  explicit BackgroundThread(Socket&& storeListenSocket);

  // Pure virtual, which keeps this class abstract.
  virtual ~BackgroundThread() = 0;

 protected:
  // Signals shutdown, joins the thread, clears sockets_ and releases the
  // shutdown signal. Subclasses are expected to call this from their own
  // destructor.
  void dispose();

  // Data members: declaration order is preserved because it determines
  // construction and destruction order.
  Socket storeListenSocket_;
  std::thread daemonThread_;
  std::vector<Socket> sockets_;
#ifdef _WIN32
  // NOTE(review): presumably the wait/poll interval of the Windows daemon
  // loop; the code that reads it is not in this view.
  const std::chrono::milliseconds checkTimeout_{10};
  // Event handle used as the shutdown signal.
  HANDLE ghStopEvent_{};
#else
  // Shutdown pipe file descriptors; -1 marks a descriptor that is not open.
  std::array<int, 2> controlPipeFd_ = {-1, -1};
#endif

 private:
  // Releases the shutdown signal.
  void closeStopSignal();
  // Creates the shutdown signal.
  void initStopSignal();
  // Waits for the daemon thread to finish.
  void join();
  // Fires the shutdown signal.
  void stop();
};
// Background thread parent class methods

// Takes ownership of the listening socket by moving it into
// storeListenSocket_, then creates the platform-specific shutdown signal.
// The daemon thread itself is not started here.
BackgroundThread::BackgroundThread(Socket&& storeListenSocket)
: storeListenSocket_{std::move(storeListenSocket)} {
// Create the signal that is later used to tell the daemon thread to stop.
initStopSignal();
}
// Out-of-line definition of the destructor that the class declares pure
// virtual. It performs no explicit shutdown work; that is done by dispose().
BackgroundThread::~BackgroundThread() = default;
// WARNING:
// Since we rely on the subclass for the daemon thread clean-up, we cannot
// destruct our member variables in the destructor. The subclass must call
// dispose() in its own destructor.
//
// Shuts the background thread down. The four steps below run in a fixed
// order: signal, join, drop sockets, release the signal.
void BackgroundThread::dispose() {
// Fire the shutdown signal so the daemon thread can leave its run loop.
stop();
// Block until the daemon thread has finished.
join();
// Drop every socket still held in sockets_.
sockets_.clear();
// Finally release whatever is left of the shutdown signal (the remaining
// pipe descriptor(s) or the event handle).
closeStopSignal();
}
// Waits for the daemon thread to finish.
//
// std::thread::join() throws std::system_error when the thread is not
// joinable (it was never started, or it has already been joined). dispose()
// is meant to run from a subclass destructor, where such an exception would
// terminate the process, so the join is skipped in that case.
void BackgroundThread::join() {
  if (daemonThread_.joinable()) {
    daemonThread_.join();
  }
}
#ifdef _WIN32
// Windows: creates the event object used as the shutdown signal and stores
// its handle in ghStopEvent_. The event is created unnamed, manual-reset and
// initially non-signaled. A TORCH_CHECK failure is raised if creation fails.
void BackgroundThread::initStopSignal() {
  ghStopEvent_ = CreateEvent(NULL, TRUE, FALSE, NULL);
  TORCH_CHECK(
      ghStopEvent_ != NULL,
      "Failed to create the control pipe to start the "
      "BackgroundThread run");
}
void BackgroundThread::closeStopSignal() {
CloseHandle(ghStopEvent_);
}
// Windows: fires the shutdown signal by setting ghStopEvent_ to the signaled
// state. The result of SetEvent is not checked.
// NOTE(review): presumably the daemon thread waits on this event; the waiting
// code is not in this view.
void BackgroundThread::stop() {
SetEvent(ghStopEvent_);
}
#else
// POSIX: creates the pipe used as the shutdown signal. pipe() stores the read
// end in controlPipeFd_[0] and the write end in controlPipeFd_[1]. A
// TORCH_CHECK failure is raised if pipe() reports an error (-1).
void BackgroundThread::initStopSignal() {
  const int pipeResult = pipe(controlPipeFd_.data());
  TORCH_CHECK(
      pipeResult != -1,
      "Failed to create the control pipe to start the "
      "BackgroundThread run");
}
// POSIX: closes whichever ends of the control pipe are still open.
//
// Each descriptor is reset to -1 once closed, so a repeated call is a no-op
// rather than a second close() on a descriptor number that the process may
// already have reused for something else.
void BackgroundThread::closeStopSignal() {
  for (int& fd : controlPipeFd_) {
    if (fd != -1) {
      ::close(fd);
      fd = -1;
    }
  }
}
// POSIX: fires the shutdown signal by writing a single NUL byte to the write
// end of the control pipe, then closes that end and marks it as closed (-1).
// Does nothing if the write end is already closed. The result of write() is
// not checked.
void BackgroundThread::stop() {
  int& writeEnd = controlPipeFd_[1];
  if (writeEnd == -1) {
    return;
  }
  ::write(writeEnd, "\0", 1);
  // The write end is no longer needed once the byte has been sent.
  ::close(writeEnd);
  writeEnd = -1;
}
#endif
enum class QueryType : uint8_t {
SET,
COMPARE_SET,

View File

@ -0,0 +1,105 @@
#include <torch/csrc/distributed/c10d/TCPStoreBackend.hpp>
#include <c10/util/irange.h>
#include <fcntl.h>
#include <algorithm>
#include <array>
#include <system_error>
#include <unordered_map>
#include <utility>
#ifdef _WIN32
#include <io.h>
#include <winsock2.h>
#else
#include <poll.h>
#include <unistd.h>
#endif
#ifdef _WIN32
#include <torch/csrc/distributed/c10d/WinSockUtils.hpp>
#else
#include <torch/csrc/distributed/c10d/UnixSockUtils.hpp>
#endif
#include <torch/csrc/distributed/c10d/socket.h>
namespace c10d {
namespace detail {
// Background thread parent class methods

// Takes ownership of the listening socket by moving it into
// storeListenSocket_, then creates the platform-specific shutdown signal.
// The daemon thread itself is not started here.
BackgroundThread::BackgroundThread(Socket&& storeListenSocket)
: storeListenSocket_{std::move(storeListenSocket)} {
// Create the signal that is later used to tell the daemon thread to stop.
initStopSignal();
}
// Out-of-line definition of the destructor that the class declares pure
// virtual. It performs no explicit shutdown work; that is done by dispose().
BackgroundThread::~BackgroundThread() = default;
// WARNING:
// Since we rely on the subclass for the daemon thread clean-up, we cannot
// destruct our member variables in the destructor. The subclass must call
// dispose() in its own destructor.
//
// Shuts the background thread down. The four steps below run in a fixed
// order: signal, join, drop sockets, release the signal.
void BackgroundThread::dispose() {
// Fire the shutdown signal so the daemon thread can leave its run loop.
stop();
// Block until the daemon thread has finished.
join();
// Drop every socket still held in sockets_.
sockets_.clear();
// Finally release whatever is left of the shutdown signal (the remaining
// pipe descriptor(s) or the event handle).
closeStopSignal();
}
// Waits for the daemon thread to finish.
//
// std::thread::join() throws std::system_error when the thread is not
// joinable (it was never started, or it has already been joined). dispose()
// is meant to run from a subclass destructor, where such an exception would
// terminate the process, so the join is skipped in that case.
void BackgroundThread::join() {
  if (daemonThread_.joinable()) {
    daemonThread_.join();
  }
}
#ifdef _WIN32
// Windows: creates the event object used as the shutdown signal and stores
// its handle in ghStopEvent_. The event is created unnamed, manual-reset and
// initially non-signaled. A TORCH_CHECK failure is raised if creation fails.
void BackgroundThread::initStopSignal() {
  ghStopEvent_ = CreateEvent(NULL, TRUE, FALSE, NULL);
  TORCH_CHECK(
      ghStopEvent_ != NULL,
      "Failed to create the control pipe to start the "
      "BackgroundThread run");
}
void BackgroundThread::closeStopSignal() {
CloseHandle(ghStopEvent_);
}
// Windows: fires the shutdown signal by setting ghStopEvent_ to the signaled
// state. The result of SetEvent is not checked.
// NOTE(review): presumably the daemon thread waits on this event; the waiting
// code is not in this view.
void BackgroundThread::stop() {
SetEvent(ghStopEvent_);
}
#else
// POSIX: creates the pipe used as the shutdown signal. pipe() stores the read
// end in controlPipeFd_[0] and the write end in controlPipeFd_[1]. A
// TORCH_CHECK failure is raised if pipe() reports an error (-1).
void BackgroundThread::initStopSignal() {
  const int pipeResult = pipe(controlPipeFd_.data());
  TORCH_CHECK(
      pipeResult != -1,
      "Failed to create the control pipe to start the "
      "BackgroundThread run");
}
// POSIX: closes whichever ends of the control pipe are still open.
//
// Each descriptor is reset to -1 once closed, so a repeated call is a no-op
// rather than a second close() on a descriptor number that the process may
// already have reused for something else.
void BackgroundThread::closeStopSignal() {
  for (int& fd : controlPipeFd_) {
    if (fd != -1) {
      ::close(fd);
      fd = -1;
    }
  }
}
// POSIX: fires the shutdown signal by writing a single NUL byte to the write
// end of the control pipe, then closes that end and marks it as closed (-1).
// Does nothing if the write end is already closed. The result of write() is
// not checked.
void BackgroundThread::stop() {
  int& writeEnd = controlPipeFd_[1];
  if (writeEnd == -1) {
    return;
  }
  ::write(writeEnd, "\0", 1);
  // The write end is no longer needed once the byte has been sent.
  ::close(writeEnd);
  writeEnd = -1;
}
#endif
} // namespace detail
} // namespace c10d

View File

@ -0,0 +1,54 @@
#pragma once
#include <array>
#include <chrono>
#include <thread>
#include <vector>

#include <torch/csrc/distributed/c10d/socket.h>

#ifdef _WIN32
#include <io.h>
#include <winsock2.h>
#else
#include <poll.h>
#include <unistd.h>
#endif
namespace c10d {
namespace detail {
// Abstract base that holds the state of the TCPStoreMasterDaemon background
// thread: the listening socket, the thread object, the sockets it keeps, and
// the platform-specific primitive used to signal shutdown (an event handle on
// Windows, a pipe elsewhere).
class BackgroundThread {
 public:
  // Takes ownership of the listening socket and sets up the shutdown signal.
  explicit BackgroundThread(Socket&& storeListenSocket);

  // Pure virtual, which keeps this class abstract.
  virtual ~BackgroundThread() = 0;

 protected:
  // Signals shutdown, joins the thread, clears sockets_ and releases the
  // shutdown signal. Subclasses are expected to call this from their own
  // destructor.
  void dispose();

  // Data members: declaration order is preserved because it determines
  // construction and destruction order.
  Socket storeListenSocket_;
  std::thread daemonThread_;
  std::vector<Socket> sockets_;
#ifdef _WIN32
  // NOTE(review): presumably the wait/poll interval of the Windows daemon
  // loop; the code that reads it is not in this view.
  const std::chrono::milliseconds checkTimeout_{10};
  // Event handle used as the shutdown signal.
  HANDLE ghStopEvent_{};
#else
  // Shutdown pipe file descriptors; -1 marks a descriptor that is not open.
  std::array<int, 2> controlPipeFd_ = {-1, -1};
#endif

 private:
  // Releases the shutdown signal.
  void closeStopSignal();
  // Creates the shutdown signal.
  void initStopSignal();
  // Waits for the daemon thread to finish.
  void join();
  // Fires the shutdown signal.
  void stop();
};
} // namespace detail
} // namespace c10d