mirror of
https://github.com/zebrajr/pytorch.git
synced 2025-12-06 12:20:52 +01:00
[C10D] Extract some bits of TCPStore into TCPStoreBackend. (#105163)
This moves BackgroundThread to TCPStoreBackend.hpp. This will eventually be the interface shared between the current TCPStore backend and the new libuv one. Pull Request resolved: https://github.com/pytorch/pytorch/pull/105163 Approved by: https://github.com/fduwjj, https://github.com/H-Huang
This commit is contained in:
parent
b65b9e6ff4
commit
fe284b0d97
|
|
@ -528,6 +528,7 @@ libtorch_distributed_base_sources = [
|
|||
"torch/csrc/distributed/c10d/ProcessGroupWrapper.cpp",
|
||||
"torch/csrc/distributed/c10d/Store.cpp",
|
||||
"torch/csrc/distributed/c10d/TCPStore.cpp",
|
||||
"torch/csrc/distributed/c10d/TCPStoreBackend.cpp",
|
||||
"torch/csrc/distributed/c10d/Utils.cpp",
|
||||
"torch/csrc/distributed/c10d/comm.cpp",
|
||||
"torch/csrc/distributed/c10d/debug.cpp",
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
#include <c10/util/irange.h>
|
||||
#include <torch/csrc/distributed/c10d/TCPStore.hpp>
|
||||
#include <torch/csrc/distributed/c10d/TCPStoreBackend.hpp>
|
||||
#include <torch/csrc/distributed/c10d/logging.h>
|
||||
|
||||
#include <fcntl.h>
|
||||
|
|
@ -29,114 +30,6 @@
|
|||
namespace c10d {
|
||||
namespace detail {
|
||||
namespace {
|
||||
|
||||
// Abstract base class to handle thread state for TCPStoreMasterDaemon.
|
||||
// Contains the windows/unix implementations to signal a
|
||||
// shutdown sequence for the thread
|
||||
class BackgroundThread {
|
||||
public:
|
||||
explicit BackgroundThread(Socket&& storeListenSocket);
|
||||
|
||||
virtual ~BackgroundThread() = 0;
|
||||
|
||||
protected:
|
||||
void dispose();
|
||||
|
||||
Socket storeListenSocket_;
|
||||
std::thread daemonThread_{};
|
||||
std::vector<Socket> sockets_{};
|
||||
#ifdef _WIN32
|
||||
const std::chrono::milliseconds checkTimeout_ = std::chrono::milliseconds{10};
|
||||
HANDLE ghStopEvent_{};
|
||||
#else
|
||||
std::array<int, 2> controlPipeFd_{{-1, -1}};
|
||||
#endif
|
||||
|
||||
private:
|
||||
// Initialization for shutdown signal
|
||||
void initStopSignal();
|
||||
// Triggers the shutdown signal
|
||||
void stop();
|
||||
// Joins the thread
|
||||
void join();
|
||||
// Clean up the shutdown signal
|
||||
void closeStopSignal();
|
||||
};
|
||||
|
||||
// Background thread parent class methods
|
||||
BackgroundThread::BackgroundThread(Socket&& storeListenSocket)
|
||||
: storeListenSocket_{std::move(storeListenSocket)} {
|
||||
// Signal instance destruction to the daemon thread.
|
||||
initStopSignal();
|
||||
}
|
||||
|
||||
BackgroundThread::~BackgroundThread() = default;
|
||||
|
||||
// WARNING:
|
||||
// Since we rely on the subclass for the daemon thread clean-up, we cannot
|
||||
// destruct our member variables in the destructor. The subclass must call
|
||||
// dispose() in its own destructor.
|
||||
void BackgroundThread::dispose() {
|
||||
// Stop the run
|
||||
stop();
|
||||
// Join the thread
|
||||
join();
|
||||
// Close unclosed sockets
|
||||
sockets_.clear();
|
||||
// Now close the rest control pipe
|
||||
closeStopSignal();
|
||||
}
|
||||
|
||||
void BackgroundThread::join() {
|
||||
daemonThread_.join();
|
||||
}
|
||||
|
||||
#ifdef _WIN32
|
||||
void BackgroundThread::initStopSignal() {
|
||||
ghStopEvent_ = CreateEvent(NULL, TRUE, FALSE, NULL);
|
||||
if (ghStopEvent_ == NULL) {
|
||||
TORCH_CHECK(
|
||||
false,
|
||||
"Failed to create the control pipe to start the "
|
||||
"BackgroundThread run");
|
||||
}
|
||||
}
|
||||
|
||||
void BackgroundThread::closeStopSignal() {
|
||||
CloseHandle(ghStopEvent_);
|
||||
}
|
||||
|
||||
void BackgroundThread::stop() {
|
||||
SetEvent(ghStopEvent_);
|
||||
}
|
||||
#else
|
||||
void BackgroundThread::initStopSignal() {
|
||||
if (pipe(controlPipeFd_.data()) == -1) {
|
||||
TORCH_CHECK(
|
||||
false,
|
||||
"Failed to create the control pipe to start the "
|
||||
"BackgroundThread run");
|
||||
}
|
||||
}
|
||||
|
||||
void BackgroundThread::closeStopSignal() {
|
||||
for (int fd : controlPipeFd_) {
|
||||
if (fd != -1) {
|
||||
::close(fd);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void BackgroundThread::stop() {
|
||||
if (controlPipeFd_[1] != -1) {
|
||||
::write(controlPipeFd_[1], "\0", 1);
|
||||
// close the write end of the pipe
|
||||
::close(controlPipeFd_[1]);
|
||||
controlPipeFd_[1] = -1;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
enum class QueryType : uint8_t {
|
||||
SET,
|
||||
COMPARE_SET,
|
||||
|
|
|
|||
105
torch/csrc/distributed/c10d/TCPStoreBackend.cpp
Normal file
105
torch/csrc/distributed/c10d/TCPStoreBackend.cpp
Normal file
|
|
@ -0,0 +1,105 @@
|
|||
#include <torch/csrc/distributed/c10d/TCPStoreBackend.hpp>
|
||||
|
||||
#include <c10/util/irange.h>
|
||||
#include <fcntl.h>
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
#include <system_error>
|
||||
#include <unordered_map>
|
||||
#include <utility>
|
||||
|
||||
#ifdef _WIN32
|
||||
#include <io.h>
|
||||
#include <winsock2.h>
|
||||
#else
|
||||
#include <poll.h>
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
#ifdef _WIN32
|
||||
#include <torch/csrc/distributed/c10d/WinSockUtils.hpp>
|
||||
#else
|
||||
#include <torch/csrc/distributed/c10d/UnixSockUtils.hpp>
|
||||
#endif
|
||||
|
||||
#include <torch/csrc/distributed/c10d/socket.h>
|
||||
|
||||
namespace c10d {
|
||||
namespace detail {
|
||||
|
||||
// Background thread parent class methods
|
||||
BackgroundThread::BackgroundThread(Socket&& storeListenSocket)
|
||||
: storeListenSocket_{std::move(storeListenSocket)} {
|
||||
// Signal instance destruction to the daemon thread.
|
||||
initStopSignal();
|
||||
}
|
||||
|
||||
BackgroundThread::~BackgroundThread() = default;
|
||||
|
||||
// WARNING:
|
||||
// Since we rely on the subclass for the daemon thread clean-up, we cannot
|
||||
// destruct our member variables in the destructor. The subclass must call
|
||||
// dispose() in its own destructor.
|
||||
void BackgroundThread::dispose() {
|
||||
// Stop the run
|
||||
stop();
|
||||
// Join the thread
|
||||
join();
|
||||
// Close unclosed sockets
|
||||
sockets_.clear();
|
||||
// Now close the rest control pipe
|
||||
closeStopSignal();
|
||||
}
|
||||
|
||||
void BackgroundThread::join() {
|
||||
daemonThread_.join();
|
||||
}
|
||||
|
||||
#ifdef _WIN32
|
||||
void BackgroundThread::initStopSignal() {
|
||||
ghStopEvent_ = CreateEvent(NULL, TRUE, FALSE, NULL);
|
||||
if (ghStopEvent_ == NULL) {
|
||||
TORCH_CHECK(
|
||||
false,
|
||||
"Failed to create the control pipe to start the "
|
||||
"BackgroundThread run");
|
||||
}
|
||||
}
|
||||
|
||||
void BackgroundThread::closeStopSignal() {
|
||||
CloseHandle(ghStopEvent_);
|
||||
}
|
||||
|
||||
void BackgroundThread::stop() {
|
||||
SetEvent(ghStopEvent_);
|
||||
}
|
||||
#else
|
||||
void BackgroundThread::initStopSignal() {
|
||||
if (pipe(controlPipeFd_.data()) == -1) {
|
||||
TORCH_CHECK(
|
||||
false,
|
||||
"Failed to create the control pipe to start the "
|
||||
"BackgroundThread run");
|
||||
}
|
||||
}
|
||||
|
||||
void BackgroundThread::closeStopSignal() {
|
||||
for (int fd : controlPipeFd_) {
|
||||
if (fd != -1) {
|
||||
::close(fd);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void BackgroundThread::stop() {
|
||||
if (controlPipeFd_[1] != -1) {
|
||||
::write(controlPipeFd_[1], "\0", 1);
|
||||
// close the write end of the pipe
|
||||
::close(controlPipeFd_[1]);
|
||||
controlPipeFd_[1] = -1;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
} // namespace detail
|
||||
} // namespace c10d
|
||||
54
torch/csrc/distributed/c10d/TCPStoreBackend.hpp
Normal file
54
torch/csrc/distributed/c10d/TCPStoreBackend.hpp
Normal file
|
|
@ -0,0 +1,54 @@
|
|||
#pragma once
|
||||
|
||||
#include <chrono>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
|
||||
#include <torch/csrc/distributed/c10d/socket.h>
|
||||
|
||||
#ifdef _WIN32
|
||||
#include <io.h>
|
||||
#include <winsock2.h>
|
||||
#else
|
||||
#include <poll.h>
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
namespace c10d {
|
||||
namespace detail {
|
||||
|
||||
// Abstract base class to handle thread state for TCPStoreMasterDaemon.
|
||||
// Contains the windows/unix implementations to signal a
|
||||
// shutdown sequence for the thread
|
||||
class BackgroundThread {
|
||||
public:
|
||||
explicit BackgroundThread(Socket&& storeListenSocket);
|
||||
|
||||
virtual ~BackgroundThread() = 0;
|
||||
|
||||
protected:
|
||||
void dispose();
|
||||
|
||||
Socket storeListenSocket_;
|
||||
std::thread daemonThread_{};
|
||||
std::vector<Socket> sockets_{};
|
||||
#ifdef _WIN32
|
||||
const std::chrono::milliseconds checkTimeout_ = std::chrono::milliseconds{10};
|
||||
HANDLE ghStopEvent_{};
|
||||
#else
|
||||
std::array<int, 2> controlPipeFd_{{-1, -1}};
|
||||
#endif
|
||||
|
||||
private:
|
||||
// Initialization for shutdown signal
|
||||
void initStopSignal();
|
||||
// Triggers the shutdown signal
|
||||
void stop();
|
||||
// Joins the thread
|
||||
void join();
|
||||
// Clean up the shutdown signal
|
||||
void closeStopSignal();
|
||||
};
|
||||
|
||||
} // namespace detail
|
||||
} // namespace c10d
|
||||
Loading…
Reference in New Issue
Block a user