mirror of
https://github.com/zebrajr/pytorch.git
synced 2025-12-06 12:20:52 +01:00
We have a plethora of error types for various errors raised from c10d. These include `RuntimeError`, `TimeoutError`, `SocketError`, `DistBackendError` etc. This results in messy code during error handling somewhat like this: ``` if "NCCL" in exception_str: ... if "Timed out initializing process group in store based barrier on rank" in exception_str: ... if "The client socket has timed out after" in exception_str: ... if "Broken pipe" in exception_str: ... if "Connection reset by peer" in exception_str: ... ``` To address this issue, in this PR I've added these error types: 1. **DistError** - the base type of all distributed errors 2. **DistBackendError** - this already existed and referred to PG backend errors 3. **DistStoreError** - for errors originating from the store 4. **DistNetworkError** - for general network errors coming from the socket library Pull Request resolved: https://github.com/pytorch/pytorch/pull/108191 Approved by: https://github.com/H-Huang
100 lines
2.8 KiB
C++
100 lines
2.8 KiB
C++
#include <c10/util/irange.h>
|
|
#include "StoreTestCommon.hpp"
|
|
|
|
#include <unistd.h>
|
|
|
|
#include <iostream>
|
|
#include <thread>
|
|
|
|
#include <torch/csrc/distributed/c10d/HashStore.hpp>
|
|
#include <torch/csrc/distributed/c10d/PrefixStore.hpp>
|
|
|
|
// Timeout, in milliseconds, that testGetSet() installs on the store right
// before calling get() on a deleted key, where the call is expected to throw.
constexpr int64_t kShortStoreTimeoutMillis = 100;
|
|
|
|
void testGetSet(std::string prefix = "") {
|
|
// Basic set/get
|
|
{
|
|
auto hashStore = c10::make_intrusive<c10d::HashStore>();
|
|
c10d::PrefixStore store(prefix, hashStore);
|
|
c10d::test::set(store, "key0", "value0");
|
|
c10d::test::set(store, "key1", "value1");
|
|
c10d::test::set(store, "key2", "value2");
|
|
c10d::test::check(store, "key0", "value0");
|
|
c10d::test::check(store, "key1", "value1");
|
|
c10d::test::check(store, "key2", "value2");
|
|
|
|
// Check compareSet, does not check return value
|
|
c10d::test::compareSet(store, "key0", "wrongExpectedValue", "newValue");
|
|
c10d::test::check(store, "key0", "value0");
|
|
c10d::test::compareSet(store, "key0", "value0", "newValue");
|
|
c10d::test::check(store, "key0", "newValue");
|
|
|
|
auto numKeys = store.getNumKeys();
|
|
EXPECT_EQ(numKeys, 3);
|
|
auto delSuccess = store.deleteKey("key0");
|
|
EXPECT_TRUE(delSuccess);
|
|
numKeys = store.getNumKeys();
|
|
EXPECT_EQ(numKeys, 2);
|
|
auto delFailure = store.deleteKey("badKeyName");
|
|
EXPECT_FALSE(delFailure);
|
|
auto timeout = std::chrono::milliseconds(kShortStoreTimeoutMillis);
|
|
store.setTimeout(timeout);
|
|
EXPECT_THROW(store.get("key0"), c10::DistStoreError);
|
|
}
|
|
|
|
// get() waits up to timeout_.
|
|
{
|
|
auto hashStore = c10::make_intrusive<c10d::HashStore>();
|
|
c10d::PrefixStore store(prefix, hashStore);
|
|
std::thread th([&]() { c10d::test::set(store, "key0", "value0"); });
|
|
c10d::test::check(store, "key0", "value0");
|
|
th.join();
|
|
}
|
|
}
|
|
|
|
void stressTestStore(std::string prefix = "") {
|
|
// Hammer on HashStore::add
|
|
const auto numThreads = 4;
|
|
const auto numIterations = 100;
|
|
|
|
std::vector<std::thread> threads;
|
|
c10d::test::Semaphore sem1, sem2;
|
|
auto hashStore = c10::make_intrusive<c10d::HashStore>();
|
|
c10d::PrefixStore store(prefix, hashStore);
|
|
|
|
for (C10_UNUSED const auto i : c10::irange(numThreads)) {
|
|
threads.emplace_back(std::thread([&] {
|
|
sem1.post();
|
|
sem2.wait();
|
|
for (C10_UNUSED const auto j : c10::irange(numIterations)) {
|
|
store.add("counter", 1);
|
|
}
|
|
}));
|
|
}
|
|
|
|
sem1.wait(numThreads);
|
|
sem2.post(numThreads);
|
|
|
|
for (auto& thread : threads) {
|
|
thread.join();
|
|
}
|
|
std::string expected = std::to_string(numThreads * numIterations);
|
|
c10d::test::check(store, "counter", expected);
|
|
}
|
|
|
|
// Runs the get/set scenarios with an empty key prefix.
TEST(HashStoreTest, testGetAndSet) {
  testGetSet(/*prefix=*/"");
}
|
|
|
|
// Runs the get/set scenarios with the key prefix "testPrefix".
TEST(HashStoreTest, testGetAndSetWithPrefix) {
  testGetSet(/*prefix=*/std::string("testPrefix"));
}
|
|
|
|
// Runs the concurrent add() stress scenario with an empty key prefix.
TEST(HashStoreTest, testStressStore) {
  stressTestStore(/*prefix=*/"");
}
|
|
|
|
// Runs the concurrent add() stress scenario with the key prefix "testPrefix".
TEST(HashStoreTest, testStressStoreWithPrefix) {
  stressTestStore(/*prefix=*/std::string("testPrefix"));
}
|