pytorch/test/cpp/c10d/HashStoreTest.cpp
Pritam Damania 704b0b3c67 [RESUBMIT] Standardize on error types for distributed errors. (#108191)
We have a plethora of error types for various errors raised from c10d. These include `RuntimeError`, `TimeoutError`, `SocketError`, `DistBackendError` etc.

This results in messy code during error handling somewhat like this:
```
if "NCCL" in exception_str:
  ...
if "Timed out initializing process group in store based barrier on rank" in exception_str:
  ...
if "The client socket has timed out after" in exception_str:
  ...
if "Broken pipe" in exception_str:
  ...
if "Connection reset by peer" in exception_str:
  ...
```

To address this issue, this PR standardizes on the following error types:

1. **DistError** - the base type of all distributed errors
2. **DistBackendError** - this already existed and referred to PG backend errors
3. **DistStoreError** - for errors originating from the store
4. **DistNetworkError** - for general network errors coming from the socket library

Pull Request resolved: https://github.com/pytorch/pytorch/pull/108191
Approved by: https://github.com/H-Huang
2023-08-30 21:47:39 +00:00

100 lines
2.8 KiB
C++

#include <c10/util/irange.h>
#include "StoreTestCommon.hpp"
#include <unistd.h>
#include <iostream>
#include <thread>
#include <torch/csrc/distributed/c10d/HashStore.hpp>
#include <torch/csrc/distributed/c10d/PrefixStore.hpp>
// Short store timeout, in milliseconds. testGetSet() applies it before
// asserting that get() on a deleted key throws.
constexpr int64_t kShortStoreTimeoutMillis{100};
void testGetSet(std::string prefix = "") {
// Basic set/get
{
auto hashStore = c10::make_intrusive<c10d::HashStore>();
c10d::PrefixStore store(prefix, hashStore);
c10d::test::set(store, "key0", "value0");
c10d::test::set(store, "key1", "value1");
c10d::test::set(store, "key2", "value2");
c10d::test::check(store, "key0", "value0");
c10d::test::check(store, "key1", "value1");
c10d::test::check(store, "key2", "value2");
// Check compareSet, does not check return value
c10d::test::compareSet(store, "key0", "wrongExpectedValue", "newValue");
c10d::test::check(store, "key0", "value0");
c10d::test::compareSet(store, "key0", "value0", "newValue");
c10d::test::check(store, "key0", "newValue");
auto numKeys = store.getNumKeys();
EXPECT_EQ(numKeys, 3);
auto delSuccess = store.deleteKey("key0");
EXPECT_TRUE(delSuccess);
numKeys = store.getNumKeys();
EXPECT_EQ(numKeys, 2);
auto delFailure = store.deleteKey("badKeyName");
EXPECT_FALSE(delFailure);
auto timeout = std::chrono::milliseconds(kShortStoreTimeoutMillis);
store.setTimeout(timeout);
EXPECT_THROW(store.get("key0"), c10::DistStoreError);
}
// get() waits up to timeout_.
{
auto hashStore = c10::make_intrusive<c10d::HashStore>();
c10d::PrefixStore store(prefix, hashStore);
std::thread th([&]() { c10d::test::set(store, "key0", "value0"); });
c10d::test::check(store, "key0", "value0");
th.join();
}
}
void stressTestStore(std::string prefix = "") {
// Hammer on HashStore::add
const auto numThreads = 4;
const auto numIterations = 100;
std::vector<std::thread> threads;
c10d::test::Semaphore sem1, sem2;
auto hashStore = c10::make_intrusive<c10d::HashStore>();
c10d::PrefixStore store(prefix, hashStore);
for (C10_UNUSED const auto i : c10::irange(numThreads)) {
threads.emplace_back(std::thread([&] {
sem1.post();
sem2.wait();
for (C10_UNUSED const auto j : c10::irange(numIterations)) {
store.add("counter", 1);
}
}));
}
sem1.wait(numThreads);
sem2.post(numThreads);
for (auto& thread : threads) {
thread.join();
}
std::string expected = std::to_string(numThreads * numIterations);
c10d::test::check(store, "counter", expected);
}
// Runs the get/set scenarios of testGetSet() with its default (empty) prefix.
TEST(HashStoreTest, testGetAndSet) {
testGetSet();
}
// Runs the get/set scenarios of testGetSet() with the PrefixStore prefix
// "testPrefix".
TEST(HashStoreTest, testGetAndSetWithPrefix) {
testGetSet("testPrefix");
}
// Runs the multi-threaded add() stress test, stressTestStore(), with its
// default (empty) prefix.
TEST(HashStoreTest, testStressStore) {
stressTestStore();
}
// Runs the multi-threaded add() stress test, stressTestStore(), with the
// PrefixStore prefix "testPrefix".
TEST(HashStoreTest, testStressStoreWithPrefix) {
stressTestStore("testPrefix");
}