pytorch/test/distributed/elastic/utils/distributed_test.py
Can Balioglu 6e640a0acf Revise the socket implementation of c10d (#68226)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/68226

**Note that this PR is unusually big due to the urgency of the changes. Please reach out to me in case you wish to have a "pair" review.**

This PR introduces a major refactoring of the socket implementation of the C10d library. A big portion of the logic is now contained in the `Socket` class and a follow-up PR will further consolidate the remaining parts. As of today the changes in this PR offer:

 - significantly better error handling and much more verbose logging (see the example output below)
 - explicit support for IPv6 and dual-stack sockets
 - correct handling of signal interrupts
 - better Windows support

A follow-up PR will consolidate `send`/`recv` logic into `Socket` and fully migrate to non-blocking sockets.

## Example Output

```
[I logging.h:21] The client socket will attempt to connect to an IPv6 address on (127.0.0.1, 29501).
[I logging.h:21] The client socket is attempting to connect to [localhost]:29501.
[W logging.h:28] The server socket on [localhost]:29501 is not yet listening (Error: 111 - Connection refused), retrying...
[I logging.h:21] The server socket will attempt to listen on an IPv6 address.
[I logging.h:21] The server socket is attempting to listen on [::]:29501.
[I logging.h:21] The server socket has started to listen on [::]:29501.
[I logging.h:21] The client socket will attempt to connect to an IPv6 address on (127.0.0.1, 29501).
[I logging.h:21] The client socket is attempting to connect to [localhost]:29501.
[I logging.h:21] The client socket has connected to [localhost]:29501 on [localhost]:42650.
[I logging.h:21] The server socket on [::]:29501 has accepted a connection from [localhost]:42650.
[I logging.h:21] The client socket has connected to [localhost]:29501 on [localhost]:42722.
[I logging.h:21] The server socket on [::]:29501 has accepted a connection from [localhost]:42722.
[I logging.h:21] The client socket will attempt to connect to an IPv6 address on (127.0.0.1, 29501).
[I logging.h:21] The client socket is attempting to connect to [localhost]:29501.
[I logging.h:21] The client socket has connected to [localhost]:29501 on [localhost]:42724.
[I logging.h:21] The server socket on [::]:29501 has accepted a connection from [localhost]:42724.
[I logging.h:21] The client socket will attempt to connect to an IPv6 address on (127.0.0.1, 29501).
[I logging.h:21] The client socket is attempting to connect to [localhost]:29501.
[I logging.h:21] The client socket has connected to [localhost]:29501 on [localhost]:42726.
[I logging.h:21] The server socket on [::]:29501 has accepted a connection from [localhost]:42726.
```
ghstack-source-id: 143501987

Test Plan: Run existing unit and integration tests on devserver, Fedora, Ubuntu, macOS Big Sur, Windows 10.

Reviewed By: Babar, wilson100hong, mrshenli

Differential Revision: D32372333

fbshipit-source-id: 2204ffa28ed0d3683a9cb3ebe1ea8d92a831325a
2021-11-16 20:49:25 -08:00

156 lines
4.9 KiB
Python

#!/usr/bin/env python3
# Owner(s): ["oncall: r2p"]
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
import multiprocessing as mp
import os
import socket
import sys
import unittest
from contextlib import closing
from torch.distributed.elastic.utils.distributed import (
create_c10d_store,
get_socket_with_port,
)
from torch.testing._internal.common_utils import (
IS_MACOS,
IS_WINDOWS,
run_tests,
TEST_WITH_TSAN,
TestCase
)
def _create_c10d_store_mp(is_server, server_addr, port, world_size, wait_for_workers):
store = create_c10d_store(is_server, server_addr, port, world_size, wait_for_workers=wait_for_workers, timeout=2)
if store is None:
raise AssertionError()
store.set(f"test_key/{os.getpid()}", "test_value".encode("UTF-8"))
if IS_WINDOWS or IS_MACOS:
print("tests incompatible with tsan or asan", file=sys.stderr)
sys.exit(0)
class DistributedUtilTest(TestCase):
def test_create_store_single_server(self):
store = create_c10d_store(is_server=True, server_addr=socket.gethostname())
self.assertIsNotNone(store)
def test_create_store_no_port_multi(self):
with self.assertRaises(ValueError):
create_c10d_store(
is_server=True, server_addr=socket.gethostname(), world_size=2
)
@unittest.skipIf(TEST_WITH_TSAN, "test incompatible with tsan")
def test_create_store_multi(self):
world_size = 3
wait_for_workers = False
localhost = socket.gethostname()
# start the server on the main process using an available port
store = create_c10d_store(
is_server=True,
server_addr=localhost,
server_port=0,
timeout=2,
world_size=world_size,
wait_for_workers=wait_for_workers,
)
# worker processes will use the port that was assigned to the server
server_port = store.port
worker0 = mp.Process(
target=_create_c10d_store_mp,
args=(False, localhost, server_port, world_size, wait_for_workers),
)
worker1 = mp.Process(
target=_create_c10d_store_mp,
args=(False, localhost, server_port, world_size, wait_for_workers),
)
worker0.start()
worker1.start()
worker0.join()
worker1.join()
# check test_key/pid == "test_value"
self.assertEqual(
"test_value", store.get(f"test_key/{worker0.pid}").decode("UTF-8")
)
self.assertEqual(
"test_value", store.get(f"test_key/{worker1.pid}").decode("UTF-8")
)
self.assertEqual(0, worker0.exitcode)
self.assertEqual(0, worker1.exitcode)
def test_create_store_timeout_on_server(self):
with self.assertRaises(TimeoutError):
# use any available port (port 0) since timeout is expected
create_c10d_store(
is_server=True,
server_addr=socket.gethostname(),
server_port=0,
world_size=2,
timeout=1,
)
def test_create_store_timeout_on_worker(self):
with self.assertRaises(TimeoutError):
# use any available port (port 0) since timeout is expected
create_c10d_store(
is_server=False,
server_addr=socket.gethostname(),
server_port=0,
world_size=2,
timeout=1,
)
def test_port_already_in_use_on_server(self):
# try to create the TCPStore server twice on the same port
# the second should fail due to a port conflict
# first store binds onto a free port
# try creating the second store on the port that the first store binded to
server_addr = socket.gethostname()
pick_free_port = 0
store1 = create_c10d_store(
is_server=True,
server_addr=server_addr,
server_port=pick_free_port,
timeout=1,
)
with self.assertRaises(RuntimeError):
create_c10d_store(
is_server=True, server_addr=server_addr, server_port=store1.port
)
def test_port_already_in_use_on_worker(self):
sock = get_socket_with_port()
with closing(sock):
port = sock.getsockname()[1]
# on the worker port conflict shouldn't matter, it should just timeout
# since we never created a server
with self.assertRaises(TimeoutError):
create_c10d_store(
is_server=False,
server_addr=socket.gethostname(),
server_port=port,
timeout=1,
)
if __name__ == "__main__":
run_tests()