pytorch/torch/distributed/elastic/utils/distributed.py
Can Balioglu 6e640a0acf Revise the socket implementation of c10d (#68226)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/68226

**Note that this PR is unusually big due to the urgency of the changes. Please reach out to me in case you wish to have a "pair" review.**

This PR introduces a major refactoring of the socket implementation of the C10d library. A big portion of the logic is now contained in the `Socket` class and a follow-up PR will further consolidate the remaining parts. As of today the changes in this PR offer:

 - significantly better error handling and much more verbose logging (see the example output below)
 - explicit support for IPv6 and dual-stack sockets
 - correct handling of signal interrupts
 - better Windows support

A follow-up PR will consolidate `send`/`recv` logic into `Socket` and fully migrate to non-blocking sockets.

## Example Output

```
[I logging.h:21] The client socket will attempt to connect to an IPv6 address on (127.0.0.1, 29501).
[I logging.h:21] The client socket is attempting to connect to [localhost]:29501.
[W logging.h:28] The server socket on [localhost]:29501 is not yet listening (Error: 111 - Connection refused), retrying...
[I logging.h:21] The server socket will attempt to listen on an IPv6 address.
[I logging.h:21] The server socket is attempting to listen on [::]:29501.
[I logging.h:21] The server socket has started to listen on [::]:29501.
[I logging.h:21] The client socket will attempt to connect to an IPv6 address on (127.0.0.1, 29501).
[I logging.h:21] The client socket is attempting to connect to [localhost]:29501.
[I logging.h:21] The client socket has connected to [localhost]:29501 on [localhost]:42650.
[I logging.h:21] The server socket on [::]:29501 has accepted a connection from [localhost]:42650.
[I logging.h:21] The client socket has connected to [localhost]:29501 on [localhost]:42722.
[I logging.h:21] The server socket on [::]:29501 has accepted a connection from [localhost]:42722.
[I logging.h:21] The client socket will attempt to connect to an IPv6 address on (127.0.0.1, 29501).
[I logging.h:21] The client socket is attempting to connect to [localhost]:29501.
[I logging.h:21] The client socket has connected to [localhost]:29501 on [localhost]:42724.
[I logging.h:21] The server socket on [::]:29501 has accepted a connection from [localhost]:42724.
[I logging.h:21] The client socket will attempt to connect to an IPv6 address on (127.0.0.1, 29501).
[I logging.h:21] The client socket is attempting to connect to [localhost]:29501.
[I logging.h:21] The client socket has connected to [localhost]:29501 on [localhost]:42726.
[I logging.h:21] The server socket on [::]:29501 has accepted a connection from [localhost]:42726.
```
ghstack-source-id: 143501987

Test Plan: Run existing unit and integration tests on devserver, Fedora, Ubuntu, macOS Big Sur, Windows 10.

Reviewed By: Babar, wilson100hong, mrshenli

Differential Revision: D32372333

fbshipit-source-id: 2204ffa28ed0d3683a9cb3ebe1ea8d92a831325a
2021-11-16 20:49:25 -08:00

144 lines
4.5 KiB
Python

#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
import datetime
import socket
from contextlib import closing
import torch.distributed as dist
from torch.distributed.elastic.utils.logging import get_logger
# Module-level logger used by the helpers in this file.
log = get_logger()

# Exact RuntimeError messages that create_c10d_store() and _check_full_rank()
# compare against with ``str(e) == ...`` to recognise port conflicts and
# store timeouts.
_ADDRESS_IN_USE = "Address already in use"
_SOCKET_TIMEOUT = "Socket Timeout"

# Store keys used by _check_full_rank(): a counter incremented by every member
# and a marker key set by the member whose increment reaches world_size.
_MEMBER_CHECKIN = "_tcp_store/num_members"
_LAST_MEMBER_CHECKIN = "_tcp_store/last_member"
def create_c10d_store(
    is_server: bool,
    server_addr: str,
    server_port: int = -1,
    world_size: int = 1,
    timeout: float = (60 * 10),  # 10 min
    wait_for_workers: bool = True,
    retries=3,
):
    """
    Create a ``dist.TCPStore`` on ``server_addr`` and return it.

    When ``server_port`` is ``-1`` a free port is picked with
    ``get_free_port()`` on every attempt, and an "Address already in use"
    failure is retried until ``retries`` attempts have been made. When a
    concrete ``server_port`` is given the store is attempted exactly once.

    Args:
        is_server: forwarded to ``TCPStore`` as ``is_master``.
        server_addr: forwarded to ``TCPStore`` as ``host_name``.
        server_port: port to use, or ``-1`` to pick a free one
            (only allowed when ``world_size`` is 1).
        world_size: forwarded to ``TCPStore``; also the member count that
            ``_check_full_rank`` waits for.
        timeout: store timeout in seconds.
        wait_for_workers: forwarded to ``TCPStore``; when true the full rank
            check is performed after the store is created.
        retries: maximum number of attempts when the port is picked
            dynamically; ignored for a static ``server_port``.

    Returns:
        The created store.

    Raises:
        ValueError: if ``server_port`` is ``-1`` while ``world_size > 1``.
        RuntimeError: if the port is still in use once the attempts are
            exhausted; any other ``RuntimeError`` from store creation is
            re-raised unchanged.
    """
    if server_port == -1 and world_size > 1:
        raise ValueError(
            f"server_port must be specified when world_size > 1, got server_port={server_port}, world_size={world_size}"
        )

    if server_port != -1:
        log.info(f"sever_port: {server_port}, specified, ignoring retries")

    # only retry when server_port is NOT static: a static port starts with the
    # attempt counter already exhausted, a dynamic port starts at attempt 1
    attempt = retries if server_port != -1 else 1
    while True:
        if server_port != -1:
            port = server_port
        else:
            # a fresh free port is requested on every attempt
            port = get_free_port()

        log.info(
            f"Creating c10d store on {server_addr}:{port}\n"
            f" world_size : {world_size}\n"
            f" is_server : {is_server}\n"
            f" timeout(sec): {timeout}\n"
        )

        try:
            store = dist.TCPStore(
                host_name=server_addr,
                port=port,
                world_size=world_size,
                is_master=is_server,
                timeout=datetime.timedelta(seconds=timeout),
                wait_for_workers=wait_for_workers,
            )
            # skips full rank check when we don't have to wait for all workers
            if wait_for_workers:
                _check_full_rank(store, world_size)
            log.info("Successfully created c10d store")
            return store
        except RuntimeError as e:
            # this is brittle, but the underlying exception type is not properly pybinded
            # so we parse the error msg for now, interestingly this is how torch itself
            # detects timeouts and port conflicts in their own unittests
            # see - caffe2/torch/testing/_internal/common_utils.py
            # TODO properly map the exceptions in pybind (c10d/init.cpp)
            if str(e) == _ADDRESS_IN_USE:  # this will only happen on the server
                if attempt < retries:
                    log.warning(
                        f"port: {port} already in use, attempt: [{attempt}/{retries}]"
                    )
                    attempt += 1
                else:
                    raise RuntimeError(
                        f"on {server_addr}, port: {port} already in use"
                    ) from e
            else:
                raise
def _check_full_rank(store, world_size):
    """
    Check this member in on ``store`` and wait for the last member's marker.

    Every caller increments the ``_MEMBER_CHECKIN`` counter; the caller whose
    increment reaches ``world_size`` sets ``_LAST_MEMBER_CHECKIN``. All callers
    then read that key. A ``RuntimeError`` whose message equals
    ``_SOCKET_TIMEOUT`` is converted into a ``TimeoutError``; any other
    ``RuntimeError`` is re-raised unchanged.
    """
    members_so_far = store.add(_MEMBER_CHECKIN, 1)
    if members_so_far == world_size:
        # this caller is the last one in: publish the marker key
        store.set(_LAST_MEMBER_CHECKIN, "<val_ignored>")

    try:
        store.get(_LAST_MEMBER_CHECKIN)
    except RuntimeError as e:
        if str(e) != _SOCKET_TIMEOUT:
            raise
        raise TimeoutError(
            f"timed out waiting for all {world_size} members to join"
        ) from e
def get_free_port():
    """
    Return the port number of a temporary socket from
    ``get_socket_with_port()``; the socket is closed before returning.
    """
    with closing(get_socket_with_port()) as bound_socket:
        address = bound_socket.getsockname()
        # the port is the second element of the socket address tuple
        return address[1]
def get_socket_with_port() -> socket.socket:
    """
    Return a listening socket bound to a free port on localhost.

    The port stays "reserved" for as long as the returned socket is open.
    Read the port from the socket, then close the socket before passing the
    port to the entity that requires it. Usage example

    ::

     sock = get_socket_with_port()
     with closing(sock):
         port = sock.getsockname()[1]
     # there is still a race-condition that some other process
     # may grab this port before func() runs
     func(port)

    Raises:
        RuntimeError: if none of the addresses resolved for localhost could
            be turned into a bound, listening socket.
    """
    candidates = socket.getaddrinfo(
        host="localhost", port=None, family=socket.AF_UNSPEC, type=socket.SOCK_STREAM
    )
    for family, sock_type, proto, _, _ in candidates:
        # Creating the socket can itself fail (e.g. the address family is not
        # supported on this host); treat that like any other failed attempt
        # and move on to the next candidate instead of aborting the loop.
        try:
            s = socket.socket(family, sock_type, proto)
        except OSError as e:
            log.info("Socket creation attempt failed.", exc_info=e)
            continue

        try:
            s.bind(("localhost", 0))
            s.listen(0)
            return s
        except OSError as e:
            # do not leak the half-initialised socket before trying the next one
            s.close()
            log.info("Socket creation attempt failed.", exc_info=e)
    raise RuntimeError("Failed to create a socket")