[TorchElastic] Option to enable TCPStore libuv backed (#124684)

Summary:
Libuv backed isn't enabled in PTD by default now. Add an option to enable libuv backed to improve scaling of the rendezvous process.
Tries not to make assumption on the default libuv settings in TCPStore since it may change in the next release.

Test Plan: CI

Differential Revision: D56435815

Pull Request resolved: https://github.com/pytorch/pytorch/pull/124684
Approved by: https://github.com/d4l3k, https://github.com/XilunWu
This commit is contained in:
Kurman Karabukaev 2024-04-23 23:12:17 +00:00 committed by PyTorch MergeBot
parent 3999b72d46
commit 1c4ad87396
2 changed files with 37 additions and 1 deletions

View File

@ -126,6 +126,33 @@ class DistributedUtilTest(TestCase):
timeout=1,
)
def test_create_store_with_libuv_support(self):
world_size = 1
wait_for_workers = False
localhost = socket.gethostname()
store = create_c10d_store(
is_server=True,
server_addr=localhost,
server_port=0,
timeout=2,
world_size=world_size,
wait_for_workers=wait_for_workers,
use_libuv=False,
)
self.assertFalse(store.libuvBackend)
store = create_c10d_store(
is_server=True,
server_addr=localhost,
server_port=0,
timeout=2,
world_size=world_size,
wait_for_workers=wait_for_workers,
use_libuv=True,
)
self.assertTrue(store.libuvBackend)
def test_port_already_in_use_on_server(self):
# try to create the TCPStore server twice on the same port
# the second should fail due to a port conflict

View File

@ -6,8 +6,10 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
import datetime
import functools
import socket
from contextlib import closing
from typing import Optional
import torch.distributed as dist
from torch.distributed.elastic.utils.logging import get_logger
@ -30,6 +32,7 @@ def create_c10d_store(
timeout: float = (60 * 10), # 10 min
wait_for_workers: bool = True,
retries=3,
use_libuv: Optional[bool] = None,
):
if server_port == -1 and world_size > 1:
raise ValueError(
@ -56,7 +59,8 @@ def create_c10d_store(
)
try:
store = dist.TCPStore(
store_builder = functools.partial(
dist.TCPStore,
host_name=server_addr,
port=port,
world_size=world_size,
@ -64,6 +68,11 @@ def create_c10d_store(
timeout=datetime.timedelta(seconds=timeout),
wait_for_workers=wait_for_workers,
)
if use_libuv is None:
# TCPStore default backend may change, don't specify it unless we explicity told to do so.
store = store_builder()
else:
store = store_builder(use_libuv=use_libuv)
# skips full rank check when we don't have to wait for all workers
if wait_for_workers:
_check_full_rank(store, world_size)