mirror of
https://github.com/zebrajr/pytorch.git
synced 2025-12-07 00:21:07 +01:00
Fixes #112639 ```txt torch/utils/_sympy/value_ranges.py torch/utils/_sympy/value_ranges.py:60 in public class `ValueRanges`: D101: Missing docstring in public class torch/utils/_sympy/value_ranges.py:68 in public method `__init__`: D107: Missing docstring in __init__ torch/utils/_sympy/value_ranges.py:81 in public method `__contains__`: D105: Missing docstring in magic method torch/utils/_sympy/value_ranges.py:86 in public method `tighten`: D400: First line should end with a period (not 'n') torch/utils/_sympy/value_ranges.py:90 in public method `__and__`: D105: Missing docstring in magic method torch/utils/_sympy/value_ranges.py:103 in public method `__or__`: D105: Missing docstring in magic method torch/utils/_sympy/value_ranges.py:113 in public method `is_singleton`: D102: Missing docstring in public method torch/utils/_sympy/value_ranges.py:118 in public method `unknown`: D102: Missing docstring in public method torch/utils/_sympy/value_ranges.py:122 in public method `wrap`: D102: Missing docstring in public method torch/utils/_sympy/value_ranges.py:129 in public method `increasing_map`: D400: First line should end with a period (not ')') torch/utils/_sympy/value_ranges.py:135 in public method `decreasing_map`: D400: First line should end with a period (not ')') torch/utils/_sympy/value_ranges.py:141 in public method `monotone_map`: D400: First line should end with a period (not 'g') torch/utils/_sympy/value_ranges.py:149 in public method `convex_min_zero_map`: D400: First line should end with a period (not '0') torch/utils/_sympy/value_ranges.py:149 in public method `convex_min_zero_map`: D403: First word of the first line should be properly capitalized ('Fn', not 'fn') torch/utils/_sympy/value_ranges.py:158 in public method `coordinatewise_increasing_map`: D205: 1 blank line required between summary line and description (found 0) torch/utils/_sympy/value_ranges.py:158 in public method `coordinatewise_increasing_map`: D400: First line should end with a period 
(not ':') torch/utils/_sympy/value_ranges.py:171 in public method `coordinatewise_monotone_map`: D400: First line should end with a period (not 'e') torch/utils/_sympy/value_ranges.py:180 in private class `SymPyValueRangeAnalysis`: D205: 1 blank line required between summary line and description (found 0) torch/utils/_sympy/value_ranges.py:180 in private class `SymPyValueRangeAnalysis`: D400: First line should end with a period (not 's') torch/utils/_sympy/value_ranges.py:386 in private method `reciprocal`: D210: No whitespaces allowed surrounding docstring text torch/utils/_sympy/value_ranges.py:386 in private method `reciprocal`: D400: First line should end with a period (not 'n') torch/utils/_sympy/value_ranges.py:488 in public class `ValueRangeAnalysis`: D101: Missing docstring in public class torch/utils/_sympy/value_ranges.py:489 in public method `__init__`: D107: Missing docstring in __init__ torch/utils/_sympy/value_ranges.py:501 in public method `bool_handler`: D102: Missing docstring in public method torch/utils/_sympy/value_ranges.py:506 in public method `default_handler`: D102: Missing docstring in public method torch/utils/_sympy/value_ranges.py:511 in public method `load`: D102: Missing docstring in public method torch/utils/_sympy/value_ranges.py:514 in public method `store`: D102: Missing docstring in public method torch/utils/_sympy/value_ranges.py:517 in public method `reduction`: D102: Missing docstring in public method torch/utils/_sympy/value_ranges.py:520 in public method `index_expr`: D102: Missing docstring in public method torch/utils/_sympy/value_ranges.py:525 in public method `to_dtype`: D102: Missing docstring in public method torch/utils/_sympy/value_ranges.py:558 in public method `square`: D102: Missing docstring in public method torch/utils/_sympy/value_ranges.py:562 in public method `neg`: D102: Missing docstring in public method torch/utils/_sympy/value_ranges.py:566 in public method `truncdiv`: D102: Missing docstring in public 
method torch/utils/_sympy/value_ranges.py:577 in public method `sub`: D102: Missing docstring in public method torch/utils/_sympy/value_ranges.py:580 in public method `__getattr__`: D105: Missing docstring in magic method torch/utils/_sympy/value_ranges.py:585 in public function `bound_sympy`: D103: Missing docstring in public function 36 torch/utils/_sympy/value_ranges.py:60 in public class `ValueRanges`: D101: Missing docstring in public class torch/utils/_sympy/value_ranges.py:68 in public method `__init__`: D107: Missing docstring in __init__ torch/utils/_sympy/value_ranges.py:81 in public method `__contains__`: D105: Missing docstring in magic method torch/utils/_sympy/value_ranges.py:86 in public method `tighten`: D400: First line should end with a period (not 'n') torch/utils/_sympy/value_ranges.py:90 in public method `__and__`: D105: Missing docstring in magic method torch/utils/_sympy/value_ranges.py:103 in public method `__or__`: D105: Missing docstring in magic method torch/utils/_sympy/value_ranges.py:113 in public method `is_singleton`: D102: Missing docstring in public method torch/utils/_sympy/value_ranges.py:118 in public method `unknown`: D102: Missing docstring in public method torch/utils/_sympy/value_ranges.py:122 in public method `wrap`: D102: Missing docstring in public method torch/utils/_sympy/value_ranges.py:182 in private class `SymPyValueRangeAnalysis`: D205: 1 blank line required between summary line and description (found 0) torch/utils/_sympy/value_ranges.py:182 in private class `SymPyValueRangeAnalysis`: D400: First line should end with a period (not 's') torch/utils/_sympy/value_ranges.py:388 in private method `reciprocal`: D210: No whitespaces allowed surrounding docstring text torch/utils/_sympy/value_ranges.py:388 in private method `reciprocal`: D400: First line should end with a period (not 'n') torch/utils/_sympy/value_ranges.py:490 in public class `ValueRangeAnalysis`: D101: Missing docstring in public class 
torch/utils/_sympy/value_ranges.py:491 in public method `__init__`: D107: Missing docstring in __init__ torch/utils/_sympy/value_ranges.py:503 in public method `bool_handler`: D102: Missing docstring in public method torch/utils/_sympy/value_ranges.py:508 in public method `default_handler`: D102: Missing docstring in public method torch/utils/_sympy/value_ranges.py:513 in public method `load`: D102: Missing docstring in public method torch/utils/_sympy/value_ranges.py:516 in public method `store`: D102: Missing docstring in public method torch/utils/_sympy/value_ranges.py:519 in public method `reduction`: D102: Missing docstring in public method torch/utils/_sympy/value_ranges.py:522 in public method `index_expr`: D102: Missing docstring in public method torch/utils/_sympy/value_ranges.py:527 in public method `to_dtype`: D102: Missing docstring in public method torch/utils/_sympy/value_ranges.py:560 in public method `square`: D102: Missing docstring in public method torch/utils/_sympy/value_ranges.py:564 in public method `neg`: D102: Missing docstring in public method torch/utils/_sympy/value_ranges.py:568 in public method `truncdiv`: D102: Missing docstring in public method torch/utils/_sympy/value_ranges.py:579 in public method `sub`: D102: Missing docstring in public method torch/utils/_sympy/value_ranges.py:582 in public method `__getattr__`: D105: Missing docstring in magic method torch/utils/_sympy/value_ranges.py:587 in public function `bound_sympy`: D103: Missing docstring in public function 28 torch/utils/viz/_cycles.py torch/utils/viz/_cycles.py:14 in public function `observe_garbage`: D103: Missing docstring in public function torch/utils/viz/_cycles.py:207 in public function `object_annotation`: D205: 1 blank line required between summary line and description (found 0) torch/utils/viz/_cycles.py:207 in public function `object_annotation`: D400: First line should end with a period (not 'g') torch/utils/viz/_cycles.py:256 in public class `Node`: D101: 
Missing docstring in public class torch/utils/viz/_cycles.py:262 in public function `create_graph`: D103: Missing docstring in public function torch/utils/viz/_cycles.py:308 in public function `escape`: D103: Missing docstring in public function torch/utils/viz/_cycles.py:312 in public function `is_cuda_tensor`: D103: Missing docstring in public function torch/utils/viz/_cycles.py:315 in public function `cuda_allocation_context`: D103: Missing docstring in public function torch/utils/viz/_cycles.py:335 in public function `to_dot`: D103: Missing docstring in public function torch/utils/viz/_cycles.py:406 in public function `to_html`: D103: Missing docstring in public function torch/utils/viz/_cycles.py:416 in public function `observe_tensor_cycles`: D103: Missing docstring in public function torch/utils/viz/_cycles.py:429 in public function `warn_tensor_cycles`: D205: 1 blank line required between summary line and description (found 0) torch/utils/viz/_cycles.py:429 in public function `warn_tensor_cycles`: D400: First line should end with a period (not 'p') torch/utils/viz/_cycles.py:429 in public function `warn_tensor_cycles`: D401: First line should be in imperative mood; try rephrasing (found 'Reference') 14 torch/utils/viz/_cycles.py:14 in public function `observe_garbage`: D103: Missing docstring in public function torch/utils/viz/_cycles.py:256 in public class `Node`: D101: Missing docstring in public class torch/utils/viz/_cycles.py:262 in public function `create_graph`: D103: Missing docstring in public function torch/utils/viz/_cycles.py:308 in public function `escape`: D103: Missing docstring in public function torch/utils/viz/_cycles.py:312 in public function `is_cuda_tensor`: D103: Missing docstring in public function torch/utils/viz/_cycles.py:315 in public function `cuda_allocation_context`: D103: Missing docstring in public function torch/utils/viz/_cycles.py:335 in public function `to_dot`: D103: Missing docstring in public function 
torch/utils/viz/_cycles.py:406 in public function `to_html`: D103: Missing docstring in public function torch/utils/viz/_cycles.py:416 in public function `observe_tensor_cycles`: D103: Missing docstring in public function 9 torch/distributed/argparse_util.py torch/distributed/argparse_util.py:1 at module level: D100: Missing docstring in public module torch/distributed/argparse_util.py:13 in public class `env`: D205: 1 blank line required between summary line and description (found 0) torch/distributed/argparse_util.py:13 in public class `env`: D400: First line should end with a period (not 'g') torch/distributed/argparse_util.py:13 in public class `env`: D412: No blank lines allowed between a section header and its content ('Example') torch/distributed/argparse_util.py:43 in public method `__init__`: D107: Missing docstring in __init__ torch/distributed/argparse_util.py:56 in public method `__call__`: D102: Missing docstring in public method torch/distributed/argparse_util.py:61 in public class `check_env`: D205: 1 blank line required between summary line and description (found 0) torch/distributed/argparse_util.py:61 in public class `check_env`: D400: First line should end with a period (not 's') torch/distributed/argparse_util.py:61 in public class `check_env`: D412: No blank lines allowed between a section header and its content ('Example') torch/distributed/argparse_util.py:97 in public method `__init__`: D107: Missing docstring in __init__ torch/distributed/argparse_util.py:102 in public method `__call__`: D102: Missing docstring in public method 11 torch/distributed/argparse_util.py:1 at module level: D100: Missing docstring in public module torch/distributed/argparse_util.py:43 in public method `__init__`: D107: Missing docstring in __init__ torch/distributed/argparse_util.py:56 in public method `__call__`: D102: Missing docstring in public method torch/distributed/argparse_util.py:97 in public method `__init__`: D107: Missing docstring in __init__ 
torch/distributed/argparse_util.py:102 in public method `__call__`: D102: Missing docstring in public method 5 torch/distributed/_composable_state.py torch/distributed/_composable_state.py:20 in private function `_get_module_state`: D202: No blank lines allowed after function docstring (found 1) torch/distributed/_composable_state.py:20 in private function `_get_module_state`: D205: 1 blank line required between summary line and description (found 0) torch/distributed/_composable_state.py:20 in private function `_get_module_state`: D400: First line should end with a period (not '`') 3 0 torch/distributed/launch.py torch/distributed/launch.py:1 at module level: D205: 1 blank line required between summary line and description (found 0) torch/distributed/launch.py:1 at module level: D400: First line should end with a period (not 'd') torch/distributed/launch.py:156 in public function `parse_args`: D103: Missing docstring in public function torch/distributed/launch.py:171 in public function `launch`: D103: Missing docstring in public function torch/distributed/launch.py:180 in public function `main`: D103: Missing docstring in public function 5 torch/distributed/launch.py:157 in public function `parse_args`: D103: Missing docstring in public function torch/distributed/launch.py:172 in public function `launch`: D103: Missing docstring in public function torch/distributed/launch.py:181 in public function `main`: D103: Missing docstring in public function 3 torch/distributed/remote_device.py torch/distributed/remote_device.py:1 at module level: D100: Missing docstring in public module torch/distributed/remote_device.py:81 in private method `worker_name`: D205: 1 blank line required between summary line and description (found 0) torch/distributed/remote_device.py:81 in private method `worker_name`: D401: First line should be in imperative mood (perhaps 'Return', not 'Returns') torch/distributed/remote_device.py:88 in private method `rank`: D205: 1 blank line required 
between summary line and description (found 0) torch/distributed/remote_device.py:88 in private method `rank`: D401: First line should be in imperative mood (perhaps 'Return', not 'Returns') torch/distributed/remote_device.py:95 in private method `device`: D200: One-line docstring should fit on one line with quotes (found 3) torch/distributed/remote_device.py:95 in private method `device`: D401: First line should be in imperative mood (perhaps 'Return', not 'Returns') 7 torch/distributed/remote_device.py:1 at module level: D100: Missing docstring in public module torch/distributed/remote_device.py:85 in private method `rank`: D205: 1 blank line required between summary line and description (found 0) torch/distributed/remote_device.py:85 in private method `rank`: D401: First line should be in imperative mood (perhaps 'Return', not 'Returns') 3 torch/distributed/rendezvous.py torch/distributed/rendezvous.py:1 at module level: D100: Missing docstring in public module torch/distributed/rendezvous.py:23 in public function `register_rendezvous_handler`: D401: First line should be in imperative mood (perhaps 'Register', not 'Registers') torch/distributed/rendezvous.py:88 in public function `rendezvous`: D103: Missing docstring in public function torch/distributed/rendezvous.py:147 in private function `_create_c10d_store`: D205: 1 blank line required between summary line and description (found 0) torch/distributed/rendezvous.py:147 in private function `_create_c10d_store`: D400: First line should end with a period (not 'r') 5 torch/distributed/rendezvous.py:1 at module level: D100: Missing docstring in public module torch/distributed/rendezvous.py:89 in public function `rendezvous`: D103: Missing docstring in public function 2 torch/distributed/run.py torch/distributed/run.py:9 at module level: D205: 1 blank line required between summary line and description (found 0) torch/distributed/run.py:9 at module level: D400: First line should end with a period (not '`') 
torch/distributed/run.py:393 in public function `get_args_parser`: D202: No blank lines allowed after function docstring (found 1) torch/distributed/run.py:393 in public function `get_args_parser`: D401: First line should be in imperative mood; try rephrasing (found 'Helper') torch/distributed/run.py:610 in public function `parse_args`: D103: Missing docstring in public function torch/distributed/run.py:615 in public function `parse_min_max_nnodes`: D103: Missing docstring in public function torch/distributed/run.py:629 in public function `determine_local_world_size`: D103: Missing docstring in public function torch/distributed/run.py:670 in public function `get_rdzv_endpoint`: D103: Missing docstring in public function torch/distributed/run.py:677 in public function `get_use_env`: D205: 1 blank line required between summary line and description (found 0) torch/distributed/run.py:677 in public function `get_use_env`: D401: First line should be in imperative mood (perhaps 'Retrieve', not 'Retrieves') torch/distributed/run.py:689 in public function `config_from_args`: D103: Missing docstring in public function torch/distributed/run.py:770 in public function `run_script_path`: D205: 1 blank line required between summary line and description (found 0) torch/distributed/run.py:770 in public function `run_script_path`: D401: First line should be in imperative mood (perhaps 'Run', not 'Runs') torch/distributed/run.py:781 in public function `run`: D103: Missing docstring in public function torch/distributed/run.py:804 in public function `main`: D103: Missing docstring in public function 15 torch/distributed/run.py:611 in public function `parse_args`: D103: Missing docstring in public function torch/distributed/run.py:616 in public function `parse_min_max_nnodes`: D103: Missing docstring in public function torch/distributed/run.py:630 in public function `determine_local_world_size`: D103: Missing docstring in public function torch/distributed/run.py:671 in public function 
`get_rdzv_endpoint`: D103: Missing docstring in public function torch/distributed/run.py:691 in public function `config_from_args`: D103: Missing docstring in public function torch/distributed/run.py:784 in public function `run`: D103: Missing docstring in public function torch/distributed/run.py:807 in public function `main`: D103: Missing docstring in public function 7 torch/distributed/__init__.py torch/distributed/__init__.py:1 at module level: D104: Missing docstring in public package torch/distributed/__init__.py:8 in public function `is_available`: D205: 1 blank line required between summary line and description (found 0) torch/distributed/__init__.py:8 in public function `is_available`: D400: First line should end with a period (not ',') torch/distributed/__init__.py:8 in public function `is_available`: D401: First line should be in imperative mood (perhaps 'Return', not 'Returns') 4 torch/distributed/__init__.py:1 at module level: D104: Missing docstring in public package 1 torch/distributed/utils.py:1 at module level: D100: Missing docstring in public module torch/distributed/utils.py:16 in private function `_pack_kwargs`: D205: 1 blank line required between summary line and description (found 0) torch/distributed/utils.py:16 in private function `_pack_kwargs`: D400: First line should end with a period (not ')') torch/distributed/utils.py:47 in private function `_cast_forward_inputs`: D205: 1 blank line required between summary line and description (found 0) torch/distributed/utils.py:88 in private function `_recursive_to`: D200: One-line docstring should fit on one line with quotes (found 3) torch/distributed/utils.py:141 in private function `_p_assert`: D205: 1 blank line required between summary line and description (found 0) torch/distributed/utils.py:141 in private function `_p_assert`: D209: Multi-line docstring closing quotes should be on a separate line torch/distributed/utils.py:141 in private function `_p_assert`: D400: First line should end 
with a period (not 't') torch/distributed/utils.py:141 in private function `_p_assert`: D401: First line should be in imperative mood; try rephrasing (found 'This') torch/distributed/utils.py:275 in private function `_sync_module_states`: D205: 1 blank line required between summary line and description (found 0) torch/distributed/utils.py:275 in private function `_sync_module_states`: D400: First line should end with a period (not 'n') torch/distributed/utils.py:275 in private function `_sync_module_states`: D401: First line should be in imperative mood (perhaps 'Sync', not 'Syncs') torch/distributed/utils.py:300 in private function `_sync_params_and_buffers`: D205: 1 blank line required between summary line and description (found 0) torch/distributed/utils.py:300 in private function `_sync_params_and_buffers`: D400: First line should end with a period (not 'y') torch/distributed/utils.py:300 in private function `_sync_params_and_buffers`: D401: First line should be in imperative mood (perhaps 'Synchronize', not 'Synchronizes') 15 torch/distributed/utils.py:1 at module level: D100: Missing docstring in public module 1 ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/112953 Approved by: https://github.com/weifengpy
257 lines
9.0 KiB
Python
257 lines
9.0 KiB
Python
try:
|
|
from urllib.parse import urlparse, urlunparse
|
|
except ImportError as e:
|
|
raise ImportError(
|
|
"urllib cannot be found, urlparse from python2 is no longer supported."
|
|
) from e
|
|
|
|
import numbers
|
|
import os
|
|
import sys
|
|
from datetime import timedelta
|
|
from typing import Dict, Optional
|
|
|
|
from torch.distributed import FileStore, PrefixStore, Store, TCPStore
|
|
|
|
from .constants import default_pg_timeout
|
|
|
|
|
|
# Registry of rendezvous handlers, keyed by URL scheme (e.g. "tcp").
# Entries are added by `register_rendezvous_handler()` and looked up by the
# scheme of the URL being dispatched.
_rendezvous_handlers = {}
|
|
|
|
|
|
def register_rendezvous_handler(scheme, handler):
    """
    Register a rendezvous handler under a URL scheme.

    Rendezvous is the step in which the participating processes find each
    other and exchange the information they need to communicate; it has to
    happen before any collective algorithm can run. Its outcome is a
    triplet: a shared key/value store, the rank of the process, and the
    total number of participating processes.

    When none of the bundled rendezvous methods suits your execution
    environment, you can register your own handler. Pick a unique name and
    use it as the URL scheme when calling the `rendezvous()` function.

    Args:
        scheme (str): URL scheme that identifies your rendezvous handler.
        handler (function): Handler invoked when the `rendezvous()`
            function is called with a URL that uses the corresponding
            scheme. It must be a generator function that yields the
            triplet.

    Raises:
        RuntimeError: If a handler is already registered for ``scheme``.
    """
    # The registry dict is only mutated here, never rebound, so no
    # ``global`` declaration is required.
    already_registered = scheme in _rendezvous_handlers
    if already_registered:
        raise RuntimeError(f"Rendezvous handler for {scheme}:// already registered")
    _rendezvous_handlers[scheme] = handler
|
|
|
|
|
|
# Query will have format "rank=0&world_size=1" and is
|
|
# converted into {"rank": 0, "world_size": 1}
|
|
def _query_to_dict(query: str) -> Dict[str, str]:
|
|
return {pair[0]: pair[1] for pair in (pair.split("=") for pair in filter(None, query.split("&")))}
|
|
|
|
|
|
def _rendezvous_helper(url: str, rank: int, world_size_opt: Optional[int], **kwargs):
    """
    Fold ``rank`` and the world size into ``url`` and dispatch to its handler.

    Args:
        url (str): Rendezvous URL; its scheme selects the registered handler.
        rank (int): Rank of this process, or ``-1`` to leave it out of the URL.
        world_size_opt (Optional[int]): World size, or ``-1`` to leave it out
            of the URL. ``None`` marks a dynamic group: the URL always gets a
            ``world_size`` entry (``-1`` by default), and for ``env://`` URLs
            the ``RANK`` and ``WORLD_SIZE`` environment variables, when set,
            override ``rank`` and the world size.
        **kwargs: Forwarded unchanged to the handler.

    Returns:
        Whatever the handler registered for the URL scheme returns.

    Raises:
        AssertionError: If values have to be added to the URL but its query
            already contains ``rank`` or ``world_size``.
        RuntimeError: If no handler is registered for the URL scheme.
    """
    parsed = urlparse(url)
    dynamic_group = world_size_opt is None

    if world_size_opt is not None:
        world_size = world_size_opt
    else:
        world_size = -1
        if parsed.scheme == "env":
            rank = int(os.environ.get("RANK", rank))
            # If the world_size env variable is not present then it is a dynamic group
            world_size = int(os.environ.get("WORLD_SIZE", world_size))

    if dynamic_group or rank != -1 or world_size != -1:
        params = _query_to_dict(parsed.query)
        assert (
            "rank" not in params and "world_size" not in params
        ), f"The url: {url} has node-specific arguments(rank, world_size) already."
        if rank != -1:
            params["rank"] = str(rank)
        if dynamic_group or world_size != -1:
            params["world_size"] = str(world_size)
        # Rebuild the URL so the handler sees the node-specific arguments.
        new_query = "&".join(f"{key}={value}" for key, value in params.items())
        parsed = parsed._replace(query=new_query)
        url = urlunparse(parsed)

    scheme = parsed.scheme
    if scheme not in _rendezvous_handlers:
        raise RuntimeError(f"No rendezvous handler for {scheme}://")
    return _rendezvous_handlers[scheme](url, **kwargs)
|
|
|
|
|
|
def rendezvous(url: str, rank: int = -1, world_size: int = -1, **kwargs):
    """
    Validate the arguments and start the rendezvous described by ``url``.

    Args:
        url (str): Rendezvous URL; its scheme selects the handler to use.
        rank (int, optional): Rank of the calling process. Defaults to ``-1``.
        world_size (int, optional): Total number of participating processes.
            Defaults to ``-1``.
        **kwargs: Forwarded to the rendezvous handler.

    Returns:
        The result of the handler registered for the URL scheme.

    Raises:
        RuntimeError: If ``url`` is neither ``str`` nor ``bytes``, or if
            ``rank`` or ``world_size`` is not an integer.
    """
    if not isinstance(url, (str, bytes)):
        raise RuntimeError(f"`url` must be a string. {type(url)}: {url}")

    # Both numeric arguments obey the same rule; check them in signature order.
    for label, value in (("rank", rank), ("world_size", world_size)):
        if not isinstance(value, numbers.Integral):
            raise RuntimeError(f"`{label}` must be an integer. {value}")

    return _rendezvous_helper(url, rank, world_size, **kwargs)
|
|
|
|
|
|
def _create_store_from_options(backend_options, rank):
    """
    Create the store for the rendezvous URL in ``backend_options.init_method``.

    Args:
        backend_options: Object whose ``init_method`` attribute holds the
            rendezvous URL.
        rank (int): Rank of the calling process.

    Returns:
        The store, i.e. the first element of the first triplet produced by
        the rendezvous handler.
    """
    # Passing ``None`` as the world size requests a dynamic group.
    triplets = _rendezvous_helper(backend_options.init_method, rank, None)
    store, _rank, _world_size = next(triplets)
    return store
|
|
|
|
|
|
def _rendezvous_error(msg):
|
|
return ValueError("Error initializing torch.distributed using " + msg)
|
|
|
|
|
|
def _file_rendezvous_handler(url: str, **kwargs):
    """
    Handle ``file://`` rendezvous URLs by creating a ``FileStore``.

    Args:
        url (str): ``file://`` URL naming the store file. Its query must
            carry both ``rank`` and ``world_size``.
        **kwargs: Accepted for handler-interface compatibility; not used.

    Yields:
        tuple: The triplet ``(store, rank, world_size)``.

    Raises:
        ValueError: If the path, ``rank`` or ``world_size`` is missing.
        RuntimeError: If the generator is resumed after yielding, because
            this method cannot rendezvous a second time.
    """

    def _error(msg):
        return _rendezvous_error("file:// rendezvous: " + msg)

    parsed = urlparse(url)
    if sys.platform == "win32":
        import urllib.request

        # On Windows the path is rebuilt from the netloc and path components.
        path = urllib.request.url2pathname(parsed.netloc + parsed.path)
        if path:
            # Normalizing an empty string produces ".", which is not expected.
            path = os.path.normpath(path)
    else:
        path = parsed.path

    if not path:
        raise _error("path missing")

    params = _query_to_dict(parsed.query)
    if "rank" not in params:
        raise _error("rank parameter missing")
    if "world_size" not in params:
        raise _error("world size parameter missing")

    rank = int(params["rank"])
    world_size = int(params["world_size"])
    yield (FileStore(path, world_size), rank, world_size)

    # If this configuration is invalidated, there is nothing we can do about it
    raise RuntimeError("Unable to perform rerendezvous using file:// method")
|
|
|
|
|
|
def _torchelastic_use_agent_store() -> bool:
|
|
return os.environ.get("TORCHELASTIC_USE_AGENT_STORE", None) == str(True)
|
|
|
|
|
|
def _create_c10d_store(hostname, port, rank, world_size, timeout, use_libuv=False) -> Store:
|
|
"""
|
|
Smartly creates a c10d Store object on ``rank`` based on whether we need to re-use agent store.
|
|
|
|
The TCPStore server is assumed to be hosted
|
|
on ``hostname:port``.
|
|
|
|
If ``torchelastic_use_agent_store()`` is ``True``, then it is assumed that
|
|
the agent leader (node rank 0) hosts the TCPStore server (for which the
|
|
endpoint is specified by the given ``hostname:port``). Hence
|
|
ALL ranks will create and return a TCPStore client (e.g. ``start_daemon=False``).
|
|
|
|
If ``torchelastic_use_agent_store()`` is ``False``, then rank 0 will host
|
|
the TCPStore (with multi-tenancy) and it is assumed that rank 0's hostname
|
|
and port are correctly passed via ``hostname`` and ``port``. All
|
|
non-zero ranks will create and return a TCPStore client.
|
|
"""
|
|
# check if port is uint16_t
|
|
if not 0 <= port < 2**16:
|
|
raise ValueError(f"port must have value from 0 to 65535 but was {port}.")
|
|
|
|
if _torchelastic_use_agent_store():
|
|
attempt = os.environ["TORCHELASTIC_RESTART_COUNT"]
|
|
tcp_store = TCPStore(hostname, port, world_size, False, timeout)
|
|
return PrefixStore(f"/worker/attempt_{attempt}", tcp_store)
|
|
else:
|
|
start_daemon = rank == 0
|
|
return TCPStore(
|
|
hostname, port, world_size, start_daemon, timeout, multi_tenant=True, use_libuv=use_libuv
|
|
)
|
|
|
|
|
|
def _tcp_rendezvous_handler(
    url: str, timeout: timedelta = default_pg_timeout, **kwargs
):
    """
    Handle ``tcp://`` rendezvous URLs by creating a c10d store.

    Args:
        url (str): ``tcp://`` URL carrying the hostname and port. Its query
            must contain ``rank`` and ``world_size``; ``use_libuv=1`` may be
            added to enable libuv.
        timeout (timedelta, optional): Timeout passed on to the store.
        **kwargs: Accepted for handler-interface compatibility; not used.

    Yields:
        tuple: The triplet ``(store, rank, world_size)``.

    Raises:
        ValueError: If the port, ``rank`` or ``world_size`` is missing.
        RuntimeError: If the generator is resumed after yielding, because
            this method cannot rendezvous a second time.
    """

    def _error(msg):
        return _rendezvous_error("tcp:// rendezvous: " + msg)

    parsed = urlparse(url)
    if not parsed.port:
        raise _error("port number missing")

    params = _query_to_dict(parsed.query)
    if "rank" not in params:
        raise _error("rank parameter missing")
    if "world_size" not in params:
        raise _error("world size parameter missing")

    rank = int(params["rank"])
    world_size = int(params["world_size"])
    # libuv is opt-in: only the exact value "1" enables it.
    use_libuv = params.get("use_libuv", "0") == "1"
    assert parsed.hostname is not None

    store = _create_c10d_store(
        parsed.hostname, parsed.port, rank, world_size, timeout, use_libuv
    )
    yield (store, rank, world_size)

    # If this configuration is invalidated, there is nothing we can do about it
    raise RuntimeError("Unable to perform re-rendezvous using tcp:// method")
|
|
|
|
|
|
def _env_rendezvous_handler(
    url: str, timeout: timedelta = default_pg_timeout, **kwargs
):
    """
    Handle ``env://`` rendezvous URLs using environment variables.

    ``rank`` and ``world_size`` are taken from the URL query when present and
    from the ``RANK`` / ``WORLD_SIZE`` environment variables otherwise. The
    store endpoint always comes from ``MASTER_ADDR`` and ``MASTER_PORT``.
    libuv is enabled when ``use_libuv`` in the query, or else the
    ``USE_LIBUV`` environment variable, equals ``"1"``.

    Args:
        url (str): ``env://`` URL, optionally with a query.
        timeout (timedelta, optional): Timeout passed on to the store.
        **kwargs: Accepted for handler-interface compatibility; not used.

    Yields:
        tuple: The triplet ``(store, rank, world_size)``.

    Raises:
        ValueError: If a required environment variable is unset or empty.
        RuntimeError: If the generator is resumed after yielding, because
            this method cannot rendezvous a second time.
    """

    def _error(msg):
        return _rendezvous_error("env:// rendezvous: " + msg)

    def _require_env(name: str) -> str:
        # An empty value is treated the same as an unset variable.
        value = os.environ.get(name, None)
        if value:
            return value
        raise _error(f"environment variable {name} expected, but not set")

    params = _query_to_dict(urlparse(url).query)

    # Values supplied in the URL query take precedence over the environment.
    rank: int = int(params["rank"] if "rank" in params else _require_env("RANK"))
    world_size: int = int(
        params["world_size"] if "world_size" in params else _require_env("WORLD_SIZE")
    )

    master_addr: str = _require_env("MASTER_ADDR")
    master_port: int = int(_require_env("MASTER_PORT"))
    use_libuv = params.get("use_libuv", os.environ.get("USE_LIBUV", "0")) == "1"

    store = _create_c10d_store(
        master_addr, master_port, rank, world_size, timeout, use_libuv
    )
    yield (store, rank, world_size)

    # If this configuration is invalidated, there is nothing we can do about it
    raise RuntimeError("Unable to perform re-rendezvous using env:// method")
|
|
|
|
|
|
# Register the rendezvous handlers bundled with this module, in a fixed order.
for _scheme, _handler in (
    ("tcp", _tcp_rendezvous_handler),
    ("env", _env_rendezvous_handler),
    ("file", _file_rendezvous_handler),
):
    register_rendezvous_handler(_scheme, _handler)
# Do not leave the loop variables behind in the module namespace.
del _scheme, _handler
|