[BE] Prefer dash over underscore in command-line options (#94505)

Preferring dash over underscore in command-line options. Add `--command-arg-name` to the argument parser. The old arguments with underscores `--command_arg_name` are kept for backward compatibility.

Both dashes and underscores are used in the PyTorch codebase. Some argument parsers only have dashes or only have underscores in arguments. For example, the `torchrun` utility for distributed training only accepts underscore arguments (e.g., `--master_port`). Dashes are more common in other command-line tools, and they appear to be the default choice in the Python standard library:

`argparse.BooleanOptionalAction`: 4a9dff0e5a/Lib/argparse.py (L893-L895)

```python
class BooleanOptionalAction(Action):
    def __init__(...):
        if option_string.startswith('--'):
            option_string = '--no-' + option_string[2:]
            _option_strings.append(option_string)
```

It adds `--no-argname`, not `--no_argname`. Also, typing `_` requires pressing the shift or caps-lock key, whereas typing `-` does not.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/94505
Approved by: https://github.com/ezyang, https://github.com/seemethere
2025-12-06 12:20:52 +01:00 · 2023-02-09 20:16:46 +00:00 · 2023-02-09 20:16:46 +00:00 · a229b4526f
commit a229b4526f
parent a63524684d
91 changed files with 631 additions and 456 deletions
--- a/benchmarks/distributed/rpc/parameter_server/launcher.py
+++ b/benchmarks/distributed/rpc/parameter_server/launcher.py
@ -448,11 +448,13 @@ def main(args):
 if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="RPC server Benchmark")
    parser.add_argument(
        "--master-addr",
        "--master_addr",
        type=str,
        help="IP address of the machine that will host the process with rank 0"
    )
    parser.add_argument(
        "--master-port",
        "--master_port",
        type=str,
        help="A free port on the machine that will host the process with rank 0"
@ -493,6 +495,7 @@ if __name__ == "__main__":
        help="cudaserver count for benchmark run"
    )
    parser.add_argument(
        "--rpc-timeout",
        "--rpc_timeout",
        type=int,
        help="timeout in seconds to use for RPC"
@ -508,6 +511,7 @@ if __name__ == "__main__":
        help="epoch count for training"
    )
    parser.add_argument(
        "--batch-size",
        "--batch_size",
        type=int,
        help="number of training examples used in one iteration"
@ -523,62 +527,74 @@ if __name__ == "__main__":
        help="id for model configuration"
    )
    parser.add_argument(
        "--data-config-path",
        "--data_config_path",
        type=str,
        help="path to data configuration file"
    )
    parser.add_argument(
        "--model-config-path",
        "--model_config_path",
        type=str,
        help="path to model configuration file"
    )
    parser.add_argument(
        "--server-config-path",
        "--server_config_path",
        type=str,
        help="path to server configuration file"
    )
    parser.add_argument(
        "--trainer-config-path",
        "--trainer_config_path",
        type=str,
        help="path to trainer configuration file"
    )
    parser.add_argument(
        "--torch-seed",
        "--torch_seed",
        type=int,
        help="seed for generating random numbers to a non-deterministic random number"
    )
    parser.add_argument(
        "--cuda-seed",
        "--cuda_seed",
        type=int,
        help="seed for generating random numbers to a random number for the current GPU"
    )
    parser.add_argument(
        "--preprocess-data",
        "--preprocess_data",
        type=str,
        help="this function will be used to preprocess data before training"
    )
    parser.add_argument(
        "--create-criterion",
        "--create_criterion",
        type=str,
        help="this function will be used to create the criterion used for model loss calculation"
    )
    parser.add_argument(
        "--create-ddp-model",
        "--create_ddp_model",
        type=str,
        help="this function will be used to create the ddp model used during training"
    )
    parser.add_argument(
        "--hook-state",
        "--hook_state",
        type=str,
        help="this will be the state class used when registering the ddp communication hook"
    )
    parser.add_argument(
        "--ddp-hook",
        "--ddp_hook",
        type=str,
        default="allreduce_hook",
        help="ddp communication hook"
    )
    parser.add_argument(
        "--iteration-step",
        "--iteration_step",
        type=str,
        help="this will be the function called for each iteration of training"
--- a/benchmarks/distributed/rpc/rl/README.md
+++ b/benchmarks/distributed/rpc/rl/README.md
@ -20,7 +20,7 @@ This benchmark depends on PyTorch.
 For any environments you are interested in, pass the corresponding arguments to `python launcher.py`.
-```python launcher.py --world_size="10,20" --master_addr="127.0.0.1" --master_port="29501 --batch="True" --state_size="10-20-10" --nlayers="5" --out_features="10" --output_file_path="benchmark_report.json"```
+```python launcher.py --world-size="10,20" --master-addr="127.0.0.1" --master-port="29501" --batch="True" --state-size="10-20-10" --nlayers="5" --out-features="10" --output-file-path="benchmark_report.json"```
 Example Output:
--- a/benchmarks/distributed/rpc/rl/launcher.py
+++ b/benchmarks/distributed/rpc/rl/launcher.py
@ -29,15 +29,15 @@ def str2bool(v):
 parser = argparse.ArgumentParser(description='PyTorch RPC RL Benchmark')
-parser.add_argument('--world_size', type=str, default='10')
+parser.add_argument('--world-size', '--world_size', type=str, default='10')
-parser.add_argument('--master_addr', type=str, default='127.0.0.1')
+parser.add_argument('--master-addr', '--master_addr', type=str, default='127.0.0.1')
-parser.add_argument('--master_port', type=str, default='29501')
+parser.add_argument('--master-port', '--master_port', type=str, default='29501')
 parser.add_argument('--batch', type=str, default='True')
-parser.add_argument('--state_size', type=str, default='10-20-10')
+parser.add_argument('--state-size', '--state_size', type=str, default='10-20-10')
 parser.add_argument('--nlayers', type=str, default='5')
-parser.add_argument('--out_features', type=str, default='10')
+parser.add_argument('--out-features', '--out_features', type=str, default='10')
-parser.add_argument('--output_file_path', type=str, default='benchmark_report.json')
+parser.add_argument('--output-file-path', '--output_file_path', type=str, default='benchmark_report.json')
 args = parser.parse_args()
 args = vars(args)
--- a/benchmarks/dynamo/common.py
+++ b/benchmarks/dynamo/common.py
@ -1520,7 +1520,9 @@ def parse_args(args=None):
        default=False,
        help="use channels last format",
    )
-    parser.add_argument("--batch_size", type=int, help="batch size for benchmarking")
+    parser.add_argument(
        "--batch-size", "--batch_size", type=int, help="batch size for benchmarking"
    )
    parser.add_argument(
        "--iterations", type=int, default=2, help="how many iterations to run"
    )
@ -1651,7 +1653,11 @@ def parse_args(args=None):
        action="store_true",
        help="exports trace of kineto profiler",
    )
-    parser.add_argument("--profiler_trace_name", help="Overwrites exported trace name")
+    parser.add_argument(
        "--profiler-trace-name",
        "--profiler_trace_name",
        help="Overwrites exported trace name",
    )
    parser.add_argument(
        "--diff-branch",
@ -1670,6 +1676,7 @@ def parse_args(args=None):
    )
    parser.add_argument(
        "--cold-start-latency",
        "--cold_start_latency",
        action="store_true",
        help="Use a fresh triton cachedir when running each model, to force cold-start compile.",
@ -1787,6 +1794,7 @@ def parse_args(args=None):
        help="Dump convolution input/weight/bias's shape/stride/dtype and other options to json",
    )
    group.add_argument(
        "--recompile-profiler",
        "--recompile_profiler",
        action="store_true",
        help="Run the dynamo recompilation profiler on each model.",
--- a/benchmarks/dynamo/distributed.py
+++ b/benchmarks/dynamo/distributed.py
@ -121,24 +121,29 @@ if __name__ == "__main__":
        help="if set to a str, uses dynamo[str] backend. else, eager",
    )
    parser.add_argument("--verbose", action="store_true")
-    parser.add_argument("--batch_size", default=None)
+    parser.add_argument("--batch-size", "--batch_size", default=None)
    parser.add_argument(
        "--torchviz", action="store_true", help="Dump autograd graph with torchviz"
    )
    parser.add_argument("--profile", action="store_true", help="Run the profiler")
-    parser.add_argument("--trace_file", default="profile.json", help="Run the profiler")
+    parser.add_argument(
        "--trace-file", "--trace_file", default="profile.json", help="Run the profiler"
    )
    parser.add_argument("--repeat", default=10, help="Repeats for timing run")
    parser.add_argument(
        "--dynamo-no-optimize-ddp",
        "--dynamo_no_optimize_ddp",
        action="store_true",
        help="Disable dynamo's ddp optimizer (enabled by default)",
    )
    parser.add_argument(
        "--fsdp-checkpoint",
        "--fsdp_checkpoint",
        action="store_true",
        help="Use gradient checkpointing via model-specific policy",
    )
    parser.add_argument(
        "--fsdp-wrap",
        "--fsdp_wrap",
        action="store_true",
        help="Apply fsdp to submodules via model-specific policy",
@ -150,10 +155,12 @@ if __name__ == "__main__":
    model_arg = parser.add_mutually_exclusive_group(required=True)
    model_arg.add_argument(
-        "--torchbench_model", help="name of torchbench model, e.g. hf_Bert"
+        "--torchbench-model",
        "--torchbench_model",
        help="name of torchbench model, e.g. hf_Bert",
    )
    model_arg.add_argument(
-        "--toy_model", action="store_true", help="use toy model instead"
+        "--toy-model", "--toy_model", action="store_true", help="use toy model instead"
    )
    args = parser.parse_args()
--- a/benchmarks/dynamo/runner.py
+++ b/benchmarks/dynamo/runner.py
@ -13,10 +13,10 @@ This command will generate the commands for the default compilers (see DEFAULTS
 below) for inference, run them and visualize the logs.
 If you want to just print the commands, you could use the following command
-> python benchmarks/runner.py --print_run_commands --suites=torchbench --inference
+-> python benchmarks/runner.py --print-run-commands --suites=torchbench --inference
 Similarly, if you want to just visualize the already finished logs
-> python benchmarks/runner.py --visualize_logs --suites=torchbench --inference
+-> python benchmarks/runner.py --visualize-logs --suites=torchbench --inference
 If you want to test float16
 -> python benchmarks/runner.py --suites=torchbench --inference --dtypes=float16
@ -178,11 +178,13 @@ def parse_args():
    # Choose either generation of commands, pretty parsing or e2e runs
    group = parser.add_mutually_exclusive_group(required=False)
    group.add_argument(
        "--print-run-commands",
        "--print_run_commands",
        action="store_true",
        help="Generate commands and saves them to run.sh",
    )
    group.add_argument(
        "--visualize-logs",
        "--visualize_logs",
        action="store_true",
        help="Pretty print the log files and draw graphs",
@ -265,7 +267,11 @@ def parse_args():
        help="Github CLI path",
    )
    parser.add_argument(
-        "--batch_size", type=int, default=None, help="batch size for benchmarking"
+        "--batch-size",
        "--batch_size",
        type=int,
        default=None,
        help="batch size for benchmarking",
    )
    parser.add_argument(
        "--threads",
@ -276,12 +282,14 @@ def parse_args():
    )
    launcher_group = parser.add_argument_group("CPU Launcher Parameters")
    launcher_group.add_argument(
        "--enable-cpu-launcher",
        "--enable_cpu_launcher",
        action="store_true",
        default=False,
        help="Use torch.backends.xeon.run_cpu to get the peak performance on Intel(R) Xeon(R) Scalable Processors.",
    )
    launcher_group.add_argument(
        "--cpu-launcher-args",
        "--cpu_launcher_args",
        type=str,
        default="",
@ -370,10 +378,10 @@ def generate_commands(args, dtypes, suites, devices, compilers, output_dir):
                        "inductor",
                        "inductor_no_cudagraphs",
                    ):
-                        cmd = f"{cmd} --cold_start_latency"
+                        cmd = f"{cmd} --cold-start-latency"
                    if args.batch_size is not None:
-                        cmd = f"{cmd} --batch_size {args.batch_size}"
+                        cmd = f"{cmd} --batch-size {args.batch_size}"
                    if args.threads is not None:
                        cmd = f"{cmd} --threads {args.threads}"
--- a/benchmarks/dynamo/test.py
+++ b/benchmarks/dynamo/test.py
@ -36,7 +36,7 @@ class TestDynamoBenchmark(unittest.TestCase):
                    "--performance",
                    "--only=BERT_pytorch",
                    "-n1",
-                    "--batch_size=1",
+                    "--batch-size=1",
                ]
            )
            run(TorchBenchmarkRunner(), args, original_dir)
--- a/benchmarks/fastrnns/bench.py
+++ b/benchmarks/fastrnns/bench.py
@ -209,7 +209,7 @@ if __name__ == '__main__':
    parser.add_argument('--warmup', default='10', type=int)
    parser.add_argument('--nloops', default='100', type=int)
    parser.add_argument('--device', default='cuda', type=str)
-    parser.add_argument('--variable_lstms', action='store_true',
+    parser.add_argument('--variable-lstms', '--variable_lstms', action='store_true',
                        help='Also benchmark variable sequence length lstms '
                        'Note that some of these run really slowly '
                        'and that the `seqLength` flag will be ignored.')
@ -224,9 +224,9 @@ if __name__ == '__main__':
                        help='The fuser backend to use. One of: te, old, or none')
    parser.add_argument('--executor', default=None, type=str,
                        help='The executor to use. One of: legacy, simple, profiling')
-    parser.add_argument('--cuda_pointwise_loop_level', default=None, type=int)
+    parser.add_argument('--cuda-pointwise-loop-level', '--cuda_pointwise_loop_level', default=None, type=int)
-    parser.add_argument('--cuda_pointwise_block_count', default=None, type=int)
+    parser.add_argument('--cuda-pointwise-block-count', '--cuda_pointwise_block_count', default=None, type=int)
-    parser.add_argument('--cuda_pointwise_block_size', default=None, type=int)
+    parser.add_argument('--cuda-pointwise-block-size', '--cuda_pointwise_block_size', default=None, type=int)
    args = parser.parse_args()
    set_fuser(args.fuser, args.executor)
--- a/benchmarks/fastrnns/profile.py
+++ b/benchmarks/fastrnns/profile.py
@ -95,7 +95,7 @@ def full_profile(rnns, **args):
    for k, v in args.items():
        profile_args.append('--{}={}'.format(k, v))
    profile_args.append('--rnns {}'.format(' '.join(rnns)))
-    profile_args.append('--internal_run')
+    profile_args.append('--internal-run')
    outpath = nvprof_output_filename(rnns, **args)
@ -114,7 +114,7 @@ if __name__ == '__main__':
    parser.add_argument('--inputSize', default='512', type=int)
    parser.add_argument('--hiddenSize', default='512', type=int)
    parser.add_argument('--miniBatch', default='64', type=int)
-    parser.add_argument('--sleep_between_seconds', default='1', type=int)
+    parser.add_argument('--sleep-between-seconds', '--sleep_between_seconds', default='1', type=int)
    parser.add_argument('--nloops', default='5', type=int)
    parser.add_argument('--rnns', nargs='*',
@ -122,7 +122,7 @@ if __name__ == '__main__':
    # if internal_run, we actually run the rnns.
    # if not internal_run, we shell out to nvprof with internal_run=T
-    parser.add_argument('--internal_run', default=False, action='store_true',
+    parser.add_argument('--internal-run', '--internal_run', default=False, action='store_true',
                        help='Don\'t use this')
    args = parser.parse_args()
    if args.rnns is None:
--- a/benchmarks/fastrnns/test.py
+++ b/benchmarks/fastrnns/test.py
@ -128,8 +128,8 @@ if __name__ == '__main__':
    parser.add_argument('--hiddenSize', default='512', type=int)
    parser.add_argument('--miniBatch', default='64', type=int)
    parser.add_argument('--device', default='cuda', type=str)
-    parser.add_argument('--check_grad', default='True', type=bool)
+    parser.add_argument('--check-grad', '--check_grad', default='True', type=bool)
-    parser.add_argument('--variable_lstms', action='store_true')
+    parser.add_argument('--variable-lstms', '--variable_lstms', action='store_true')
    parser.add_argument('--seed', default='17', type=int)
    parser.add_argument('--verbose', action='store_true')
    parser.add_argument('--rnns', nargs='*',
--- a/benchmarks/framework_overhead_benchmark/framework_overhead_benchmark.py
+++ b/benchmarks/framework_overhead_benchmark/framework_overhead_benchmark.py
@ -15,12 +15,12 @@ Graph can be saved via save option. Saved in the directory where benchmark is ru
 Example build/run:
 To run PT benchmark:
 buck run @mode/opt <path-to-framework_overhead_benchmark>:framework_overhead_benchmark --
- --add_op --graph_mode --eager_mode (Runs both graph mode and eager mode)
+ --add-op --graph-mode --eager-mode (Runs both graph mode and eager mode)
 buck run @mode/opt <path-to-framework_overhead_benchmark>:framework_overhead_benchmark --
- --add_op --graph_mode (Runs only graph mode)
+ --add-op --graph-mode (Runs only graph mode)
 To run C2 benchmark:
 buck run @mode/opt <path-to-framework_overhead_benchmark>:framework_overhead_benchmark --
- --add_op --benchmark_c2_net
+ --add-op --benchmark-c2-net
 """
 SUPPORTED_OPS = {"add_op"}
@ -64,13 +64,25 @@ def benchmark_simple_fn(args, config, module_config, module_type, result):
 def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--op", default="add_op", dest="op", type=str)
-    parser.add_argument("--benchmark_c2_net", default=False, dest="benchmark_c2_net", action="store_true")
+    parser.add_argument(
-    parser.add_argument("--use_throughput_benchmark", default=False, dest="use_throughput_benchmark", action="store_true")
+        "--benchmark-c2-net",
        "--benchmark_c2_net",
        default=False,
        dest="benchmark_c2_net",
        action="store_true",
    )
    parser.add_argument(
        "--use-throughput-benchmark",
        "--use_throughput_benchmark",
        default=False,
        dest="use_throughput_benchmark",
        action="store_true",
    )
    parser.add_argument("--debug", default=False, dest="debug", action="store_true")
    parser.add_argument("--save", default=False, dest="save", action="store_true")
-    parser.add_argument("--eager_mode", default=False, dest="eager_mode", action="store_true")
+    parser.add_argument("--eager-mode", "--eager_mode", default=False, dest="eager_mode", action="store_true")
-    parser.add_argument("--num_warmup_iters", type=int, default=100)
+    parser.add_argument("--num-warmup-iters", "--num_warmup_iters", type=int, default=100)
-    parser.add_argument("--num_iters", type=int, default=1000)
+    parser.add_argument("--num-iters", "--num_iters", type=int, default=1000)
    args = parser.parse_args()
    if args.op not in SUPPORTED_OPS:
--- a/benchmarks/instruction_counts/execution/work.py
+++ b/benchmarks/instruction_counts/execution/work.py
@ -100,7 +100,7 @@ class _BenchmarkProcess:
        cmd.extend([
            _PYTHON, WORKER_PATH,
-            "--communication_file", self._communication_file,
+            "--communication-file", self._communication_file,
        ])
        return " ".join(cmd)
--- a/benchmarks/instruction_counts/worker/main.py
+++ b/benchmarks/instruction_counts/worker/main.py
@ -183,6 +183,6 @@ def main(communication_file: str) -> None:
 if __name__ == '__main__':
    parser = argparse.ArgumentParser()
-    parser.add_argument('--communication_file', type=str)
+    parser.add_argument('--communication-file', '--communication_file', type=str)
    communication_file = parser.parse_args().communication_file
    main(communication_file)
--- a/benchmarks/operator_benchmark/README.md
+++ b/benchmarks/operator_benchmark/README.md
@ -28,19 +28,19 @@ $ python setup.py install
 Run `torch.add` benchmark:
 ```
 $ cd pytorch/benchmarks/operator_benchmark
-$ python -m pt.add_test --omp_num_threads 1 --mkl_num_threads 1
+$ python -m pt.add_test --omp-num-threads 1 --mkl-num-threads 1
 ```
-Note: we set the number of OpenMP and MKL threads both to 1. If you want to benchmark operators with multithreading (intra-op parallelism), use the `--omp_num_threads` and `--mkl_num_threads` flags.
+Note: we set the number of OpenMP and MKL threads both to 1. If you want to benchmark operators with multithreading (intra-op parallelism), use the `--omp-num-threads` and `--mkl-num-threads` flags.
 List all the supported tests:
 ```
-$ python -m pt.add_test --list_tests
+$ python -m pt.add_test --list-tests
 ```
 Filter and run a test (use `add_M8_N16_K32` as an example):
 ```
-$ python -m pt.add_test --test_name add_K32_M8_N1
+$ python -m pt.add_test --test-name add_K32_M8_N1
--omp_num_threads 1 --mkl_num_threads 1
+--omp-num-threads 1 --mkl-num-threads 1
 ```
 Run all the supported benchmarks:
@ -121,28 +121,28 @@ $ python benchmark_runner.py --help
 Run all the supported benchmarks:
 ```
-$ python -m benchmark_all_test --omp_num_threads 1 --mkl_num_threads 1
+$ python -m benchmark_all_test --omp-num-threads 1 --mkl-num-threads 1
 ```
 List all the supported operators:
 ```
-$ python -m benchmark_all_test --list_ops
+$ python -m benchmark_all_test --list-ops
 ```
 List all the supported tests:
 ```
-$ python -m benchmark_all_test --list_tests
+$ python -m benchmark_all_test --list-tests
 ```
 Filter and run an operator (use add as an example):
 ```
-$ python -m benchmark_all_test --operators add --omp_num_threads 1 --mkl_num_threads 1
+$ python -m benchmark_all_test --operators add --omp-num-threads 1 --mkl-num-threads 1
 ```
 Note: this filter is based on the operator name rather than the file name.
 Run torch.add benchmark with tag 'long':
 ```
-$ python -m pt.add_test --tag_filter long
+$ python -m pt.add_test --tag-filter long
 ```
 ## Adding New Operators to the Benchmark Suite
--- a/benchmarks/operator_benchmark/benchmark_runner.py
+++ b/benchmarks/operator_benchmark/benchmark_runner.py
@ -17,6 +17,7 @@ parser = argparse.ArgumentParser(
 def parse_args():
    parser.add_argument(
        '--tag-filter',
        '--tag_filter',
        help='tag_filter can be used to run the shapes which matches the tag. (all is used to run all the shapes)',
        default='short')
@ -28,21 +29,25 @@ def parse_args():
        default=None)
    parser.add_argument(
        '--operator-range',
        '--operator_range',
        help='Filter tests based on operator_range(e.g. a-c or b,c-d)',
        default=None)
    parser.add_argument(
        '--test-name',
        '--test_name',
        help='Run tests that have the provided test_name',
        default=None)
    parser.add_argument(
        '--list-ops',
        '--list_ops',
        help='List operators without running them',
        action='store_true')
    parser.add_argument(
        '--list-tests',
        '--list_tests',
        help='List all test cases without running them',
        action='store_true')
@ -54,6 +59,7 @@ def parse_args():
    )
    parser.add_argument(
        "--num-runs",
        "--num_runs",
        help="Run each test for num_runs. Each run executes an operator for number of <--iterations>",
        type=int,
@ -61,6 +67,7 @@ def parse_args():
    )
    parser.add_argument(
        "--min-time-per-test",
        "--min_time_per_test",
        help="Set the minimum time (unit: seconds) to run each test",
        type=int,
@ -68,6 +75,7 @@ def parse_args():
    )
    parser.add_argument(
        "--warmup-iterations",
        "--warmup_iterations",
        help="Number of iterations to ignore before measuring performance",
        default=100,
@ -75,6 +83,7 @@ def parse_args():
    )
    parser.add_argument(
        "--omp-num-threads",
        "--omp_num_threads",
        help="Number of OpenMP threads used in PyTorch/Caffe2 runtime",
        default=None,
@ -82,6 +91,7 @@ def parse_args():
    )
    parser.add_argument(
        "--mkl-num-threads",
        "--mkl_num_threads",
        help="Number of MKL threads used in PyTorch/Caffe2 runtime",
        default=None,
@ -89,6 +99,7 @@ def parse_args():
    )
    parser.add_argument(
        "--report-aibench",
        "--report_aibench",
        type=benchmark_utils.str2bool,
        nargs='?',
@ -98,6 +109,7 @@ def parse_args():
    )
    parser.add_argument(
        "--use-jit",
        "--use_jit",
        type=benchmark_utils.str2bool,
        nargs='?',
@ -107,6 +119,7 @@ def parse_args():
    )
    parser.add_argument(
        "--forward-only",
        "--forward_only",
        type=benchmark_utils.str2bool,
        nargs='?',
--- a/benchmarks/profiler_benchmark/profiler_bench.py
+++ b/benchmarks/profiler_benchmark/profiler_bench.py
@ -30,15 +30,15 @@ if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Profiler benchmark')
-    parser.add_argument('--with_cuda', action='store_true')
+    parser.add_argument('--with-cuda', '--with_cuda', action='store_true')
-    parser.add_argument('--with_stack', action='store_true')
+    parser.add_argument('--with-stack', '--with_stack', action='store_true')
-    parser.add_argument('--use_script', action='store_true')
+    parser.add_argument('--use-script', '--use_script', action='store_true')
-    parser.add_argument('--use_kineto', action='store_true')
+    parser.add_argument('--use-kineto', '--use_kineto', action='store_true')
-    parser.add_argument('--profiling_tensor_size', default=1, type=int)
+    parser.add_argument('--profiling-tensor-size', '--profiling_tensor_size', default=1, type=int)
-    parser.add_argument('--workload', default='loop', type=str)
+    parser.add_argument('--workload', '--workload', default='loop', type=str)
-    parser.add_argument('--internal_iter', default=256, type=int)
+    parser.add_argument('--internal-iter', '--internal_iter', default=256, type=int)
-    parser.add_argument('--timer_min_run_time', default=10, type=int)
+    parser.add_argument('--timer-min-run-time', '--timer_min_run_time', default=10, type=int)
-    parser.add_argument('--cuda_only', action='store_true')
+    parser.add_argument('--cuda-only', '--cuda_only', action='store_true')
    args = parser.parse_args()
--- a/benchmarks/record_function_benchmark/record_function_bench.py
+++ b/benchmarks/record_function_benchmark/record_function_bench.py
@ -92,7 +92,7 @@ if __name__ == '__main__':
    parser.add_argument('--lstmMiniBatch', default='64', type=int)
    parser.add_argument('--warmup', default='2', type=int)
    parser.add_argument('--nloops', default='50', type=int)
-    parser.add_argument('--timer_min_run_time', default=120, type=int)
+    parser.add_argument('--timer-min-run-time', '--timer_min_run_time', default=120, type=int)
    args = parser.parse_args()
--- a/benchmarks/sparse/dlmc/README.md
+++ b/benchmarks/sparse/dlmc/README.md
@ -4,7 +4,7 @@ These sets of benchmarks are for the sparse matrix functionality using a popular
 Performance benchmarks scripts for matrix-matrix and matrix-vector ops (dense-sparse, sparse-sparse, and compare to dense-dense) are implemented here.
- `matmul_bench.py` with `--operation sparse@sparse|sparse@dense` is for Sparse matrix-matrix multiplication (SPMM) performance test. It can run in forward and backward mode with `--backward_test`, on CPU or CUDA with `--with_cuda`, using different datasets from the dataset collection DLMC. For more details see `test.sh` file.
+- `matmul_bench.py` with `--operation sparse@sparse|sparse@dense` is for Sparse matrix-matrix multiplication (SPMM) performance test. It can run in forward and backward mode with `--backward-test`, on CPU or CUDA with `--with-cuda`, using different datasets from the dataset collection DLMC. For more details see `test.sh` file.
 - `matmul_bench.py` with `--operation sparse@vector` is for Sparse matrix-vector multiplication (SPMV) performance test.
--- a/benchmarks/sparse/dlmc/matmul_bench.py
+++ b/benchmarks/sparse/dlmc/matmul_bench.py
@ -41,11 +41,11 @@ def parse_args():
    parser = argparse.ArgumentParser(description='matmul benchmark')
    parser.add_argument('--path', type=str, help='DLMC dataset path')
    parser.add_argument('--dataset', type=str, default='magnitude_pruning')
-    parser.add_argument('--hidden_size', default=2048, type=int)
+    parser.add_argument('--hidden-size', '--hidden_size', default=2048, type=int)
-    parser.add_argument('--backward_test', action="store_true")
+    parser.add_argument('--backward-test', '--backward_test', action="store_true")
    parser.add_argument('--operation', type=str, help="|".join(OPS_MAP.keys()), default=next(iter(OPS_MAP)))
-    parser.add_argument('--with_cuda', action='store_true')
+    parser.add_argument('--with-cuda', '--with_cuda', action='store_true')
-    parser.add_argument('--timer_min_run_time', default=1, type=float)
+    parser.add_argument('--timer-min-run-time', '--timer_min_run_time', default=1, type=float)
    return parser
--- a/benchmarks/sparse/dlmc/test.sh
+++ b/benchmarks/sparse/dlmc/test.sh
@ -9,19 +9,19 @@ echo "!! SPARSE SPMS TIME BENCHMARK!! "
 # cpu
 python -m dlmc.matmul_bench --path $DATASET_ROOT_DIR/dlmc/rn50 --dataset magnitude_pruning --operation sparse@sparse
-python -m dlmc.matmul_bench  --path $DATASET_ROOT_DIR/dlmc/rn50 --dataset magnitude_pruning --operation sparse@sparse --backward_test
+python -m dlmc.matmul_bench --path $DATASET_ROOT_DIR/dlmc/rn50 --dataset magnitude_pruning --operation sparse@sparse --backward-test
 python -m dlmc.matmul_bench --path $DATASET_ROOT_DIR/dlmc/rn50 --dataset magnitude_pruning --operation sparse@dense
-python -m dlmc.matmul_bench  --path $DATASET_ROOT_DIR/dlmc/rn50 --dataset magnitude_pruning --operation sparse@dense --backward_test
+python -m dlmc.matmul_bench --path $DATASET_ROOT_DIR/dlmc/rn50 --dataset magnitude_pruning --operation sparse@dense --backward-test
 python -m dlmc.matmul_bench --path $DATASET_ROOT_DIR/dlmc/rn50 --dataset magnitude_pruning --operation sparse@vector
 # cuda
-python -m dlmc.matmul_bench  --path $DATASET_ROOT_DIR/dlmc/rn50 --dataset magnitude_pruning --operation sparse@sparse --with_cuda
+python -m dlmc.matmul_bench --path $DATASET_ROOT_DIR/dlmc/rn50 --dataset magnitude_pruning --operation sparse@sparse --with-cuda
-python -m dlmc.matmul_bench  --path $DATASET_ROOT_DIR/dlmc/rn50 --dataset magnitude_pruning --operation sparse@sparse --with_cuda--backward_test
+python -m dlmc.matmul_bench --path $DATASET_ROOT_DIR/dlmc/rn50 --dataset magnitude_pruning --operation sparse@sparse --with-cuda --backward-test
-python -m dlmc.matmul_bench  --path $DATASET_ROOT_DIR/dlmc/rn50 --dataset magnitude_pruning --operation sparse@dense --with_cuda
+python -m dlmc.matmul_bench --path $DATASET_ROOT_DIR/dlmc/rn50 --dataset magnitude_pruning --operation sparse@dense --with-cuda
-python -m dlmc.matmul_bench  --path $DATASET_ROOT_DIR/dlmc/rn50 --dataset magnitude_pruning --operation sparse@dense --with_cuda --backward_test
+python -m dlmc.matmul_bench --path $DATASET_ROOT_DIR/dlmc/rn50 --dataset magnitude_pruning --operation sparse@dense --with-cuda --backward-test
-python -m dlmc.matmul_bench  --path $DATASET_ROOT_DIR/dlmc/rn50 --dataset magnitude_pruning --operation sparse@vector --with_cuda
+python -m dlmc.matmul_bench --path $DATASET_ROOT_DIR/dlmc/rn50 --dataset magnitude_pruning --operation sparse@vector --with-cuda
--- a/benchmarks/sparse/spmm.py
+++ b/benchmarks/sparse/spmm.py
@ -70,9 +70,9 @@ if __name__ == "__main__":
    parser.add_argument("--m", default='1000', type=int)
    parser.add_argument("--n", default='1000', type=int)
    parser.add_argument("--k", default='1000', type=int)
-    parser.add_argument("--nnz_ratio", default='0.1', type=float)
+    parser.add_argument("--nnz-ratio", "--nnz_ratio", default='0.1', type=float)
    parser.add_argument("--outfile", default='stdout', type=str)
-    parser.add_argument("--test_count", default='10', type=int)
+    parser.add_argument("--test-count", "--test_count", default='10', type=int)
    args = parser.parse_args()
--- a/benchmarks/sparse/spmv.py
+++ b/benchmarks/sparse/spmv.py
@ -68,9 +68,9 @@ if __name__ == "__main__":
    parser.add_argument("--format", default='csr', type=str)
    parser.add_argument("--m", default='1000', type=int)
-    parser.add_argument("--nnz_ratio", default='0.1', type=float)
+    parser.add_argument("--nnz-ratio", "--nnz_ratio", default='0.1', type=float)
    parser.add_argument("--outfile", default='stdout', type=str)
-    parser.add_argument("--test_count", default='10', type=int)
+    parser.add_argument("--test-count", "--test_count", default='10', type=int)
    args = parser.parse_args()
--- a/benchmarks/sparse/test_csr.sh
+++ b/benchmarks/sparse/test_csr.sh
@ -18,8 +18,8 @@ cd benchmarks
 echo "!! SPARSE SPMM TIME BENCHMARK!! " >> $OUTFILE
 for dim0 in 1000 5000 10000; do
    for nnzr in 0.01 0.05 0.1 0.3; do
-        python -m sparse.spmm --format csr --m $dim0 --n $dim0 --k $dim0 --nnz_ratio $nnzr --outfile $OUTFILE
+        python -m sparse.spmm --format csr --m $dim0 --n $dim0 --k $dim0 --nnz-ratio $nnzr --outfile $OUTFILE
-        # python -m sparse.spmm --format coo --m $dim0 --n $dim0 --k $dim0 --nnz_ratio $nnzr --outfile $OUTFILE
+        # python -m sparse.spmm --format coo --m $dim0 --n $dim0 --k $dim0 --nnz-ratio $nnzr --outfile $OUTFILE
    done
 done
 echo "----------------------" >> $OUTFILE
@ -34,8 +34,8 @@ python setup.py install
 cd benchmarks
 for dim0 in 1000 5000 10000; do
    for nnzr in 0.01 0.05 0.1 0.3; do
-        python -m sparse.spmv --format csr --m $dim0 --nnz_ratio $nnzr --outfile $OUTFILE
+        python -m sparse.spmv --format csr --m $dim0 --nnz-ratio $nnzr --outfile $OUTFILE
-        python -m sparse.spmv --format coo --m $dim0 --nnz_ratio $nnzr --outfile $OUTFILE
+        python -m sparse.spmv --format coo --m $dim0 --nnz-ratio $nnzr --outfile $OUTFILE
    done
 done
 echo "----------------------" >> $OUTFILE
--- a/benchmarks/tensorexpr/HowToRun.md
+++ b/benchmarks/tensorexpr/HowToRun.md
@ -6,5 +6,5 @@ to show documentation.
 An example of an actual command line that one might use as a starting point:
 ```
-python -m benchmarks.tensorexpr --device gpu --mode fwd --jit_mode trace --cuda_fuser=te
+python -m benchmarks.tensorexpr --device gpu --mode fwd --jit-mode trace --cuda-fuser=te
 ```
--- a/benchmarks/tensorexpr/main.py
+++ b/benchmarks/tensorexpr/main.py
@ -67,30 +67,35 @@ Works only with Python3.\n A few examples:
        help="the underlying tensor engine. only pt for now",
    )
    parser.add_argument(
        "--jit-mode",
        "--jit_mode",
        type=str,
        default="trace",
        help="the jit mode to use: one of {trace, none}",
    )
    parser.add_argument(
        "--cuda-pointwise-loop-levels",
        "--cuda_pointwise_loop_levels",
        type=int,
        default=None,
        help="num of loop levesl for Cuda pointwise operations: 2 or 3",
    )
    parser.add_argument(
        "--cuda-pointwise-block-count",
        "--cuda_pointwise_block_count",
        type=int,
        default=None,
        help="num of block for Cuda pointwise operations",
    )
    parser.add_argument(
        "--cuda-pointwise-block-size",
        "--cuda_pointwise_block_size",
        type=int,
        default=None,
        help="num of blocks for Cuda pointwise operations",
    )
    parser.add_argument(
        "--cuda-fuser",
        "--cuda_fuser",
        type=str,
        default="te",
@ -118,12 +123,14 @@ Works only with Python3.\n A few examples:
        help="Disable shape randomization in dynamic benchmarks.",
    )
    parser.add_argument(
        "--cpu-fusion",
        "--cpu_fusion",
        default=False,
        action='store_true',
        help="Enable CPU fusion.",
    )
    parser.add_argument(
        "--cat-wo-conditionals",
        "--cat_wo_conditionals",
        default=False,
        action='store_true',
--- a/benchmarks/tensorexpr/microbenchmarks.py
+++ b/benchmarks/tensorexpr/microbenchmarks.py
@ -247,7 +247,7 @@ def dump_plot(df, sizes):
 if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Runs NNC microbenchmarks')
-    parser.add_argument('--multi_threaded', action='store_true', help='Run with more than one thread')
+    parser.add_argument('--multi-threaded', '--multi_threaded', action='store_true', help='Run with more than one thread')
    args = parser.parse_args()
    if not args.multi_threaded:
        torch.set_num_threads(1)
--- a/benchmarks/transformer/better_transformer_vs_mha_functional.py
+++ b/benchmarks/transformer/better_transformer_vs_mha_functional.py
@ -185,8 +185,8 @@ def main(save_path: Optional[Path], error_path: Optional[Path]):
 if __name__ == "__main__":
    parser = argparse.ArgumentParser()
-    parser.add_argument("--save_path", type=str, help="Path to save the results")
+    parser.add_argument("--save-path", "--save_path", type=str, help="Path to save the results")
-    parser.add_argument("--error_save_path", type=str, help="Path to save the errors")
+    parser.add_argument("--error-save-path", "--error_save_path", type=str, help="Path to save the errors")
    args = parser.parse_args()
    save_path = Path(args.save_path) if args.save_path else None
--- a/benchmarks/transformer/sdp.py
+++ b/benchmarks/transformer/sdp.py
@ -339,7 +339,7 @@ def main(save_path: Optional[Path]):
 if __name__ == "__main__":
    parser = argparse.ArgumentParser()
-    parser.add_argument("--save_path", type=str, help="Path to save the results")
+    parser.add_argument("--save-path", "--save_path", type=str, help="Path to save the results")
    args = parser.parse_args()
    save_path = Path(args.save_path) if args.save_path else None
--- a/benchmarks/upload_scribe.py
+++ b/benchmarks/upload_scribe.py
@ -129,7 +129,7 @@ class PytorchBenchmarkUploader(ScribeUploader):
 if __name__ == "__main__":
    parser = argparse.ArgumentParser(description=__doc__)
-    parser.add_argument("--pytest_bench_json", type=argparse.FileType('r'),
+    parser.add_argument("--pytest-bench-json", "--pytest_bench_json", type=argparse.FileType('r'),
                        help='Upload json data formatted by pytest-benchmark module')
    args = parser.parse_args()
    if args.pytest_bench_json:
--- a/binaries/bench_gen/bench_gen.py
+++ b/binaries/bench_gen/bench_gen.py
@ -67,16 +67,16 @@ if __name__ == "__main__":
    parser.add_argument("--context", help="Context to run on.", default="CPU")
    parser.add_argument("--kwargs", help="kwargs to pass to operator.",
                        nargs="*", type=parse_kwarg, default=[])
-    parser.add_argument("--init_net", help="Output initialization net.",
+    parser.add_argument("--init-net", "--init_net", help="Output initialization net.",
                        default="init_net.pb")
-    parser.add_argument("--predict_net", help="Output prediction net.",
+    parser.add_argument("--predict-net", "--predict_net", help="Output prediction net.",
                        default="predict_net.pb")
-    parser.add_argument("--benchmark_name",
+    parser.add_argument("--benchmark-name", "--benchmark_name",
                        help="Name of the benchmark network",
                        default="benchmark")
-    parser.add_argument("--input_name", help="Name of the input blob.",
+    parser.add_argument("--input-name", "--input_name", help="Name of the input blob.",
                        default="data")
-    parser.add_argument("--output_name", help="Name of the output blob.",
+    parser.add_argument("--output-name", "--output_name", help="Name of the output blob.",
                        default="output")
    parser.add_argument("--instances",
                        help="Number of instances to run the operator.",
--- a/docs/source/elastic/quickstart.rst
+++ b/docs/source/elastic/quickstart.rst
@ -7,11 +7,11 @@ To launch a **fault-tolerant** job, run the following on all nodes.
    torchrun
       --nnodes=NUM_NODES
-       --nproc_per_node=TRAINERS_PER_NODE
+       --nproc-per-node=TRAINERS_PER_NODE
-       --max_restarts=NUM_ALLOWED_FAILURES
+       --max-restarts=NUM_ALLOWED_FAILURES
-       --rdzv_id=JOB_ID
+       --rdzv-id=JOB_ID
-       --rdzv_backend=c10d
+       --rdzv-backend=c10d
-       --rdzv_endpoint=HOST_NODE_ADDR
+       --rdzv-endpoint=HOST_NODE_ADDR
       YOUR_TRAINING_SCRIPT.py (--arg1 ... train script args...)
@ -22,18 +22,18 @@ and at most ``MAX_SIZE`` nodes.
    torchrun
        --nnodes=MIN_SIZE:MAX_SIZE
-        --nproc_per_node=TRAINERS_PER_NODE
+        --nproc-per-node=TRAINERS_PER_NODE
-        --max_restarts=NUM_ALLOWED_FAILURES_OR_MEMBERSHIP_CHANGES
+        --max-restarts=NUM_ALLOWED_FAILURES_OR_MEMBERSHIP_CHANGES
-        --rdzv_id=JOB_ID
+        --rdzv-id=JOB_ID
-        --rdzv_backend=c10d
+        --rdzv-backend=c10d
-        --rdzv_endpoint=HOST_NODE_ADDR
+        --rdzv-endpoint=HOST_NODE_ADDR
        YOUR_TRAINING_SCRIPT.py (--arg1 ... train script args...)
 .. note::
   TorchElastic models failures as membership changes. When a node fails,
   this is treated as a "scale down" event. When the failed node is replaced by
   the scheduler, it is a "scale up" event. Hence for both fault tolerant
-   and elastic jobs, ``--max_restarts`` is used to control the total number of
+   and elastic jobs, ``--max-restarts`` is used to control the total number of
   restarts before giving up, regardless of whether the restart was caused
   due to a failure or a scaling event.
@ -47,8 +47,8 @@ ideally you should pick a node that has a high bandwidth.
 .. note::
   The ``--standalone`` option can be passed to launch a single node job with a
-   sidecar rendezvous backend. You don’t have to pass ``--rdzv_id``,
+   sidecar rendezvous backend. You don’t have to pass ``--rdzv-id``,
-   ``--rdzv_endpoint``, and ``--rdzv_backend`` when the ``--standalone`` option
+   ``--rdzv-endpoint``, and ``--rdzv-backend`` when the ``--standalone`` option
   is used.
--- a/docs/source/elastic/train_script.rst
+++ b/docs/source/elastic/train_script.rst
@ -21,7 +21,7 @@ working with ``torchrun`` with these differences:
   (see `elastic launch <run.html>`_).
 4. ``use_env`` flag has been removed. If you were parsing local rank by parsing
-   the ``--local_rank`` option, you need to get the local rank from the
+   the ``--local-rank`` option, you need to get the local rank from the
   environment variable ``LOCAL_RANK`` (e.g. ``int(os.environ["LOCAL_RANK"])``).
 Below is an expository example of a training script that checkpoints on each
--- a/functorch/examples/dp_cifar10/cifar10_opacus.py
+++ b/functorch/examples/dp_cifar10/cifar10_opacus.py
@ -449,6 +449,7 @@ def parse_args():
    )
    parser.add_argument(
        "--clip-per-layer",
        "--clip_per_layer",
        action="store_true",
        default=False,
--- a/functorch/examples/dp_cifar10/cifar10_transforms.py
+++ b/functorch/examples/dp_cifar10/cifar10_transforms.py
@ -472,6 +472,7 @@ def parse_args():
    )
    parser.add_argument(
        "--clip-per-layer",
        "--clip_per_layer",
        action="store_true",
        default=False,
--- a/functorch/examples/maml_omniglot/maml-omniglot-higher.py
+++ b/functorch/examples/maml_omniglot/maml-omniglot-higher.py
@ -46,15 +46,15 @@ plt.style.use('bmh')
 def main():
    argparser = argparse.ArgumentParser()
-    argparser.add_argument('--n_way', type=int, help='n way', default=5)
+    argparser.add_argument('--n-way', '--n_way', type=int, help='n way', default=5)
    argparser.add_argument(
-        '--k_spt', type=int, help='k shot for support set', default=5)
+        '--k-spt', '--k_spt', type=int, help='k shot for support set', default=5)
    argparser.add_argument(
-        '--k_qry', type=int, help='k shot for query set', default=15)
+        '--k-qry', '--k_qry', type=int, help='k shot for query set', default=15)
    argparser.add_argument(
        '--device', type=str, help='device', default='cuda')
    argparser.add_argument(
-        '--task_num',
+        '--task-num', '--task_num',
        type=int,
        help='meta batch size, namely task num',
        default=32)
--- a/functorch/examples/maml_omniglot/maml-omniglot-ptonly.py
+++ b/functorch/examples/maml_omniglot/maml-omniglot-ptonly.py
@ -46,15 +46,15 @@ plt.style.use('bmh')
 def main():
    argparser = argparse.ArgumentParser()
-    argparser.add_argument('--n_way', type=int, help='n way', default=5)
+    argparser.add_argument('--n-way', '--n_way', type=int, help='n way', default=5)
    argparser.add_argument(
-        '--k_spt', type=int, help='k shot for support set', default=5)
+        '--k-spt', '--k_spt', type=int, help='k shot for support set', default=5)
    argparser.add_argument(
-        '--k_qry', type=int, help='k shot for query set', default=15)
+        '--k-qry', '--k_qry', type=int, help='k shot for query set', default=15)
    argparser.add_argument(
        '--device', type=str, help='device', default='cuda')
    argparser.add_argument(
-        '--task_num',
+        '--task-num', '--task_num',
        type=int,
        help='meta batch size, namely task num',
        default=32)
--- a/functorch/examples/maml_omniglot/maml-omniglot-transforms.py
+++ b/functorch/examples/maml_omniglot/maml-omniglot-transforms.py
@ -47,15 +47,15 @@ plt.style.use('bmh')
 def main():
    argparser = argparse.ArgumentParser()
-    argparser.add_argument('--n_way', type=int, help='n way', default=5)
+    argparser.add_argument('--n-way', '--n_way', type=int, help='n way', default=5)
    argparser.add_argument(
-        '--k_spt', type=int, help='k shot for support set', default=5)
+        '--k-spt', '--k_spt', type=int, help='k shot for support set', default=5)
    argparser.add_argument(
-        '--k_qry', type=int, help='k shot for query set', default=15)
+        '--k-qry', '--k_qry', type=int, help='k shot for query set', default=15)
    argparser.add_argument(
        '--device', type=str, help='device', default='cuda')
    argparser.add_argument(
-        '--task_num',
+        '--task-num', '--task_num',
        type=int,
        help='meta batch size, namely task num',
        default=32)
--- a/scripts/release_notes/commitlist.py
+++ b/scripts/release_notes/commitlist.py
@ -17,11 +17,11 @@ Example Usages
 Create a new commitlist for consumption by categorize.py.
 Said commitlist contains commits between v1.5.0 and f5bc91f851.
-    python commitlist.py --create_new tags/v1.5.0 f5bc91f851
+    python commitlist.py --create-new tags/v1.5.0 f5bc91f851
 Update the existing commitlist to commit bfcb687b9c.
-    python commitlist.py --update_to bfcb687b9c
+    python commitlist.py --update-to bfcb687b9c
 """
@dataclasses.dataclass(frozen=True)
@ -342,16 +342,16 @@ def main():
    parser = argparse.ArgumentParser(description='Tool to create a commit list')
    group = parser.add_mutually_exclusive_group(required=True)
-    group.add_argument('--create_new', nargs=2)
+    group.add_argument('--create-new', '--create_new', nargs=2)
-    group.add_argument('--update_to')
+    group.add_argument('--update-to', '--update_to')
    # I found this flag useful when experimenting with adding new auto-categorizing filters.
    # After running commitlist.py the first time, if you add any new filters in this file,
    # re-running with "rerun_with_new_filters" will update the existing commitlist.csv file,
    # but only affect the rows that were previously marked as "Uncategorized"
-    group.add_argument('--rerun_with_new_filters', action='store_true')
+    group.add_argument('--rerun-with-new-filters', '--rerun_with_new_filters', action='store_true')
    group.add_argument('--stat', action='store_true')
-    group.add_argument('--export_markdown', action='store_true')
+    group.add_argument('--export-markdown', '--export_markdown', action='store_true')
-    group.add_argument('--export_csv_categories', action='store_true')
+    group.add_argument('--export-csv-categories', '--export_csv_categories', action='store_true')
    parser.add_argument('--path', default='results/commitlist.csv')
    args = parser.parse_args()
--- a/test/backends/xeon/test_launch.py
+++ b/test/backends/xeon/test_launch.py
@ -52,8 +52,8 @@ class TestTorchrun(TestCase):
    def test_multi_threads(self):
        num = 0
-        with subprocess.Popen(f"python -m torch.backends.xeon.run_cpu --ninstances 4 --use_default_allocator \
+        with subprocess.Popen(f"python -m torch.backends.xeon.run_cpu --ninstances 4 --use-default-allocator \
-            --disable_iomp --disable_numactl --log_path {self._test_dir} --no_python pwd",
+            --disable-iomp --disable-numactl --log-path {self._test_dir} --no-python pwd",
                              shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) as p:
            for line in p.stdout.readlines():
                segs = str(line, "utf-8").strip().split("-")
--- a/test/distributed/launcher/api_test.py
+++ b/test/distributed/launcher/api_test.py
@ -92,7 +92,7 @@ def elastic_launch_wrapper(
            rdzv_endpoint, min_nodes, max_nodes, nproc_per_node, run_id
        ),
        sys.executable,
-    )("-u", path("bin/test_script.py"), f"--touch_file_dir={test_dir}")
+    )("-u", path("bin/test_script.py"), f"--touch-file-dir={test_dir}")
 def _dist_sum(wait=0):
@ -163,7 +163,7 @@ class ElasticLaunchTest(unittest.TestCase):
        elastic_launch(
            get_test_launch_config(self._etcd_endpoint, nnodes, nnodes, nproc_per_node),
            sys.executable,
-        )("-u", path("bin/test_script.py"), f"--touch_file_dir={self.test_dir}")
+        )("-u", path("bin/test_script.py"), f"--touch-file-dir={self.test_dir}")
        # make sure all the workers ran.
        # each worker touches a file with its global rank as the name.
@ -178,7 +178,7 @@ class ElasticLaunchTest(unittest.TestCase):
        elastic_launch(
            get_test_launch_config(self._etcd_endpoint, nnodes, nnodes, nproc_per_node),
            sys.executable,
-        )("-u", path("bin/test_script.py"), f"--touch_file_dir={self.test_dir}")
+        )("-u", path("bin/test_script.py"), f"--touch-file-dir={self.test_dir}")
        # make sure all the workers ran.
        # each worker touches a file with its global rank as the name.
@ -248,7 +248,7 @@ class ElasticLaunchTest(unittest.TestCase):
        elastic_launch(
            get_test_launch_config(self._etcd_endpoint, 1, 2, nproc_per_node),
            sys.executable,
-        )("-u", path("bin/test_script.py"), f"--touch_file_dir={self.test_dir}")
+        )("-u", path("bin/test_script.py"), f"--touch-file-dir={self.test_dir}")
        world_size = nproc_per_node
        self.check_works_ran(world_size)
@ -283,7 +283,7 @@ class ElasticLaunchTest(unittest.TestCase):
            elastic_launch(
                get_test_launch_config(self._etcd_endpoint, 1, 2, 4),
                sys.executable,
-            )("-u", path("bin/test_script.py"), f"--touch_file_dir={self.test_dir}")
+            )("-u", path("bin/test_script.py"), f"--touch-file-dir={self.test_dir}")
        record_mock.assert_called_once()
    @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan")
@ -345,7 +345,7 @@ class ElasticLaunchTest(unittest.TestCase):
            elastic_launch(
                get_test_launch_config(self._etcd_endpoint, 1, 1, 4),
                sys.executable,
-            )("-u", path("bin/test_script.py"), f"--touch_file_dir={self.test_dir}")
+            )("-u", path("bin/test_script.py"), f"--touch-file-dir={self.test_dir}")
            rdzv_handler_mock.shutdown.assert_called_once()
--- a/test/distributed/launcher/bin/test_script.py
+++ b/test/distributed/launcher/bin/test_script.py
@ -24,6 +24,7 @@ def parse_args():
    # file is used for assertions
    parser.add_argument(
        "--touch-file-dir",
        "--touch_file_dir",
        type=str,
        help="dir to touch a file with global rank as the filename",
--- a/test/distributed/launcher/bin/test_script_init_method.py
+++ b/test/distributed/launcher/bin/test_script_init_method.py
@ -19,12 +19,14 @@ def parse_args():
    parser = argparse.ArgumentParser(description="test script")
    parser.add_argument(
        "--init-method",
        "--init_method",
        type=str,
        required=True,
        help="init_method to pass to `dist.init_process_group()` (e.g. env://)",
    )
    parser.add_argument(
        "--world-size",
        "--world_size",
        type=int,
        default=os.getenv("WORLD_SIZE", -1),
--- a/test/distributed/launcher/bin/test_script_is_torchelastic_launched.py
+++ b/test/distributed/launcher/bin/test_script_is_torchelastic_launched.py
@ -27,6 +27,7 @@ import torch.distributed as dist
 def parse_args():
    parser = argparse.ArgumentParser(description="test script")
    parser.add_argument(
        "--out-file",
        "--out_file",
        help="file to write indicating whether this script was launched with torchelastic",
    )
--- a/test/distributed/launcher/bin/test_script_local_rank.py
+++ b/test/distributed/launcher/bin/test_script_local_rank.py
@ -15,6 +15,7 @@ def parse_args():
    parser = argparse.ArgumentParser(description="test script")
    parser.add_argument(
        "--local-rank",
        "--local_rank",
        type=int,
        required=True,
@ -31,7 +32,7 @@ def main():
    actual_rank = args.local_rank
    if expected_rank != actual_rank:
        raise RuntimeError(
-            "Parameters passed: --local_rank that has different value "
+            "Parameters passed: --local-rank that has different value "
            f"from env var: expected: {expected_rank}, got: {actual_rank}"
        )
    print("End execution")
--- a/test/distributed/launcher/launch_test.py
+++ b/test/distributed/launcher/launch_test.py
@ -47,12 +47,12 @@ class LaunchTest(unittest.TestCase):
            master_port = sock.getsockname()[1]
        args = [
            f"--nnodes={nnodes}",
-            f"--nproc_per_node={nproc_per_node}",
+            f"--nproc-per-node={nproc_per_node}",
-            "--monitor_interval=1",
+            "--monitor-interval=1",
-            "--start_method=spawn",
+            "--start-method=spawn",
-            "--master_addr=localhost",
+            "--master-addr=localhost",
-            f"--master_port={master_port}",
+            f"--master-port={master_port}",
-            "--node_rank=0",
+            "--node-rank=0",
            path("bin/test_script_local_rank.py"),
        ]
        launch.main(args)
@ -69,15 +69,15 @@ class LaunchTest(unittest.TestCase):
            master_port = sock.getsockname()[1]
        args = [
            f"--nnodes={nnodes}",
-            f"--nproc_per_node={nproc_per_node}",
+            f"--nproc-per-node={nproc_per_node}",
-            "--monitor_interval=1",
+            "--monitor-interval=1",
-            "--start_method=spawn",
+            "--start-method=spawn",
-            "--master_addr=localhost",
+            "--master-addr=localhost",
-            f"--master_port={master_port}",
+            f"--master-port={master_port}",
-            "--node_rank=0",
+            "--node-rank=0",
-            "--use_env",
+            "--use-env",
            path("bin/test_script.py"),
-            f"--touch_file_dir={self.test_dir}",
+            f"--touch-file-dir={self.test_dir}",
        ]
        launch.main(args)
        # make sure all the workers ran
--- a/test/distributed/launcher/run_test.py
+++ b/test/distributed/launcher/run_test.py
@ -101,14 +101,14 @@ class ElasticLaunchTest(unittest.TestCase):
        world_size = nnodes * nproc_per_node
        args = [
            f"--nnodes={nnodes}",
-            f"--nproc_per_node={nproc_per_node}",
+            f"--nproc-per-node={nproc_per_node}",
-            "--rdzv_backend=etcd",
+            "--rdzv-backend=etcd",
-            f"--rdzv_endpoint={self._etcd_endpoint}",
+            f"--rdzv-endpoint={self._etcd_endpoint}",
-            f"--rdzv_id={run_id}",
+            f"--rdzv-id={run_id}",
-            "--monitor_interval=1",
+            "--monitor-interval=1",
-            "--start_method=spawn",
+            "--start-method=spawn",
            path("bin/test_script.py"),
-            f"--touch_file_dir={self.test_dir}",
+            f"--touch-file-dir={self.test_dir}",
        ]
        launch.main(args)
@ -127,14 +127,14 @@ class ElasticLaunchTest(unittest.TestCase):
            master_port = sock.getsockname()[1]
        args = [
            f"--nnodes={nnodes}",
-            f"--nproc_per_node={nproc_per_node}",
+            f"--nproc-per-node={nproc_per_node}",
-            "--monitor_interval=1",
+            "--monitor-interval=1",
-            "--start_method=spawn",
+            "--start-method=spawn",
-            "--master_addr=localhost",
+            "--master-addr=localhost",
-            f"--master_port={master_port}",
+            f"--master-port={master_port}",
-            "--node_rank=0",
+            "--node-rank=0",
            path("bin/test_script.py"),
-            f"--touch_file_dir={self.test_dir}",
+            f"--touch-file-dir={self.test_dir}",
        ]
        launch.main(args)
@ -152,19 +152,19 @@ class ElasticLaunchTest(unittest.TestCase):
        world_size = nnodes * nproc_per_node
        args = [
            f"--nnodes={nnodes}",
-            f"--nproc_per_node={nproc_per_node}",
+            f"--nproc-per-node={nproc_per_node}",
-            "--rdzv_backend=etcd",
+            "--rdzv-backend=etcd",
-            f"--rdzv_endpoint={self._etcd_endpoint}",
+            f"--rdzv-endpoint={self._etcd_endpoint}",
-            f"--rdzv_id={run_id}",
+            f"--rdzv-id={run_id}",
-            "--monitor_interval=1",
+            "--monitor-interval=1",
-            "--start_method=spawn",
+            "--start-method=spawn",
-            "--no_python",
+            "--no-python",
        ]
        script_args = [path("bin/test_script.sh"), f"{self.test_dir}"]
        with self.assertRaises(ValueError):
-            # --no_python cannot be used with --module
+            # --no-python cannot be used with --module
            launch.main(args + ["--module"] + script_args)
        launch.main(args + script_args)
@ -182,18 +182,18 @@ class ElasticLaunchTest(unittest.TestCase):
        world_size = 1
        args = [
            f"--nnodes={nnodes}",
-            "--rdzv_backend=etcd",
+            "--rdzv-backend=etcd",
-            f"--rdzv_endpoint={self._etcd_endpoint}",
+            f"--rdzv-endpoint={self._etcd_endpoint}",
-            f"--rdzv_id={run_id}",
+            f"--rdzv-id={run_id}",
-            "--monitor_interval=1",
+            "--monitor-interval=1",
-            "--start_method=spawn",
+            "--start-method=spawn",
-            "--no_python",
+            "--no-python",
        ]
        script_args = [path("bin/test_script.sh"), f"{self.test_dir}"]
        with self.assertRaises(ValueError):
-            # --no_python cannot be used with --module
+            # --no-python cannot be used with --module
            launch.main(args + ["--module"] + script_args)
        launch.main(args + script_args)
@ -223,7 +223,7 @@ class ElasticLaunchTest(unittest.TestCase):
        script_args = [path("bin/test_script.sh"), f"{self.test_dir}"]
        with self.assertRaises(ValueError):
-            # --no_python cannot be used with --module
+            # --no-python cannot be used with --module
            os.environ["PET_MODULE"] = "1"
            launch.main(script_args)
@ -242,13 +242,13 @@ class ElasticLaunchTest(unittest.TestCase):
        args = [
            f"--nnodes={nnodes}",
-            f"--nproc_per_node={nproc_type}",
+            f"--nproc-per-node={nproc_type}",
-            "--rdzv_backend=etcd",
+            "--rdzv-backend=etcd",
-            f"--rdzv_endpoint={self._etcd_endpoint}",
+            f"--rdzv-endpoint={self._etcd_endpoint}",
-            f"--rdzv_id={run_id}",
+            f"--rdzv-id={run_id}",
-            "--monitor_interval=1",
+            "--monitor-interval=1",
-            "--start_method=spawn",
+            "--start-method=spawn",
-            "--no_python",
+            "--no-python",
        ]
        script_args = [path("bin/test_script.sh"), f"{self.test_dir}"]
@ -292,14 +292,14 @@ class ElasticLaunchTest(unittest.TestCase):
        world_size = nproc_per_node
        args = [
            f"--nnodes={min_nodes}:{max_nodes}",
-            f"--nproc_per_node={nproc_per_node}",
+            f"--nproc-per-node={nproc_per_node}",
-            "--rdzv_backend=etcd",
+            "--rdzv-backend=etcd",
-            f"--rdzv_endpoint={self._etcd_endpoint}",
+            f"--rdzv-endpoint={self._etcd_endpoint}",
-            f"--rdzv_id={run_id}",
+            f"--rdzv-id={run_id}",
-            "--monitor_interval=1",
+            "--monitor-interval=1",
-            "--start_method=spawn",
+            "--start-method=spawn",
            path("bin/test_script.py"),
-            f"--touch_file_dir={self.test_dir}",
+            f"--touch-file-dir={self.test_dir}",
        ]
        launch.main(args)
@ -323,13 +323,13 @@ class ElasticLaunchTest(unittest.TestCase):
        nproc_per_node = 4
        args = [
            f"--nnodes={min_nodes}:{max_nodes}",
-            f"--nproc_per_node={nproc_per_node}",
+            f"--nproc-per-node={nproc_per_node}",
-            "--rdzv_backend=etcd",
+            "--rdzv-backend=etcd",
-            f"--rdzv_endpoint={self._etcd_endpoint}",
+            f"--rdzv-endpoint={self._etcd_endpoint}",
-            f"--rdzv_id={run_id}",
+            f"--rdzv-id={run_id}",
-            "--monitor_interval=1",
+            "--monitor-interval=1",
-            "--max_restarts=0",
+            "--max-restarts=0",
-            "--start_method=spawn",
+            "--start-method=spawn",
            path("bin/test_script.py"),
            "--fail",
        ]
@ -354,15 +354,15 @@ class ElasticLaunchTest(unittest.TestCase):
        nproc_per_node = 4
        args = [
            f"--nnodes={min_nodes}:{max_nodes}",
-            f"--nproc_per_node={nproc_per_node}",
+            f"--nproc-per-node={nproc_per_node}",
-            "--rdzv_backend=etcd",
+            "--rdzv-backend=etcd",
-            f"--rdzv_endpoint={self._etcd_endpoint}",
+            f"--rdzv-endpoint={self._etcd_endpoint}",
-            f"--rdzv_id={run_id}",
+            f"--rdzv-id={run_id}",
-            "--monitor_interval=1",
+            "--monitor-interval=1",
-            "--max_restarts=0",
+            "--max-restarts=0",
-            "--start_method=spawn",
+            "--start-method=spawn",
            path("bin/test_script.py"),
-            f"--touch_file_dir={self.test_dir}",
+            f"--touch-file-dir={self.test_dir}",
        ]
        mock_agent_run.side_effect = MockException
@ -377,12 +377,12 @@ class ElasticLaunchTest(unittest.TestCase):
        world_size = nnodes * nproc_per_node
        args = [
            f"--nnodes={nnodes}",
-            f"--nproc_per_node={nproc_per_node}",
+            f"--nproc-per-node={nproc_per_node}",
            "--standalone",
-            "--monitor_interval=1",
+            "--monitor-interval=1",
-            "--start_method=spawn",
+            "--start-method=spawn",
            path("bin/test_script.py"),
-            f"--touch_file_dir={self.test_dir}",
+            f"--touch-file-dir={self.test_dir}",
        ]
        launch.main(args)
@ -398,13 +398,13 @@ class ElasticLaunchTest(unittest.TestCase):
        nproc_per_node = 4
        world_size = nnodes * nproc_per_node
        args = [
-            "--run_path",
+            "--run-path",
            f"--nnodes={nnodes}",
-            f"--nproc_per_node={nproc_per_node}",
+            f"--nproc-per-node={nproc_per_node}",
-            "--monitor_interval=1",
+            "--monitor-interval=1",
-            "--start_method=spawn",
+            "--start-method=spawn",
            path("bin/test_script.py"),
-            f"--touch_file_dir={self.test_dir}",
+            f"--touch-file-dir={self.test_dir}",
        ]
        launch.main(args)
@ -424,14 +424,14 @@ class ElasticLaunchTest(unittest.TestCase):
        world_size = nnodes * nproc_per_node
        args = [
            f"--nnodes={min_nodes}:{max_nodes}",
-            f"--nproc_per_node={nproc_per_node}",
+            f"--nproc-per-node={nproc_per_node}",
-            "--rdzv_backend=etcd",
+            "--rdzv-backend=etcd",
-            f"--rdzv_endpoint={self._etcd_endpoint}",
+            f"--rdzv-endpoint={self._etcd_endpoint}",
-            f"--rdzv_id={run_id}",
+            f"--rdzv-id={run_id}",
-            "--monitor_interval=1",
+            "--monitor-interval=1",
-            "--start_method=spawn",
+            "--start-method=spawn",
            path("bin/test_script.py"),
-            f"--touch_file_dir={self.test_dir}",
+            f"--touch-file-dir={self.test_dir}",
        ]
        procs = []
        for _ in range(nnodes - 1):
@ -466,11 +466,11 @@ class ElasticLaunchTest(unittest.TestCase):
        nproc_per_node = 4
        args = [
            f"--nnodes={nnodes}",
-            f"--nproc_per_node={nproc_per_node}",
+            f"--nproc-per-node={nproc_per_node}",
-            "--monitor_interval=1",
+            "--monitor-interval=1",
-            "--start_method=spawn",
+            "--start-method=spawn",
            path("bin/test_script.py"),
-            f"--touch_file_dir={self.test_dir}",
+            f"--touch-file-dir={self.test_dir}",
        ]
        agent_mock = Mock()
        agent_mock.run.return_value = RunResult(WorkerState.SUCCEEDED)
@ -492,12 +492,12 @@ class ElasticLaunchTest(unittest.TestCase):
        launch.main(
            [
-                "--run_path",
+                "--run-path",
                "--nnodes=1",
-                "--nproc_per_node=1",
+                "--nproc-per-node=1",
-                "--monitor_interval=1",
+                "--monitor-interval=1",
                path("bin/test_script_is_torchelastic_launched.py"),
-                f"--out_file={out_file}",
+                f"--out-file={out_file}",
            ]
        )
@ -519,7 +519,7 @@ class ElasticLaunchTest(unittest.TestCase):
            "argv",
            [
                path("bin/test_script_is_torchelastic_launched.py"),
-                f"--out_file={out_file}",
+                f"--out-file={out_file}",
            ],
        ):
            runpy.run_path(sys.argv[0], run_name="__main__")
@ -534,9 +534,9 @@ class ElasticLaunchTest(unittest.TestCase):
            "argv",
            [
                path("bin/test_script_init_method.py"),
-                f"--init_method=tcp://localhost:{port}",
+                f"--init-method=tcp://localhost:{port}",
                "--rank=0",
-                "--world_size=1",
+                "--world-size=1",
            ],
        ):
            runpy.run_path(sys.argv[0], run_name="__main__")
@ -547,14 +547,14 @@ class ElasticLaunchTest(unittest.TestCase):
        port = get_free_port()
        launch.main(
            [
-                "--run_path",
+                "--run-path",
                "--nnodes=1",
-                "--nproc_per_node=4",
+                "--nproc-per-node=4",
-                "--master_addr=localhost",
+                "--master-addr=localhost",
-                f"--master_port={port}",
+                f"--master-port={port}",
-                "--monitor_interval=1",
+                "--monitor-interval=1",
                path("bin/test_script_init_method.py"),
-                f"--init_method=tcp://localhost:{port}",
+                f"--init-method=tcp://localhost:{port}",
            ]
        )
        # nothing to validate, just make sure it runs
@ -574,7 +574,7 @@ class ElasticLaunchTest(unittest.TestCase):
            "argv",
            [
                path("bin/test_script_init_method.py"),
-                "--init_method=env://",
+                "--init-method=env://",
            ],
        ):
            runpy.run_path(sys.argv[0], run_name="__main__")
@ -585,14 +585,14 @@ class ElasticLaunchTest(unittest.TestCase):
        port = get_free_port()
        launch.main(
            [
-                "--run_path",
+                "--run-path",
                "--nnodes=1",
-                "--nproc_per_node=4",
+                "--nproc-per-node=4",
-                "--master_addr=localhost",
+                "--master-addr=localhost",
-                f"--master_port={port}",
+                f"--master-port={port}",
-                "--monitor_interval=1",
+                "--monitor-interval=1",
                path("bin/test_script_init_method.py"),
-                "--init_method=env://",
+                "--init-method=env://",
            ]
        )
        # nothing to validate, just make sure it runs
--- a/test/distributed/test_launcher.py
+++ b/test/distributed/test_launcher.py
@ -40,13 +40,13 @@ class TestDistributedLaunch(TestCase):
            master_port = sock.getsockname()[1]
        args = [
            f"--nnodes={nnodes}",
-            f"--nproc_per_node={nproc_per_node}",
+            f"--nproc-per-node={nproc_per_node}",
-            "--monitor_interval=1",
+            "--monitor-interval=1",
-            "--start_method=spawn",
+            "--start-method=spawn",
-            "--master_addr=localhost",
+            "--master-addr=localhost",
-            f"--master_port={master_port}",
+            f"--master-port={master_port}",
-            "--node_rank=0",
+            "--node-rank=0",
-            "--use_env",
+            "--use-env",
            path("bin/test_script.py"),
        ]
        launch.main(args)
--- a/test/edge/CMakeLists.txt
+++ b/test/edge/CMakeLists.txt
@ -9,12 +9,12 @@ file(GLOB_RECURSE all_python "${TORCH_ROOT}/torchgen/*.py")
 set(GEN_COMMAND
        "${PYTHON_EXECUTABLE}" -m torchgen.gen_executorch
        --source-path=${TEST_ROOT}
-        --install_dir=${OUTPUT_DIRECTORY}
+        --install-dir=${OUTPUT_DIRECTORY}
        --tags-path=${TORCH_ROOT}/aten/src/ATen/native/tags.yaml
-        --aten_yaml_path=${TORCH_ROOT}/aten/src/ATen/native/native_functions.yaml
+        --aten-yaml-path=${TORCH_ROOT}/aten/src/ATen/native/native_functions.yaml
-        --use_aten_lib
+        --use-aten-lib
-        --op_selection_yaml_path=${TEST_ROOT}/selected_operators.yaml
+        --op-selection-yaml-path=${TEST_ROOT}/selected_operators.yaml
-        --custom_ops_yaml_path=${TEST_ROOT}/custom_ops.yaml
+        --custom-ops-yaml-path=${TEST_ROOT}/custom_ops.yaml
        )
 set(GEN_COMMAND_sources
        ${OUTPUT_DIRECTORY}/RegisterCodegenUnboxedKernelsEverything.cpp
--- a/test/test_jit_fuser.py
+++ b/test/test_jit_fuser.py
@ -75,7 +75,7 @@ class TestFuser(JitTestCase):
            shell_env = os.environ.copy()
            shell_env['TMP'] = dname
            cmd = [sys.executable, os.path.basename(__file__), type(self).__name__ + '.test_abs_cpu']
-            legacy_jit_flag = '--jit_executor=legacy'
+            legacy_jit_flag = '--jit-executor=legacy'
            for v in sys.argv:
                if v == legacy_jit_flag:
                    cmd.append(legacy_jit_flag)
--- a/test/test_jit_fuser_legacy.py
+++ b/test/test_jit_fuser_legacy.py
@ -1,7 +1,7 @@
 # Owner(s): ["oncall: jit"]
 import sys
-sys.argv.append("--jit_executor=legacy")
+sys.argv.append("--jit-executor=legacy")
 from test_jit_fuser import *  # noqa: F403
 if __name__ == '__main__':
--- a/test/test_jit_legacy.py
+++ b/test/test_jit_legacy.py
@ -1,7 +1,7 @@
 # Owner(s): ["oncall: jit"]
 import sys
-sys.argv.append("--jit_executor=legacy")
+sys.argv.append("--jit-executor=legacy")
 from test_jit import *  # noqa: F403
 if __name__ == '__main__':
--- a/test/test_jit_profiling.py
+++ b/test/test_jit_profiling.py
@ -1,7 +1,7 @@
 # Owner(s): ["oncall: jit"]
 import sys
-sys.argv.append("--jit_executor=profiling")
+sys.argv.append("--jit-executor=profiling")
 from test_jit import *  # noqa: F403
 if __name__ == '__main__':
--- a/test/test_jit_simple.py
+++ b/test/test_jit_simple.py
@ -1,7 +1,7 @@
 # Owner(s): ["oncall: jit"]
 import sys
-sys.argv.append("--jit_executor=simple")
+sys.argv.append("--jit-executor=simple")
 from test_jit import *  # noqa: F403
 if __name__ == '__main__':
--- a/tools/code_analyzer/gen_operators_yaml.py
+++ b/tools/code_analyzer/gen_operators_yaml.py
@ -55,15 +55,15 @@ from torchgen.selective_build.selector import merge_kernel_metadata
 # There are a few main inputs to this application
 # -----------------------------------------------
 #
-# 1. Inference Root Operators (--root_ops): Root operators (called directly
+# 1. Inference Root Operators (--root-ops): Root operators (called directly
 #    from TorchScript) used by inference use-cases.
 #
-# 2. Training Root Operators (--training_root_ops): Root operators used
+# 2. Training Root Operators (--training-root-ops): Root operators used
 #    by training use-cases. Currently, this list is the list of all operators
 #    used by training, and not just the root operators. All Training ops are
 #    also considered for inference, so these are merged into inference ops.
 #
-# 3. Operator Depencency Graph (--dep_graph_yaml_path): A path to the
+# 3. Operator Dependency Graph (--dep-graph-yaml-path): A path to the
 #    operator dependency graph used to determine which operators depend on
 #    which other operators for correct functioning. This is used for
 #    generating the transitive closure of all the operators used by the
@ -71,12 +71,12 @@ from torchgen.selective_build.selector import merge_kernel_metadata
 #    For tracing based selective build, we don't need to perform this
 #    transitive cloure.
 #
-# 4. Model Metadata (--model_name, --model_versions, --model_assets,
+# 4. Model Metadata (--model-name, --model-versions, --model-assets,
-#    --model_backends): Self-descriptive. These are used to tell this
+#    --model-backends): Self-descriptive. These are used to tell this
 #    script which model operator lists to fetch from the Unified Model
 #    Build Metadata YAML file.
 #
-# 5. Unified Model YAML file (--models_yaml_path): A path to the Unified
+# 5. Unified Model YAML file (--models-yaml-path): A path to the Unified
 #    model YAML operator list file. This yaml file contains (for each
 #    model/version/asset/backend) the set of used root and traced
 #    operators. This is used to extract the actual set of operators
@ -490,45 +490,53 @@ def fill_output(output: Dict[str, object], options: object):
 def get_parser_options(parser: argparse.ArgumentParser) -> argparse.Namespace:
    parser.add_argument(
        "--root-ops",
        "--root_ops",
        help="A comma separated list of root operators used by the model",
        required=False,
    )
    parser.add_argument(
        "--training-root-ops",
        "--training_root_ops",
        help="A comma separated list of root operators used for training",
        required=False,
    )
    parser.add_argument(
        "--output-path",
        "--output_path",
        help="The location of the output yaml file.",
        required=True,
    )
    parser.add_argument(
        "--dep-graph-yaml-path",
        "--dep_graph_yaml_path",
        type=str,
        help="A path to the Operator Dependency Graph YAML file.",
        required=True,
    )
    parser.add_argument(
        "--model-name",
        "--model_name",
        type=str,
        help="The name of the model that uses the specified root operators.",
        required=True,
    )
    parser.add_argument(
        "--model-versions",
        "--model_versions",
        type=str,
        help="A comma separated list of model versions.",
        required=False,
    )
    parser.add_argument(
        "--model-assets",
        "--model_assets",
        type=str,
        help="A comma separate list of model asset names (if absent, defaults to all assets for this model).",
        required=False,
    )
    parser.add_argument(
        "--model-backends",
        "--model_backends",
        type=str,
        default="CPU",
@ -536,12 +544,14 @@ def get_parser_options(parser: argparse.ArgumentParser) -> argparse.Namespace:
        required=False,
    )
    parser.add_argument(
        "--models-yaml-path",
        "--models_yaml_path",
        type=str,
        help="The path to where the unified Mobile Model Config YAML resides.",
        required=True,
    )
    parser.add_argument(
        "--include-all-operators",
        "--include_all_operators",
        action="store_true",
        default=False,
@ -549,6 +559,7 @@ def get_parser_options(parser: argparse.ArgumentParser) -> argparse.Namespace:
        required=False,
    )
    parser.add_argument(
        "--rule-name",
        "--rule_name",
        type=str,
        help="The name of pt_operator_library rule resulting in this generation",
--- a/tools/code_analyzer/gen_oplist.py
+++ b/tools/code_analyzer/gen_oplist.py
@ -40,7 +40,7 @@ def throw_if_any_op_includes_overloads(selective_builder: SelectiveBuilder) -> N
        raise Exception(
            (
                "Operators that include all overloads are "
-                + "not allowed since --allow_include_all_overloads "
+                + "not allowed since --allow-include-all-overloads "
                + "was specified: {}"
            ).format(", ".join(ops))
        )
@ -99,6 +99,7 @@ def main(argv: List[Any]) -> None:
    """
    parser = argparse.ArgumentParser(description="Generate operator lists")
    parser.add_argument(
        "--output-dir",
        "--output_dir",
        help=(
            "The directory to store the output yaml files (selected_mobile_ops.h, "
@ -107,6 +108,7 @@ def main(argv: List[Any]) -> None:
        required=True,
    )
    parser.add_argument(
        "--model-file-list-path",
        "--model_file_list_path",
        help=(
            "Path to a file that contains the locations of individual "
@ -117,6 +119,7 @@ def main(argv: List[Any]) -> None:
        required=True,
    )
    parser.add_argument(
        "--allow-include-all-overloads",
        "--allow_include_all_overloads",
        help=(
            "Flag to allow operators that include all overloads. "
--- a/tools/generate_torch_version.py
+++ b/tools/generate_torch_version.py
@ -61,12 +61,13 @@ if __name__ == "__main__":
        description="Generate torch/version.py from build and environment metadata."
    )
    parser.add_argument(
        "--is-debug",
        "--is_debug",
        type=distutils.util.strtobool,
        help="Whether this build is debug mode or not.",
    )
-    parser.add_argument("--cuda_version", type=str)
+    parser.add_argument("--cuda-version", "--cuda_version", type=str)
-    parser.add_argument("--hip_version", type=str)
+    parser.add_argument("--hip-version", "--hip_version", type=str)
    args = parser.parse_args()
--- a/tools/jit/gen_unboxing.py
+++ b/tools/jit/gen_unboxing.py
@ -204,7 +204,11 @@ def main(args: List[str]) -> None:
        default="aten/src/ATen",
    )
    parser.add_argument(
-        "-d", "--install_dir", help="output directory", default="build/aten/src/ATen"
+        "-d",
        "--install-dir",
        "--install_dir",
        help="output directory",
        default="build/aten/src/ATen",
    )
    parser.add_argument(
        "-o",
@ -217,6 +221,7 @@ def main(args: List[str]) -> None:
        help="run without writing any files (still updates outputs)",
    )
    parser.add_argument(
        "--op-selection-yaml-path",
        "--op_selection_yaml_path",
        help="Provide a path to the operator selection (for custom build) YAML "
        "that contains the information about the set of selected operators "
@ -225,6 +230,7 @@ def main(args: List[str]) -> None:
        "The operator names also contain the namespace prefix (e.g. aten::)",
    )
    parser.add_argument(
        "--op-registration-allowlist",
        "--op_registration_allowlist",
        nargs="*",
        help="filter op registrations by the allowlist (if set); "
@ -232,6 +238,7 @@ def main(args: List[str]) -> None:
        "e.g.: aten::empty aten::conv2d ...",
    )
    parser.add_argument(
        "--TEST-ONLY-op-registration-allowlist-yaml-path",
        "--TEST_ONLY_op_registration_allowlist_yaml_path",
        help="Provide a path to the operator selection (for custom build) YAML "
        "which contains a list of operators. It is to serve testing purpose and "
--- a/tools/jit/test/test_gen_unboxing.py
+++ b/tools/jit/test/test_gen_unboxing.py
@ -17,7 +17,7 @@ class TestGenUnboxing(unittest.TestCase):
        mock_parse_native_yaml: NonCallableMock,
        mock_get_custom_build_selector: NonCallableMock,
    ) -> None:
-        args = ["--op_registration_allowlist=op1", "--op_selection_yaml_path=path2"]
+        args = ["--op-registration-allowlist=op1", "--op-selection-yaml-path=path2"]
        gen_unboxing.main(args)
        mock_get_custom_build_selector.assert_called_once_with(["op1"], "path2")
@ -32,8 +32,8 @@ class TestGenUnboxing(unittest.TestCase):
        temp_file.write(b"- aten::add.Tensor")
        temp_file.seek(0)
        args = [
-            f"--TEST_ONLY_op_registration_allowlist_yaml_path={temp_file.name}",
+            f"--TEST-ONLY-op-registration-allowlist-yaml-path={temp_file.name}",
-            "--op_selection_yaml_path=path2",
+            "--op-selection-yaml-path=path2",
        ]
        gen_unboxing.main(args)
        mock_get_custom_build_selector.assert_called_once_with(
@ -52,9 +52,9 @@ class TestGenUnboxing(unittest.TestCase):
        temp_file.write(b"- aten::add.Tensor")
        temp_file.seek(0)
        args = [
-            "--op_registration_allowlist=op1",
+            "--op-registration-allowlist=op1",
-            "--TEST_ONLY_op_registration_allowlist_yaml_path={temp_file.name}",
+            "--TEST-ONLY-op-registration-allowlist-yaml-path={temp_file.name}",
-            "--op_selection_yaml_path=path2",
+            "--op-selection-yaml-path=path2",
        ]
        gen_unboxing.main(args)
        mock_get_custom_build_selector.assert_called_once_with(["op1"], "path2")
--- a/tools/linter/adapters/clangtidy_linter.py
+++ b/tools/linter/adapters/clangtidy_linter.py
@ -204,6 +204,7 @@ def main() -> None:
        help="clang-tidy binary path",
    )
    parser.add_argument(
        "--build-dir",
        "--build_dir",
        required=True,
        help=(
--- a/tools/linter/clang_tidy/generate_build_files.py
+++ b/tools/linter/clang_tidy/generate_build_files.py
@ -59,7 +59,7 @@ def run_autogen() -> None:
            "aten/src/ATen/native/native_functions.yaml",
            "--tags-path",
            "aten/src/ATen/native/tags.yaml",
-            "--gen_lazy_ts_backend",
+            "--gen-lazy-ts-backend",
        ]
    )
--- a/tools/lite_interpreter/gen_selected_mobile_ops_header.py
+++ b/tools/lite_interpreter/gen_selected_mobile_ops_header.py
@ -147,6 +147,7 @@ def main() -> None:
    )
    parser.add_argument(
        "-p",
        "--yaml-file-path",
        "--yaml_file_path",
        type=str,
        required=True,
@ -154,6 +155,7 @@ def main() -> None:
    )
    parser.add_argument(
        "-o",
        "--output-file-path",
        "--output_file_path",
        type=str,
        required=True,
--- a/tools/onnx/update_default_opset_version.py
+++ b/tools/onnx/update_default_opset_version.py
@ -107,6 +107,9 @@ def main(args: Any) -> None:
 if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
-        "--skip_build", action="store_true", help="Skip building pytorch"
+        "--skip-build",
        "--skip_build",
        action="store_true",
        help="Skip building pytorch",
    )
    main(parser.parse_args())
--- a/tools/setup_helpers/generate_code.py
+++ b/tools/setup_helpers/generate_code.py
@ -138,6 +138,7 @@ def main() -> None:
        help="Root directory where to install files. Defaults to the current working directory.",
    )
    parser.add_argument(
        "--install-dir",
        "--install_dir",
        help=(
            "Deprecated. Use --gen-dir instead. The semantics are different, do not change "
@ -159,21 +160,25 @@ def main() -> None:
        help="Path to the YAML file that contains the list of operators to include for custom build.",
    )
    parser.add_argument(
        "--operators-yaml-path",
        "--operators_yaml_path",
        help="Path to the model YAML file that contains the list of operators to include for custom build.",
    )
    parser.add_argument(
        "--force-schema-registration",
        "--force_schema_registration",
        action="store_true",
        help="force it to generate schema-only registrations for ops that are not"
        "listed on --selected-op-list",
    )
    parser.add_argument(
        "--gen-lazy-ts-backend",
        "--gen_lazy_ts_backend",
        action="store_true",
        help="Enable generation of the torch::lazy TorchScript backend",
    )
    parser.add_argument(
        "--per-operator-headers",
        "--per_operator_headers",
        action="store_true",
        help="Build lazy tensor ts backend with per-operator ATen headers, must match how ATen was built",
--- a/tools/substitute.py
+++ b/tools/substitute.py
@ -7,7 +7,7 @@ if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--input-file")
    parser.add_argument("--output-file")
-    parser.add_argument("--install_dir")
+    parser.add_argument("--install-dir", "--install_dir")
    parser.add_argument("--replace", action="append", nargs=2)
    options = parser.parse_args()
--- a/torch/CMakeLists.txt
+++ b/torch/CMakeLists.txt
@ -387,9 +387,9 @@ add_custom_command(
    "${PYTHON_EXECUTABLE}" -c \"from pathlib import Path\; Path('${TOOLS_PATH}/generate_torch_version.py').touch()\"
  COMMAND
    "${PYTHON_EXECUTABLE}" ${TOOLS_PATH}/generate_torch_version.py
-      --is_debug=${TORCH_VERSION_DEBUG}
+      --is-debug=${TORCH_VERSION_DEBUG}
-      --cuda_version=${CUDA_VERSION}
+      --cuda-version=${CUDA_VERSION}
-      --hip_version=${HIP_VERSION}
+      --hip-version=${HIP_VERSION}
  DEPENDS ${TOOLS_PATH}/generate_torch_version.py
  WORKING_DIRECTORY ${TORCH_ROOT}
 )
--- a/torch/ao/pruning/_experimental/data_sparsifier/benchmarks/README.md
+++ b/torch/ao/pruning/_experimental/data_sparsifier/benchmarks/README.md
@ -50,7 +50,7 @@ The benchmark codes depend on the [DLRM codebase](https://github.com/facebookres
 ### **Disk savings**
 ```
-python evaluate_disk_savings.py --model_path=<path_to_model_checkpoint> --sparsified_model_dump_path=<path_to_dump_sparsified_models>
+python evaluate_disk_savings.py --model-path=<path_to_model_checkpoint> --sparsified-model-dump-path=<path_to_dump_sparsified_models>
 ```
 Running this script should dump
@ -62,13 +62,13 @@ Running this script should dump
 ### **Model Quality**
 ```
-python evaluate_model_metrics.py --raw_data_file=<path_to_raw_data_txt_file> --processed_data_file=<path_to_kaggleAdDisplayChallenge_processed.npz> --sparse_model_metadata=<path_to_sparse_model_metadata_csv>
+python evaluate_model_metrics.py --raw-data-file=<path_to_raw_data_txt_file> --processed-data-file=<path_to_kaggleAdDisplayChallenge_processed.npz> --sparse-model-metadata=<path_to_sparse_model_metadata_csv>
 ```
 Running this script should dump ```sparse_model_metrics.csv``` that contains evaluation metrics for all sparsified models.
 ### **Model forward time**:
 ```
-python evaluate_forward_time.py --raw_data_file=<path_to_raw_data_txt_file> --processed_data_file=<path_to_kaggleAdDisplayChallenge_processed.npz> --sparse_model_metadata=<path_to_sparse_model_metadata_csv>
+python evaluate_forward_time.py --raw-data-file=<path_to_raw_data_txt_file> --processed-data-file=<path_to_kaggleAdDisplayChallenge_processed.npz> --sparse-model-metadata=<path_to_sparse_model_metadata_csv>
 ```
 Running this script should dump ```dlrm_forward_time_info.csv``` that contains forward time for all sparsified models with and without torch.sparse in the forward pass.
--- a/torch/ao/pruning/_experimental/data_sparsifier/benchmarks/evaluate_disk_savings.py
+++ b/torch/ao/pruning/_experimental/data_sparsifier/benchmarks/evaluate_disk_savings.py
@ -152,8 +152,8 @@ def sparsify_model(path_to_model, sparsified_model_dump_path):
 if __name__ == "__main__":
    parser = argparse.ArgumentParser()
-    parser.add_argument('--model_path', type=str)
+    parser.add_argument('--model-path', '--model_path', type=str)
-    parser.add_argument('--sparsified_model_dump_path', type=str)
+    parser.add_argument('--sparsified-model-dump-path', '--sparsified_model_dump_path', type=str)
    args = parser.parse_args()
    sparsify_model(args.model_path, args.sparsified_model_dump_path)
--- a/torch/ao/pruning/_experimental/data_sparsifier/benchmarks/evaluate_forward_time.py
+++ b/torch/ao/pruning/_experimental/data_sparsifier/benchmarks/evaluate_forward_time.py
@ -85,9 +85,9 @@ def measure_forward_pass(sparse_model_metadata, device, sparse_dlrm, **batch):
 if __name__ == "__main__":
    parser = argparse.ArgumentParser()
-    parser.add_argument('--raw_data_file', type=str)
+    parser.add_argument('--raw-data-file', '--raw_data_file', type=str)
-    parser.add_argument('--processed_data_file', type=str)
+    parser.add_argument('--processed-data-file', '--processed_data_file', type=str)
-    parser.add_argument('--sparse_model_metadata', type=str)
+    parser.add_argument('--sparse-model-metadata', '--sparse_model_metadata', type=str)
    args = parser.parse_args()
--- a/torch/ao/pruning/_experimental/data_sparsifier/benchmarks/evaluate_model_metrics.py
+++ b/torch/ao/pruning/_experimental/data_sparsifier/benchmarks/evaluate_model_metrics.py
@ -119,9 +119,9 @@ def evaluate_metrics(test_dataloader, sparse_model_metadata):
 if __name__ == "__main__":
    parser = argparse.ArgumentParser()
-    parser.add_argument('--raw_data_file', type=str)
+    parser.add_argument('--raw-data-file', '--raw_data_file', type=str)
-    parser.add_argument('--processed_data_file', type=str)
+    parser.add_argument('--processed-data-file', '--processed_data_file', type=str)
-    parser.add_argument('--sparse_model_metadata', type=str)
+    parser.add_argument('--sparse-model-metadata', '--sparse_model_metadata', type=str)
    args = parser.parse_args()
--- a/torch/autograd/profiler.py
+++ b/torch/autograd/profiler.py
@ -554,7 +554,7 @@ class emit_itt:
    It is useful when running the program under Intel(R) VTune Profiler::
-        vtune <--vtune_flags> <regular command here>
+        vtune <--vtune-flags> <regular command here>
    The Instrumentation and Tracing Technology (ITT) API enables your application to generate and
    control the collection of trace data during its execution across different Intel tools.
--- a/torch/backends/xeon/run_cpu.py
+++ b/torch/backends/xeon/run_cpu.py
@ -60,20 +60,20 @@ Single instance inference
 ::
-   python -m torch.backends.xeon.run_cpu --throughput_mode script.py args
+   python -m torch.backends.xeon.run_cpu --throughput-mode script.py args
 2. Run single-instance inference on a single CPU node.
 ::
-   python -m torch.backends.xeon.run_cpu --node_id 1 script.py args
+   python -m torch.backends.xeon.run_cpu --node-id 1 script.py args
 Multi-instance inference
 ------------------------
 1. Multi-instance
   By default this tool runs one process per node. If you want to set the instance numbers and core per instance,
-   --ninstances and  --ncores_per_instance should be set.
+   --ninstances and  --ncores-per-instance should be set.
 ::
@ -83,7 +83,7 @@ Multi-instance inference
 ::
-   python -m torch.backends.xeon.run_cpu --ninstances 14 --ncores_per_instance 4 python_script args
+   python -m torch.backends.xeon.run_cpu --ninstances 14 --ncores-per-instance 4 python_script args
 2. Run single-instance inference among multiple instances.
   By default, runs all ninstances. If you want to independently run a single instance among ninstances, specify rank.
@ -105,7 +105,7 @@ Multi-instance inference
 ::
-   python -m torch.backends.xeon.run_cpu --core_list "0, 1, 2, 3" --ninstances 2 --ncores_per_instance 2
+   python -m torch.backends.xeon.run_cpu --core-list "0, 1, 2, 3" --ninstances 2 --ncores-per-instance 2
   --rank 0 python_script args
 3. To look up what optional arguments this module offers:
@ -117,7 +117,7 @@ Multi-instance inference
 Memory allocator
 ----------------
-"--enable_tcmalloc" and "--enable_jemalloc" can be used to enable different memory allcator.
+"--enable-tcmalloc" and "--enable-jemalloc" can be used to enable different memory allocators.
 """
@ -233,8 +233,8 @@ class _CPUinfo():
                numa_ids.append(numa_id)
        if len(numa_ids) > 1:
            logger.warning(f"Numa Aware: cores:{str(core_list)} on different NUMA nodes:{str(numa_ids)}. To avoid \
-this behavior, please use --ncores_per_instance knob to make sure number of cores is divisible by --ncores_per_\
+this behavior, please use --ncores-per-instance knob to make sure number of cores is divisible by --ncores-per-\
-instance. Alternatively, please use --skip_cross_node_cores knob.")
+instance. Alternatively, please use --skip-cross-node-cores knob.")
        if len(numa_ids) == 0:
            raise RuntimeError("invalid number of NUMA nodes; please make sure numa_ids >= 1")
        return numa_ids
@ -376,7 +376,7 @@ Value applied: {os.environ[env_name]}. Value ignored: {env_value}")
        if args.core_list:  # user specify what cores will be used by params
            cores = [int(x) for x in args.core_list.split(",")]
            if args.ncores_per_instance == -1:
-                raise RuntimeError("please specify the \"--ncores_per_instance\" if you have pass the --core_list params")
+                raise RuntimeError("please specify the \"--ncores-per-instance\" if you have pass the --core-list params")
            elif args.ninstances > 1 and args.ncores_per_instance * args.ninstances < len(cores):
                logger.warning(f"only first {args.ncores_per_instance * args.ninstances} cores will be used, \
 but you specify {len(cores)} cores in core_list")
@ -417,17 +417,17 @@ please make sure ninstances <= total_cores)")
                    if args.ncores_per_instance > ncore_per_node:
                        # too many ncores_per_instance to skip cross-node cores
                        logger.warning("there are {} core(s) per socket, but you specify {} ncores_per_instance and \
-skip_cross_node_cores. Please make sure --ncores_per_instance < core(s) per \
+skip_cross_node_cores. Please make sure --ncores-per-instance < core(s) per \
 socket".format(ncore_per_node, args.ncores_per_instance))
                        exit(-1)
                    elif num_leftover_cores == 0:
                        # aren't any cross-node cores
-                        logger.info('--skip_cross_node_cores is set, but there are no cross-node cores.')
+                        logger.info('--skip-cross-node-cores is set, but there are no cross-node cores.')
                        args.ninstances = len(cores) // args.ncores_per_instance
                    else:
                        # skip cross-node cores
                        if args.ninstances != -1:
-                            logger.warning('--skip_cross_node_cores is exclusive to --ninstances. --ninstances \
+                            logger.warning('--skip-cross-node-cores is exclusive to --ninstances. --ninstances \
 won\'t take effect even if it is set explicitly.')
                        i = 1
@ -442,15 +442,15 @@ won\'t take effect even if it is set explicitly.')
                if args.ninstances * args.ncores_per_instance > len(cores):
                    raise RuntimeError("Please make sure ninstances * ncores_per_instance <= total_cores")
            if args.latency_mode:
-                logger.warning("--latency_mode is exclusive to --ninstances, --ncores_per_instance, --node_id and \
+                logger.warning("--latency-mode is exclusive to --ninstances, --ncores-per-instance, --node-id and \
--use_logical_core. They won't take effect even they are set explicitly.")
+--use-logical-core. They won't take effect even they are set explicitly.")
                args.ncores_per_instance = 4
                cores = self.cpuinfo.get_all_physical_cores()
                args.ninstances = len(cores) // args.ncores_per_instance
            if args.throughput_mode:
-                logger.warning("--throughput_mode is exclusive to --ninstances, --ncores_per_instance, --node_id and \
+                logger.warning("--throughput-mode is exclusive to --ninstances, --ncores-per-instance, --node-id and \
--use_logical_core. They won't take effect even they are set explicitly.")
+--use-logical-core. They won't take effect even they are set explicitly.")
                args.ninstances = self.cpuinfo.node_nums
                cores = self.cpuinfo.get_all_physical_cores()
                args.ncores_per_instance = len(cores) // args.ninstances
@ -531,48 +531,48 @@ def _add_memory_allocator_params(parser):
    group = parser.add_argument_group("Memory Allocator Parameters")
    # allocator control
-    group.add_argument("--enable_tcmalloc", action="store_true", default=False,
+    group.add_argument("--enable-tcmalloc", "--enable_tcmalloc", action="store_true", default=False,
                       help="Enable tcmalloc allocator")
-    group.add_argument("--enable_jemalloc", action="store_true", default=False,
+    group.add_argument("--enable-jemalloc", "--enable_jemalloc", action="store_true", default=False,
                       help="Enable jemalloc allocator")
-    group.add_argument("--use_default_allocator", action="store_true", default=False,
+    group.add_argument("--use-default-allocator", "--use_default_allocator", action="store_true", default=False,
                       help="Use default memory allocator")
 def _add_multi_instance_params(parser):
    group = parser.add_argument_group("Multi-instance Parameters")
    # multi-instance control
-    group.add_argument("--ncores_per_instance", metavar="\b", default=-1, type=int,
+    group.add_argument("--ncores-per-instance", "--ncores_per_instance", metavar="\b", default=-1, type=int,
                       help="Cores per instance")
    group.add_argument("--ninstances", metavar="\b", default=-1, type=int,
                       help="For multi-instance, you should give the cores number you used for per instance.")
-    group.add_argument("--skip_cross_node_cores", action='store_true', default=False,
+    group.add_argument("--skip-cross-node-cores", "--skip_cross_node_cores", action='store_true', default=False,
-                       help="If specified --ncores_per_instance, skips cross-node cores.")
+                       help="If specified --ncores-per-instance, skips cross-node cores.")
    group.add_argument("--rank", metavar="\b", default="-1", type=int,
                       help="Specify instance index to assign ncores_per_instance for rank; \
 otherwise ncores_per_instance will be assigned sequentially to ninstances. Please refer to \
 https://github.com/intel/intel-extension-for-pytorch/blob/master/docs/tutorials/performance_tuning/launch_script.md")
-    group.add_argument("--latency_mode", action="store_true", default=False,
+    group.add_argument("--latency-mode", "--latency_mode", action="store_true", default=False,
                       help="By detault 4 core per instance and use all physical cores")
-    group.add_argument("--throughput_mode", action="store_true", default=False,
+    group.add_argument("--throughput-mode", "--throughput_mode", action="store_true", default=False,
                       help="By default one instance per node and use all physical cores")
-    group.add_argument("--node_id", metavar="\b", default=-1, type=int,
+    group.add_argument("--node-id", "--node_id", metavar="\b", default=-1, type=int,
                       help="node id for multi-instance, by default all nodes will be used")
-    group.add_argument("--use_logical_core", action="store_true", default=False,
+    group.add_argument("--use-logical-core", "--use_logical_core", action="store_true", default=False,
                       help="Whether only use physical cores")
-    group.add_argument("--disable_numactl", action="store_true", default=False,
+    group.add_argument("--disable-numactl", "--disable_numactl", action="store_true", default=False,
                       help="Disable numactl")
-    group.add_argument("--core_list", metavar="\b", default=None, type=str,
+    group.add_argument("--core-list", "--core_list", metavar="\b", default=None, type=str,
                       help="Specify the core list as \"core_id, core_id, ....\", otherwise, all the cores will be used.")
-    group.add_argument("--log_path", metavar="\b", default="", type=str,
+    group.add_argument("--log-path", "--log_path", metavar="\b", default="", type=str,
                       help="The log file directory. Default path is "", which means disable logging to files.")
-    group.add_argument("--log_file_prefix", metavar="\b", default="run", type=str,
+    group.add_argument("--log-file-prefix", "--log_file_prefix", metavar="\b", default="run", type=str,
                       help="log file prefix")
 def _add_kmp_iomp_params(parser):
    group = parser.add_argument_group("IOMP Parameters")
-    group.add_argument("--disable_iomp", action="store_true", default=False,
+    group.add_argument("--disable-iomp", "--disable_iomp", action="store_true", default=False,
                       help="By default, we use Intel OpenMP and libiomp5.so will be add to LD_PRELOAD")
 def create_args(parser=None):
@ -580,7 +580,7 @@ def create_args(parser=None):
    Helper function parsing the command line options
    @retval ArgumentParser
    """
-    parser.add_argument("--multi_instance", action="store_true", default=False,
+    parser.add_argument("--multi-instance", "--multi_instance", action="store_true", default=False,
                        help="Enable multi-instance, by default one instance per node")
    parser.add_argument("-m", "--module", default=False, action="store_true",
@ -588,7 +588,7 @@ def create_args(parser=None):
                             "as a python module, executing with the same behavior as"
                             "\"python -m\".")
-    parser.add_argument("--no_python", default=False, action="store_true",
+    parser.add_argument("--no-python", "--no_python", default=False, action="store_true",
                        help="Do not prepend the --program script with \"python\" - just exec "
                             "it directly. Useful when the script is not a Python script.")
@ -618,7 +618,7 @@ def main(args):
        raise RuntimeError("Either args.latency_mode or args.throughput_mode should be set")
    if not args.no_python and not args.program.endswith(".py"):
-        raise RuntimeError("For non Python script, you should use \"--no_python\" parameter.")
+        raise RuntimeError("For non Python script, you should use \"--no-python\" parameter.")
    # Verify LD_PRELOAD
    if "LD_PRELOAD" in os.environ:
@ -653,7 +653,7 @@ if __name__ == "__main__":
                                        "\n   >>> python -m torch.backends.xeon.run_cpu python_script args \n"
                                        "\n2. multi-instance \n"
                                        "\n   >>> python -m torch.backends.xeon.run_cpu --ninstances xxx "
-                                        "--ncores_per_instance xx python_script args\n"
+                                        "--ncores-per-instance xx python_script args\n"
                                        "\n############################################################################# \n",
                                        formatter_class=RawTextHelpFormatter)
    create_args(parser)
--- a/torch/csrc/jit/tensorexpr/codegen_external.py
+++ b/torch/csrc/jit/tensorexpr/codegen_external.py
@ -80,13 +80,15 @@ const static RegisterNNCExternalFunction nnc_{name}(
 def main() -> None:
    parser = argparse.ArgumentParser(
        description='Generate annotated_fn_args script')
-    parser.add_argument('--native_functions',
+    parser.add_argument('--native-functions',
                        '--native_functions',
                        help='path to native_functions.yaml',
                        default='../../../../aten/src/ATen/native/native_functions.yaml')
    parser.add_argument('--tags',
                        help='path to tags.yaml',
                        default='../../../../aten/src/ATen/native/tags.yaml')
-    parser.add_argument('--template_path',
+    parser.add_argument('--template-path',
                        '--template_path',
                        help='path to external_functions_codegen_template.cpp',
                        default='../../../../tools/jit/templates/external_functions_codegen_template.cpp')
    args = parser.parse_args()
--- a/torch/distributed/elastic/agent/server/local_elastic_agent.py
+++ b/torch/distributed/elastic/agent/server/local_elastic_agent.py
@ -113,7 +113,7 @@ class LocalElasticAgent(SimpleElasticAgent):
                        role="trainer",
                        local_world_size=nproc_per_process,
                        entrypoint="/usr/local/bin/trainer",
-                        args=("--trainer_args", "foobar"),
+                        args=("--trainer-args", "foobar"),
                        ...<OTHER_PARAMS...>)
            agent = LocalElasticAgent(spec)
            results = agent.run()
--- a/torch/distributed/elastic/rendezvous/static_tcp_rendezvous.py
+++ b/torch/distributed/elastic/rendezvous/static_tcp_rendezvous.py
@ -83,18 +83,18 @@ def create_rdzv_handler(params: RendezvousParameters) -> RendezvousHandler:
    if "rank" not in params.config:
        raise ValueError(
            "rank is absent in RendezvousParameters."
-            "Try add --node_rank to the cmd request"
+            " Try add --node-rank to the cmd request"
        )
    endpoint = params.endpoint.strip()
    if not endpoint:
        raise ValueError(
            "endpoint is absent in RendezvousParameters"
-            "Try add --master_port and --master_addr to the cmd request"
+            ". Try add --master-port and --master-addr to the cmd request"
        )
    master_addr, master_port = parse_rendezvous_endpoint(endpoint, -1)
    if master_port == -1:
        raise ValueError(
-            f"Port is absent in endpoint: {endpoint}. Try launching with --master_port"
+            f"Port is absent in endpoint: {endpoint}. Try launching with --master-port"
        )
    world_size = params.max_nodes
    rank = cast(int, params.config.get("rank"))
--- a/torch/distributed/launch.py
+++ b/torch/distributed/launch.py
@ -19,7 +19,7 @@ aggregated communication bandwidth.
 In both cases of single-node distributed training or multi-node distributed
 training, this utility will launch the given number of processes per node
-(``--nproc_per_node``). If used for GPU training, this number needs to be less
+(``--nproc-per-node``). If used for GPU training, this number needs to be less
 or equal to the number of GPUs on the current system (``nproc_per_node``),
 and each process will be operating on a single GPU from *GPU 0 to
 GPU (nproc_per_node - 1)*.
@ -30,7 +30,7 @@ GPU (nproc_per_node - 1)*.
 ::
-    python -m torch.distributed.launch --nproc_per_node=NUM_GPUS_YOU_HAVE
+    python -m torch.distributed.launch --nproc-per-node=NUM_GPUS_YOU_HAVE
               YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3 and all other
               arguments of your training script)
@ -41,18 +41,18 @@ Node 1: *(IP: 192.168.1.1, and has a free port: 1234)*
 ::
-    python -m torch.distributed.launch --nproc_per_node=NUM_GPUS_YOU_HAVE
+    python -m torch.distributed.launch --nproc-per-node=NUM_GPUS_YOU_HAVE
-               --nnodes=2 --node_rank=0 --master_addr="192.168.1.1"
+               --nnodes=2 --node-rank=0 --master-addr="192.168.1.1"
-               --master_port=1234 YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3
+               --master-port=1234 YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3
               and all other arguments of your training script)
 Node 2:
 ::
-    python -m torch.distributed.launch --nproc_per_node=NUM_GPUS_YOU_HAVE
+    python -m torch.distributed.launch --nproc-per-node=NUM_GPUS_YOU_HAVE
-               --nnodes=2 --node_rank=1 --master_addr="192.168.1.1"
+               --nnodes=2 --node-rank=1 --master-addr="192.168.1.1"
-               --master_port=1234 YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3
+               --master-port=1234 YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3
               and all other arguments of your training script)
 3. To look up what optional arguments this module offers:
@ -70,7 +70,7 @@ the NCCL distributed backend. Thus NCCL backend is the recommended backend to
 use for GPU training.
 2. In your training program, you must parse the command-line argument:
-``--local_rank=LOCAL_PROCESS_RANK``, which will be provided by this module.
+``--local-rank=LOCAL_PROCESS_RANK``, which will be provided by this module.
 If your training program uses GPUs, you should ensure that your code only
 runs on the GPU device of LOCAL_PROCESS_RANK. This can be done by:
@ -81,7 +81,7 @@ Parsing the local_rank argument
    >>> # xdoctest: +SKIP
    >>> import argparse
    >>> parser = argparse.ArgumentParser()
-    >>> parser.add_argument("--local_rank", type=int)
+    >>> parser.add_argument("--local-rank", "--local_rank", type=int)
    >>> args = parser.parse_args()
 Set your device to local rank using either
@ -128,9 +128,9 @@ utility
 5. Another way to pass ``local_rank`` to the subprocesses via environment variable
 ``LOCAL_RANK``. This behavior is enabled when you launch the script with
-``--use_env=True``. You must adjust the subprocess example above to replace
+``--use-env``. You must adjust the subprocess example above to replace
 ``args.local_rank`` with ``os.environ['LOCAL_RANK']``; the launcher
-will not pass ``--local_rank`` when you specify this flag.
+will not pass ``--local-rank`` when you specify this flag.
 .. warning::
@ -156,13 +156,14 @@ logger = logging.getLogger(__name__)
 def parse_args(args):
    parser = get_args_parser()
    parser.add_argument(
        "--use-env",
        "--use_env",
        default=False,
        action="store_true",
        help="Use environment variable to pass "
        "'local rank'. For legacy reasons, the default value is False. "
        "If set to True, the script will not pass "
-        "--local_rank as argument, and will instead set LOCAL_RANK.",
+        "--local-rank as argument, and will instead set LOCAL_RANK.",
    )
    return parser.parse_args(args)
@ -170,8 +171,8 @@ def parse_args(args):
 def launch(args):
    if args.no_python and not args.use_env:
        raise ValueError(
-            "When using the '--no_python' flag,"
+            "When using the '--no-python' flag,"
-            " you must also set the '--use_env' flag."
+            " you must also set the '--use-env' flag."
        )
    run(args)
@ -180,8 +181,8 @@ def main(args=None):
    warnings.warn(
        "The module torch.distributed.launch is deprecated\n"
        "and will be removed in future. Use torchrun.\n"
-        "Note that --use_env is set by default in torchrun.\n"
+        "Note that --use-env is set by default in torchrun.\n"
-        "If your script expects `--local_rank` argument to be set, please\n"
+        "If your script expects `--local-rank` argument to be set, please\n"
        "change it to read from `os.environ['LOCAL_RANK']` instead. See \n"
        "https://pytorch.org/docs/stable/distributed.html#launch-utility for \n"
        "further instructions\n",
--- a/torch/distributed/launcher/api.py
+++ b/torch/distributed/launcher/api.py
@ -165,12 +165,12 @@ def _get_addr_and_port(
    endpoint = endpoint.strip()
    if not endpoint:
        raise ValueError(
-            "Endpoint is missing in endpoint. Try to add --master_addr and --master_port"
+            "Endpoint is missing in endpoint. Try to add --master-addr and --master-port"
        )
    master_addr, master_port = parse_rendezvous_endpoint(endpoint, default_port=-1)
    if master_port == -1:
        raise ValueError(
-            f"port is missing in endpoint: {endpoint}. Try to specify --master_port"
+            f"port is missing in endpoint: {endpoint}. Try to specify --master-port"
        )
    return (master_addr, master_port)
--- a/torch/distributed/run.py
+++ b/torch/distributed/run.py
@ -30,11 +30,11 @@ Transitioning from torch.distributed.launch to torchrun
 ``torchrun`` supports the same arguments as ``torch.distributed.launch`` **except**
-for ``--use_env`` which is now deprecated. To migrate from ``torch.distributed.launch``
+for ``--use-env`` which is now deprecated. To migrate from ``torch.distributed.launch``
 to ``torchrun`` follow these steps:
 1.  If your training script is already reading ``local_rank`` from the ``LOCAL_RANK`` environment variable.
-    Then you need simply omit the ``--use_env`` flag, e.g.:
+    Then you need simply omit the ``--use-env`` flag, e.g.:
    +--------------------------------------------------------------------+--------------------------------------------+
    |         ``torch.distributed.launch``                               |                ``torchrun``                |
@ -42,11 +42,11 @@ to ``torchrun`` follow these steps:
    |                                                                    |                                            |
    | .. code-block:: shell-session                                      | .. code-block:: shell-session              |
    |                                                                    |                                            |
-    |    $ python -m torch.distributed.launch --use_env train_script.py  |    $ torchrun train_script.py              |
+    |    $ python -m torch.distributed.launch --use-env train_script.py  |    $ torchrun train_script.py              |
    |                                                                    |                                            |
    +--------------------------------------------------------------------+--------------------------------------------+
-2.  If your training script reads local rank from a ``--local_rank`` cmd argument.
+2.  If your training script reads local rank from a ``--local-rank`` cmd argument.
    Change your training script to read from the ``LOCAL_RANK`` environment variable as
    demonstrated by the following code snippet:
@ -59,7 +59,7 @@ to ``torchrun`` follow these steps:
    |                                                       |                                                    |
    |    import argparse                                    |     import os                                      |
    |    parser = argparse.ArgumentParser()                 |     local_rank = int(os.environ["LOCAL_RANK"])     |
-    |    parser.add_argument("--local_rank", type=int)      |                                                    |
+    |    parser.add_argument("--local-rank", type=int)      |                                                    |
    |    args = parser.parse_args()                         |                                                    |
    |                                                       |                                                    |
    |    local_rank = args.local_rank                       |                                                    |
@ -85,7 +85,7 @@ Single-node multi-worker
    torchrun
        --standalone
        --nnodes=1
-        --nproc_per_node=$NUM_TRAINERS
+        --nproc-per-node=$NUM_TRAINERS
        YOUR_TRAINING_SCRIPT.py (--arg1 ... train script args...)
 Stacked single-node multi-worker
@ -94,18 +94,18 @@ Stacked single-node multi-worker
 To run multiple instances (separate jobs) of single-node, multi-worker on the
 same host, we need to make sure that each instance (job) is
 setup on different ports to avoid port conflicts (or worse, two jobs being merged
-as a single job). To do this you have to run with ``--rdzv_backend=c10d``
+as a single job). To do this you have to run with ``--rdzv-backend=c10d``
-and specify a different port by setting ``--rdzv_endpoint=localhost:$PORT_k``.
+and specify a different port by setting ``--rdzv-endpoint=localhost:$PORT_k``.
 For ``--nodes=1``, its often convenient to let ``torchrun`` pick a free random
 port automatically instead of manually assgining different ports for each run.
 ::
    torchrun
-        --rdzv_backend=c10d
+        --rdzv-backend=c10d
-        --rdzv_endpoint=localhost:0
+        --rdzv-endpoint=localhost:0
        --nnodes=1
-        --nproc_per_node=$NUM_TRAINERS
+        --nproc-per-node=$NUM_TRAINERS
        YOUR_TRAINING_SCRIPT.py (--arg1 ... train script args...)
@ -116,11 +116,11 @@ Fault tolerant (fixed sized number of workers, no elasticity, tolerates 3 failur
    torchrun
        --nnodes=$NUM_NODES
-        --nproc_per_node=$NUM_TRAINERS
+        --nproc-per-node=$NUM_TRAINERS
-        --max_restarts=3
+        --max-restarts=3
-        --rdzv_id=$JOB_ID
+        --rdzv-id=$JOB_ID
-        --rdzv_backend=c10d
+        --rdzv-backend=c10d
-        --rdzv_endpoint=$HOST_NODE_ADDR
+        --rdzv-endpoint=$HOST_NODE_ADDR
        YOUR_TRAINING_SCRIPT.py (--arg1 ... train script args...)
 ``HOST_NODE_ADDR``, in form <host>[:<port>] (e.g. node1.example.com:29400), specifies the node and
@ -137,11 +137,11 @@ Elastic (``min=1``, ``max=4``, tolerates up to 3 membership changes or failures)
    torchrun
        --nnodes=1:4
-        --nproc_per_node=$NUM_TRAINERS
+        --nproc-per-node=$NUM_TRAINERS
-        --max_restarts=3
+        --max-restarts=3
-        --rdzv_id=$JOB_ID
+        --rdzv-id=$JOB_ID
-        --rdzv_backend=c10d
+        --rdzv-backend=c10d
-        --rdzv_endpoint=$HOST_NODE_ADDR
+        --rdzv-endpoint=$HOST_NODE_ADDR
        YOUR_TRAINING_SCRIPT.py (--arg1 ... train script args...)
 ``HOST_NODE_ADDR``, in form <host>[:<port>] (e.g. node1.example.com:29400), specifies the node and
@ -156,10 +156,10 @@ Note on rendezvous backend
 For multi-node training you need to specify:
-1. ``--rdzv_id``: A unique job id (shared by all nodes participating in the job)
+1. ``--rdzv-id``: A unique job id (shared by all nodes participating in the job)
-2. ``--rdzv_backend``: An implementation of
+2. ``--rdzv-backend``: An implementation of
   :py:class:`torch.distributed.elastic.rendezvous.RendezvousHandler`
-3. ``--rdzv_endpoint``: The endpoint where the rendezvous backend is running; usually in form
+3. ``--rdzv-endpoint``: The endpoint where the rendezvous backend is running; usually in form
   ``host:port``.
 Currently ``c10d`` (recommended), ``etcd-v2``, and ``etcd`` (legacy)  rendezvous backends are
@ -221,7 +221,7 @@ The following environment variables are made available to you in your script:
   of the worker is specified in the ``WorkerSpec``.
 5. ``LOCAL_WORLD_SIZE`` - The local world size (e.g. number of workers running locally); equals to
-   ``--nproc_per_node`` specified on ``torchrun``.
+   ``--nproc-per-node`` specified on ``torchrun``.
 6. ``WORLD_SIZE`` - The world size (total number of workers in the job).
@ -246,7 +246,7 @@ Deployment
 ------------
 1. (Not needed for the C10d backend) Start the rendezvous backend server and get the endpoint (to be
-   passed as ``--rdzv_endpoint`` to the launcher script)
+   passed as ``--rdzv-endpoint`` to the launcher script)
 2. Single-node multi-worker: Start the launcher on the host to start the agent process which
   creates and monitors a local worker group.
@ -406,6 +406,7 @@ def get_args_parser() -> ArgumentParser:
        help="Number of nodes, or the range of nodes in form <minimum_nodes>:<maximum_nodes>.",
    )
    parser.add_argument(
        "--nproc-per-node",
        "--nproc_per_node",
        action=env,
        type=str,
@ -418,6 +419,7 @@ def get_args_parser() -> ArgumentParser:
    #
    parser.add_argument(
        "--rdzv-backend",
        "--rdzv_backend",
        action=env,
        type=str,
@ -425,6 +427,7 @@ def get_args_parser() -> ArgumentParser:
        help="Rendezvous backend.",
    )
    parser.add_argument(
        "--rdzv-endpoint",
        "--rdzv_endpoint",
        action=env,
        type=str,
@ -432,6 +435,7 @@ def get_args_parser() -> ArgumentParser:
        help="Rendezvous backend endpoint; usually in form <host>:<port>.",
    )
    parser.add_argument(
        "--rdzv-id",
        "--rdzv_id",
        action=env,
        type=str,
@ -439,6 +443,7 @@ def get_args_parser() -> ArgumentParser:
        help="User-defined group id.",
    )
    parser.add_argument(
        "--rdzv-conf",
        "--rdzv_conf",
        action=env,
        type=str,
@ -450,7 +455,7 @@ def get_args_parser() -> ArgumentParser:
        action=check_env,
        help="Start a local standalone rendezvous backend that is represented by a C10d TCP store "
        "on port 29400. Useful when launching single-node, multi-worker job. If specified "
-        "--rdzv_backend, --rdzv_endpoint, --rdzv_id are auto-assigned; any explicitly set values "
+        "--rdzv-backend, --rdzv-endpoint, --rdzv-id are auto-assigned; any explicitly set values "
        "are ignored.",
    )
@ -459,6 +464,7 @@ def get_args_parser() -> ArgumentParser:
    #
    parser.add_argument(
        "--max-restarts",
        "--max_restarts",
        action=env,
        type=int,
@ -466,6 +472,7 @@ def get_args_parser() -> ArgumentParser:
        help="Maximum number of worker group restarts before failing.",
    )
    parser.add_argument(
        "--monitor-interval",
        "--monitor_interval",
        action=env,
        type=float,
@ -473,6 +480,7 @@ def get_args_parser() -> ArgumentParser:
        help="Interval, in seconds, to monitor the state of workers.",
    )
    parser.add_argument(
        "--start-method",
        "--start_method",
        action=env,
        type=str,
@ -495,6 +503,7 @@ def get_args_parser() -> ArgumentParser:
        "with the same behavior as 'python -m'.",
    )
    parser.add_argument(
        "--no-python",
        "--no_python",
        action=check_env,
        help="Skip prepending the training script with 'python' - just execute it directly. Useful "
@ -502,13 +511,15 @@ def get_args_parser() -> ArgumentParser:
    )
    parser.add_argument(
        "--run-path",
        "--run_path",
        action=check_env,
        help="Run the training script with runpy.run_path in the same interpreter."
        " Script must be provided as an abs path (e.g. /abs/path/script.py)."
-        " Takes precedence over --no_python.",
+        " Takes precedence over --no-python.",
    )
    parser.add_argument(
        "--log-dir",
        "--log_dir",
        action=env,
        type=str,
@ -541,6 +552,7 @@ def get_args_parser() -> ArgumentParser:
    #
    parser.add_argument(
        "--node-rank",
        "--node_rank",
        type=int,
        action=env,
@ -548,16 +560,18 @@ def get_args_parser() -> ArgumentParser:
        help="Rank of the node for multi-node distributed training.",
    )
    parser.add_argument(
        "--master-addr",
        "--master_addr",
        default="127.0.0.1",
        type=str,
        action=env,
        help="Address of the master node (rank 0) that only used for static rendezvous. It should "
        "be either the IP address or the hostname of rank 0. For single node multi-proc training "
-        "the --master_addr can simply be 127.0.0.1; IPv6 should have the pattern "
+        "the --master-addr can simply be 127.0.0.1; IPv6 should have the pattern "
        "`[0:0:0:0:0:0:0:1]`.",
    )
    parser.add_argument(
        "--master-port",
        "--master_port",
        default=29500,
        type=int,
@ -566,6 +580,7 @@ def get_args_parser() -> ArgumentParser:
        "training. It is only used for static rendezvous.",
    )
    parser.add_argument(
        "--local-addr",
        "--local_addr",
        default=None,
        type=str,
@ -652,7 +667,7 @@ def get_use_env(args) -> bool:
    """
    Retrieves ``use_env`` from the args.
    ``use_env`` is a legacy argument, if ``use_env`` is False, the
-    ``--node_rank`` argument will be transferred to all worker processes.
+    ``--local-rank`` argument will be transferred to all worker processes.
    ``use_env`` is only used by the ``torch.distributed.launch`` and will
    be deprecated in future releases.
    """
@ -729,12 +744,12 @@ def config_from_args(args) -> Tuple[LaunchConfig, Union[Callable, str], List[str
        else:
            if args.module:
                raise ValueError(
-                    "Don't use both the '--no_python' flag"
+                    "Don't use both the '--no-python' flag"
                    " and the '--module' flag at the same time."
                )
            cmd = args.training_script
    if not use_env:
-        cmd_args.append(f"--local_rank={macros.local_rank}")
+        cmd_args.append(f"--local-rank={macros.local_rank}")
    cmd_args.extend(args.training_script_args)
    return config, cmd, cmd_args
@ -760,9 +775,9 @@ def run(args):
        log.info(
            f"\n**************************************\n"
            f"Rendezvous info:\n"
-            f"--rdzv_backend={args.rdzv_backend} "
+            f"--rdzv-backend={args.rdzv_backend} "
-            f"--rdzv_endpoint={args.rdzv_endpoint} "
+            f"--rdzv-endpoint={args.rdzv_endpoint} "
-            f"--rdzv_id={args.rdzv_id}\n"
+            f"--rdzv-id={args.rdzv_id}\n"
            f"**************************************\n"
        )
--- a/torch/fx/passes/splitter_base.py
+++ b/torch/fx/passes/splitter_base.py
@ -43,12 +43,14 @@ class _SplitterSettingBase:
    ):
        parser = argparse.ArgumentParser()
        parser.add_argument(
            "--min-acc-module-size",
            "--min_acc_module_size",
            required=False,
            type=int,
            help="Minimum size limit of an accelerator subgraph.",
        )
        parser.add_argument(
            "--skip-fusion",
            "--skip_fusion",
            default=False,
            action="store_true",
@ -58,6 +60,7 @@ class _SplitterSettingBase:
            "can reduce overhead.",
        )
        parser.add_argument(
            "--allow-non-tensor",
            "--allow_non_tensor",
            default=False,
            action="store_true",
--- a/torch/testing/_internal/codegen/random_topo_test.py
+++ b/torch/testing/_internal/codegen/random_topo_test.py
@ -250,17 +250,17 @@ def prepareInputTensorsToRandomTopoTest(seed,
 def reproString(current_seed, args):
    repro_str = "python {0}".format(__file__)
    if args.cuda_fuser:
-        repro_str += " --cuda_fuser"
+        repro_str += " --cuda-fuser"
    if args.legacy_fuser:
-        repro_str += " --legacy_fuser"
+        repro_str += " --legacy-fuser"
    if args.profiling_executor:
-        repro_str += " --profiling_executor"
+        repro_str += " --profiling-executor"
    if args.fp16:
        repro_str += " --fp16"
    if args.cpu:
        repro_str += " --cpu"
-    repro_str += " --max_num_tensor {0} --max_tensor_dim {1} --max_tensor_size {2}"\
+    repro_str += " --max-num-tensor {0} --max-tensor-dim {1} --max-tensor-size {2}"\
-        " --depth_factor {3} --seed {4} --repro_run".format(
+        " --depth-factor {3} --seed {4} --repro-run".format(
            args.max_num_tensor, args.max_tensor_dim, args.max_tensor_size,
            args.depth_factor, current_seed)
    return repro_str
@ -337,21 +337,21 @@ def runTest(seed, args):
 def parse_args():
    parser = argparse.ArgumentParser()
-    parser.add_argument("--cuda_fuser", action='store_true', default=True)
+    parser.add_argument("--cuda-fuser", "--cuda_fuser", action='store_true', default=True)
-    parser.add_argument("--legacy_fuser", action='store_true', default=False)
+    parser.add_argument("--legacy-fuser", "--legacy_fuser", action='store_true', default=False)
-    parser.add_argument("--profiling_executor", action='store_true', default=False)
+    parser.add_argument("--profiling-executor", "--profiling_executor", action='store_true', default=False)
    parser.add_argument("--fp16", action='store_true', default=False)
    parser.add_argument("--cpu", action='store_true', default=False)
-    parser.add_argument("--debug_print", action='store_true', default=False)
+    parser.add_argument("--debug-print", "--debug_print", action='store_true', default=False)
-    parser.add_argument("--debug_tensor", action='store_true', default=False)
+    parser.add_argument("--debug-tensor", "--debug_tensor", action='store_true', default=False)
-    parser.add_argument("--max_num_tensor", default=MAX_TENSOR, type=int)
+    parser.add_argument("--max-num-tensor", "--max_num_tensor", default=MAX_TENSOR, type=int)
-    parser.add_argument("--max_tensor_dim", default=MAX_TENSOR_DIM, type=int)
+    parser.add_argument("--max-tensor-dim", "--max_tensor_dim", default=MAX_TENSOR_DIM, type=int)
-    parser.add_argument("--max_tensor_size", default=MAX_TENSOR_SIZE, type=int)
+    parser.add_argument("--max-tensor-size", "--max_tensor_size", default=MAX_TENSOR_SIZE, type=int)
-    parser.add_argument("--depth_factor", default=GRAPH_FACTOR, type=int)
+    parser.add_argument("--depth-factor", "--depth_factor", default=GRAPH_FACTOR, type=int)
    parser.add_argument("--seed", default=45589, type=int)
    group = parser.add_mutually_exclusive_group()
    group.add_argument("--iterations", default=4, type=int)
-    group.add_argument("--repro_run", action='store_true', default=False)
+    group.add_argument("--repro-run", "--repro_run", action='store_true', default=False)
    return parser.parse_args()
--- a/torch/testing/_internal/common_utils.py
+++ b/torch/testing/_internal/common_utils.py
@ -501,9 +501,9 @@ parser.add_argument('--subprocess', action='store_true',
                    help='whether to run each test in a subprocess')
 parser.add_argument('--seed', type=int, default=1234)
 parser.add_argument('--accept', action='store_true')
-parser.add_argument('--jit_executor', type=str)
+parser.add_argument('--jit-executor', '--jit_executor', type=str)
 parser.add_argument('--repeat', type=int, default=1)
-parser.add_argument('--test_bailouts', action='store_true')
+parser.add_argument('--test-bailouts', '--test_bailouts', action='store_true')
 parser.add_argument('--use-pytest', action='store_true')
 parser.add_argument('--save-xml', nargs='?', type=str,
                    const=_get_test_report_path(),
--- a/torch/utils/_freeze.py
+++ b/torch/utils/_freeze.py
@ -253,9 +253,10 @@ if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Compile py source")
    parser.add_argument("paths", nargs="*", help="Paths to freeze.")
    parser.add_argument("--verbose", action="store_true", help="Print debug logs")
-    parser.add_argument("--install_dir", help="Root directory for all output files")
+    parser.add_argument("--install-dir", "--install_dir", help="Root directory for all output files")
    parser.add_argument("--oss", action="store_true", help="If it's OSS build, add a fake _PyImport_FrozenModules")
    parser.add_argument(
        "--symbol-name",
        "--symbol_name",
        help="The name of the frozen module array symbol to generate",
        default="_PyImport_FrozenModules_torch",
--- a/torch/utils/_zip.py
+++ b/torch/utils/_zip.py
@ -40,10 +40,12 @@ def write_to_zip(file_path, strip_file_path, zf, prepend_str=""):
 if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Zip py source")
    parser.add_argument("paths", nargs="*", help="Paths to zip.")
-    parser.add_argument("--install_dir", help="Root directory for all output files")
+    parser.add_argument("--install-dir", "--install_dir", help="Root directory for all output files")
-    parser.add_argument("--strip_dir", help="The absolute directory we want to remove from zip")
+    parser.add_argument("--strip-dir", "--strip_dir", help="The absolute directory we want to remove from zip")
-    parser.add_argument("--prepend_str", help="A string to prepend onto all paths of a file in the zip", default="")
-    parser.add_argument("--zip_name", help="Output zip name")
+    parser.add_argument(
+        "--prepend-str", "--prepend_str", help="A string to prepend onto all paths of a file in the zip", default=""
+    )
+    parser.add_argument("--zip-name", "--zip_name", help="Output zip name")
    args = parser.parse_args()
--- a/torch/utils/benchmark/examples/blas_compare.py
+++ b/torch/utils/benchmark/examples/blas_compare.py
@ -123,12 +123,12 @@ def run_subprocess(args):
            f"source activate {env} && "
            f"taskset --cpu-list {core_str} "
            f"python {os.path.abspath(__file__)} "
-            "--DETAIL_in_subprocess "
+            "--DETAIL-in-subprocess "
-            f"--DETAIL_seed {seed} "
+            f"--DETAIL-seed {seed} "
-            f"--DETAIL_num_threads {num_threads} "
+            f"--DETAIL-num-threads {num_threads} "
-            f"--DETAIL_sub_label '{sub_label}' "
+            f"--DETAIL-sub-label '{sub_label}' "
-            f"--DETAIL_result_file {result_file} "
+            f"--DETAIL-result-file {result_file} "
-            f"--DETAIL_env {env}",
+            f"--DETAIL-env {env}",
            env=env_vars,
            stdout=subprocess.PIPE,
            shell=True
@ -197,7 +197,7 @@ def main():
    subprocess.run(
        f"source activate {env_path} && "
        f"python {os.path.abspath(__file__)} "
-        "--DETAIL_in_compare",
+        "--DETAIL-in-compare",
        shell=True
    )
@ -205,13 +205,13 @@ def main():
 if __name__ == "__main__":
    # These flags are for subprocess control, not controlling the main loop.
    parser = argparse.ArgumentParser()
-    parser.add_argument("--DETAIL_in_subprocess", action="store_true")
+    parser.add_argument("--DETAIL-in-subprocess", "--DETAIL_in_subprocess", action="store_true")
-    parser.add_argument("--DETAIL_in_compare", action="store_true")
+    parser.add_argument("--DETAIL-in-compare", "--DETAIL_in_compare", action="store_true")
-    parser.add_argument("--DETAIL_seed", type=int, default=None)
+    parser.add_argument("--DETAIL-seed", "--DETAIL_seed", type=int, default=None)
-    parser.add_argument("--DETAIL_num_threads", type=int, default=None)
+    parser.add_argument("--DETAIL-num-threads", "--DETAIL_num_threads", type=int, default=None)
-    parser.add_argument("--DETAIL_sub_label", type=str, default="N/A")
+    parser.add_argument("--DETAIL-sub-label", "--DETAIL_sub_label", type=str, default="N/A")
-    parser.add_argument("--DETAIL_result_file", type=str, default=None)
+    parser.add_argument("--DETAIL-result-file", "--DETAIL_result_file", type=str, default=None)
-    parser.add_argument("--DETAIL_env", type=str, default=None)
+    parser.add_argument("--DETAIL-env", "--DETAIL_env", type=str, default=None)
    args = parser.parse_args()
    if args.DETAIL_in_subprocess:
--- a/torch/utils/benchmark/examples/end_to_end.py
+++ b/torch/utils/benchmark/examples/end_to_end.py
@ -82,15 +82,15 @@ _DTYPE_STR_TO_DTYPE = {
 def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--pr", type=str, default=_PR_LIST[0], choices=_PR_LIST)
-    parser.add_argument("--num_gpus", type=int, default=None)
+    parser.add_argument("--num-gpus", "--num_gpus", type=int, default=None)
-    parser.add_argument("--test_variance", action="store_true")
+    parser.add_argument("--test-variance", "--test_variance", action="store_true")
    # (Implementation details)
-    parser.add_argument("--DETAIL_context", type=str, choices=(_MAIN, _SUBPROCESS), default=_MAIN)
+    parser.add_argument("--DETAIL-context", "--DETAIL_context", type=str, choices=(_MAIN, _SUBPROCESS), default=_MAIN)
-    parser.add_argument("--DETAIL_device", type=str, choices=(_CPU, _GPU), default=None)
+    parser.add_argument("--DETAIL-device", "--DETAIL_device", type=str, choices=(_CPU, _GPU), default=None)
-    parser.add_argument("--DETAIL_env", type=str, default=None)
+    parser.add_argument("--DETAIL-env", "--DETAIL_env", type=str, default=None)
-    parser.add_argument("--DETAIL_result_file", type=str, default=None)
+    parser.add_argument("--DETAIL-result-file", "--DETAIL_result_file", type=str, default=None)
-    parser.add_argument("--DETAIL_seed", type=int, default=None)
+    parser.add_argument("--DETAIL-seed", "--DETAIL_seed", type=int, default=None)
    args = parser.parse_args()
    if args.num_gpus is None:
@ -101,11 +101,11 @@ def parse_args():
 _SUBPROCESS_CMD_TEMPLATE = (
    "source activate {source_env} && python -m examples.end_to_end "
    "--pr {pr} "
-    "--DETAIL_context subprocess "
+    "--DETAIL-context subprocess "
-    "--DETAIL_device {device} "
+    "--DETAIL-device {device} "
-    "--DETAIL_env {env} "
+    "--DETAIL-env {env} "
-    "--DETAIL_result_file {result_file} "
+    "--DETAIL-result-file {result_file} "
-    "--DETAIL_seed {seed}"
+    "--DETAIL-seed {seed}"
 )
--- a/torch/utils/benchmark/examples/spectral_ops_fuzz_test.py
+++ b/torch/utils/benchmark/examples/spectral_ops_fuzz_test.py
@ -87,7 +87,7 @@ if __name__ == '__main__':
    parser.add_argument('--bench', type=str, choices=BENCHMARK_NAMES, nargs='+', default=BENCHMARK_NAMES)
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--samples', type=int, default=10)
-    parser.add_argument('--probability_regular', type=float, default=1.0)
+    parser.add_argument('--probability-regular', '--probability_regular', type=float, default=1.0)
    parser.add_argument('-o', '--output', type=str)
    args = parser.parse_args()
--- a/torch/utils/benchmark/utils/valgrind_wrapper/timer_callgrind_template.cpp
+++ b/torch/utils/benchmark/utils/valgrind_wrapper/timer_callgrind_template.cpp
@ -28,13 +28,17 @@ int main(int argc, char* argv[]) {
  TORCH_CHECK(std::string(argv[1]) == "--number");
  auto number = std::stoi(argv[2]);
-  TORCH_CHECK(std::string(argv[3]) == "--number_warmup");
+  TORCH_CHECK(
      std::string(argv[3]) == "--number-warmup" ||
      std::string(argv[3]) == "--number_warmup");
  auto number_warmup = std::stoi(argv[4]);
  TORCH_CHECK(std::string(argv[5]) == "--repeats");
  auto repeats = std::stoi(argv[6]);
-  TORCH_CHECK(std::string(argv[7]) == "--number_threads");
+  TORCH_CHECK(
      std::string(argv[7]) == "--number-threads" ||
      std::string(argv[7]) == "--number_threads");
  auto number_threads = std::stoi(argv[8]);
  torch::set_num_threads(number_threads);
--- a/torch/utils/benchmark/utils/valgrind_wrapper/timer_interface.py
+++ b/torch/utils/benchmark/utils/valgrind_wrapper/timer_interface.py
@ -635,9 +635,9 @@ class _ValgrindWrapper:
                run_loop_cmd = [
                    run_loop_exec,
                    "--number", str(number),
-                    "--number_warmup", str(min(number, 10)),
+                    "--number-warmup", str(min(number, 10)),
                    "--repeats", str(repeats),
-                    "--number_threads", str(task_spec.num_threads),
+                    "--number-threads", str(task_spec.num_threads),
                ]
            valgrind_invocation, valgrind_invocation_output = run([
--- a/torchgen/gen.py
+++ b/torchgen/gen.py
@ -2611,7 +2611,11 @@ def main() -> None:
        help="generate separate headers per operator in ATen/ops",
    )
    parser.add_argument(
-        "-d", "--install_dir", help="output directory", default="build/aten/src/ATen"
+        "-d",
        "--install-dir",
        "--install_dir",
        help="output directory",
        default="build/aten/src/ATen",
    )
    parser.add_argument(
        "--rocm",
@ -2623,10 +2627,11 @@ def main() -> None:
        action="store_true",
        help="Generate MPS registration code when set",
    )
-    # TODO: --op_registration_whitelist will be removed when all call-sites
+    # TODO: --op-registration-whitelist will be removed when all call-sites
    # for gen.py are moved over to using the operator YAML file for mobile
    # custom build.
    parser.add_argument(
        "--op-registration-whitelist",
        "--op_registration_whitelist",
        nargs="*",
        help="filter op registrations by the whitelist (if set); "
@ -2634,6 +2639,7 @@ def main() -> None:
        "e.g.: aten::empty aten::conv2d ...",
    )
    parser.add_argument(
        "--op-selection-yaml-path",
        "--op_selection_yaml_path",
        help="Provide a path to the operator selection (for custom build) YAML "
        "that contains the information about the set of selected operators "
@ -2642,26 +2648,30 @@ def main() -> None:
        "The operator names also contain the namespace prefix (e.g. aten::)",
    )
    parser.add_argument(
        "--backend-whitelist",
        "--backend_whitelist",
        nargs="*",
        help="filter dispatch backend by the whitelist (if set), "
        "e.g.: CPU CUDA QuantizedCPU ...",
    )
    parser.add_argument(
        "--static-dispatch-backend",
        "--static_dispatch_backend",
        nargs="*",
        help="generate static dispatch code for the specific backend (if set)",
    )
    parser.add_argument(
        "--skip-dispatcher-op-registration",
        "--skip_dispatcher_op_registration",
        action="store_true",
        help="Avoid registering operators into the dispatcher.",
    )
    parser.add_argument(
        "--force-schema-registration",
        "--force_schema_registration",
        action="store_true",
        help="force it to generate schema-only registrations for all ops, including"
-        "those that are not listed on --op_registration_whitelist",
+        " those that are not listed on --op-registration-whitelist",
    )
    parser.add_argument(
        "--generate",
--- a/torchgen/gen_backend_stubs.py
+++ b/torchgen/gen_backend_stubs.py
@ -339,12 +339,16 @@ def main() -> None:
    parser = argparse.ArgumentParser(description="Generate backend stub files")
    parser.add_argument(
        "-s",
        "--source-yaml",
        "--source_yaml",
        help="path to source yaml file containing operator external definitions",
    )
-    parser.add_argument("-o", "--output_dir", help="output directory")
+    parser.add_argument("-o", "--output-dir", "--output_dir", help="output directory")
-    parser.add_argument("--dry_run", type=bool, default=False, help="output directory")
+    parser.add_argument(
+        "--dry-run", "--dry_run", type=bool, default=False, help="output directory"
+    )
    parser.add_argument(
        "--impl-path",
        "--impl_path",
        type=str,
        default=None,
--- a/torchgen/gen_executorch.py
+++ b/torchgen/gen_executorch.py
@ -626,24 +626,31 @@ def main() -> None:
        help="path to source directory for kernel templates",
    )
    parser.add_argument(
        "--functions-yaml-path",
        "--functions_yaml_path",
        help="path to the functions.yaml file to use. Optional, but at least "
-        "one of --functions_yaml_path and --custom_ops_yaml_path must be "
+        "one of --functions-yaml-path and --custom-ops-yaml-path must be "
        "specified.",
    )
    parser.add_argument(
        "--custom-ops-yaml-path",
        "--custom_ops_yaml_path",
        help="path to the custom_ops.yaml file to use. Optional, but at least "
-        "one of --functions_yaml_path and --custom_ops_yaml_path must be "
+        "one of --functions-yaml-path and --custom-ops-yaml-path must be "
        "specified.",
    )
    parser.add_argument(
        "--aten-yaml-path",
        "--aten_yaml_path",
        help="path to native_functions.yaml file.",
    )
    # Note that make_file_manager() also looks at --install-dir.
    parser.add_argument(
-        "-d", "--install_dir", help="output directory", default="build/generated"
+        "-d",
        "--install-dir",
        "--install_dir",
        help="output directory",
        default="build/generated",
    )
    parser.add_argument(
        "-o",
@ -658,11 +665,13 @@ def main() -> None:
        help="run without writing any files (still updates outputs)",
    )
    parser.add_argument(
        "--static-dispatch-backend",
        "--static_dispatch_backend",
        nargs="*",
        help="generate static dispatch code for the specific backend (if set)",
    )
    parser.add_argument(
        "--op-registration-whitelist",
        "--op_registration_whitelist",
        nargs="*",
        help="filter op registrations by the whitelist (if set); "
@ -670,6 +679,7 @@ def main() -> None:
        "e.g.: aten::empty aten::conv2d ...",
    )
    parser.add_argument(
        "--op-selection-yaml-path",
        "--op_selection_yaml_path",
        help="Provide a path to the operator selection (for custom build) YAML "
        "that contains the information about the set of selected operators "
@ -687,6 +697,7 @@ def main() -> None:
        help="reinterpret CUDA as ROCm/HIP and adjust filepaths accordingly",
    )
    parser.add_argument(
        "--use-aten-lib",
        "--use_aten_lib",
        action="store_true",
        help="a boolean flag to indicate whether we use ATen kernels or not, in the future this flag will be per "
--- a/torchgen/gen_lazy_tensor.py
+++ b/torchgen/gen_lazy_tensor.py
@ -210,53 +210,64 @@ def main() -> None:
    parser = argparse.ArgumentParser(description="Generate Lazy Tensor backend files")
    parser.add_argument(
        "-s",
        "--source-yaml",
        "--source_yaml",
        help="path to source yaml file containing operator external definitions",
    )
-    parser.add_argument("-o", "--output_dir", help="output directory")
+    parser.add_argument("-o", "--output-dir", "--output_dir", help="output directory")
-    parser.add_argument("--dry_run", type=bool, default=False, help="output directory")
+    parser.add_argument(
+        "--dry-run", "--dry_run", type=bool, default=False, help="output directory"
+    )
    parser.add_argument(
        "--impl-path",
        "--impl_path",
        type=str,
        default=None,
        help="path to the source C++ file containing kernel definitions",
    )
    parser.add_argument(
        "--gen-ts-lowerings",
        "--gen_ts_lowerings",
        action="store_true",
        help="Generate TorchScript lowerings in addition to Lazy IR and NativeFunctions",
    )
    parser.add_argument(
        "--node-base",
        "--node_base",
        type=str,
        default=default_args.node_base,
        help="Name of backend specific custom Lazy IR Node base class",
    )
    parser.add_argument(
        "--node-base-hdr",
        "--node_base_hdr",
        type=str,
        default=default_args.node_base_hdr,
        help="Path to header file defining custom Lazy IR Node base class",
    )
    parser.add_argument(
        "--shape-inference-hdr",
        "--shape_inference_hdr",
        type=str,
        default=default_args.shape_inference_hdr,
        help="Path to header file defining custom Lazy shape inference functions",
    )
    parser.add_argument(
        "--tensor-class",
        "--tensor_class",
        type=str,
        default=default_args.tensor_class,
        help="Name of backend specific custom Lazy Tensor class",
    )
    parser.add_argument(
        "--tensor-class-hdr",
        "--tensor_class_hdr",
        type=str,
        default=default_args.tensor_class_hdr,
        help="Path to header file defining custom Lazy Tensor class",
    )
    parser.add_argument(
        "--backend-name",
        "--backend_name",
        type=str,
        default=default_args.backend_name,