Revert "Fix global flake8 issues (#124771)"

This reverts commit f01275934b.

Reverted https://github.com/pytorch/pytorch/pull/124771 on behalf of https://github.com/jeanschmidt due to Unfortunately, I needed to revert #123735 and this one depends on it. So please check if there are no merge conflicts or breakages and feel free to merge this PR again ([comment](https://github.com/pytorch/pytorch/pull/124428#issuecomment-2078699836))
This commit is contained in:
PyTorch MergeBot 2024-04-26 06:15:17 +00:00
parent e607dc8abb
commit 1ac60484c1
55 changed files with 211 additions and 213 deletions

View File

@ -29,7 +29,7 @@ def parse_args() -> Any:
"--onto-branch", type=str, required=True, help="the target release branch"
)
parser.add_argument(
"--github-actor", type=str, required=True, help="all the world's a stage"
"--github-actor", type=str, required=True, help="all the worlds a stage"
)
parser.add_argument(
"--classification",

View File

@ -152,8 +152,8 @@ def run(
result_entry["sequence_length"] = sequence_length
result_entry["n_heads"] = num_heads
result_entry["embed_dim"] = embed_dim
result_entry["time_native_mha_slow(\u00B5s)"] = f"{time_native_mha_slow:.3f}"
result_entry["time_native_mha_fast (\u00B5s)"] = f"{time_native_mha_fast:.3f}"
result_entry["time_native_mha_slow(μs)"] = f"{time_native_mha_slow:.3f}"
result_entry["time_native_mha_fast (μs)"] = f"{time_native_mha_fast:.3f}"
result_entry["speedup flash_mha v native_mha"] = f"{speedup_fast_internal:.3f}"
result_entry["padding"] = f"{padding:.3f}"
return result_entry

View File

@ -81,10 +81,10 @@ class ExperimentResults:
@classmethod
def get_entry_names(cls) -> List[str]:
return [
"nn_mha_time (\u00B5s)",
"compiled_nn_mha_time (\u00B5s)",
"composite_mha_time (\u00B5s)",
"compiled_composite_mha_time (\u00B5s)",
"nn_mha_time (μs)",
"compiled_nn_mha_time (μs)",
"composite_mha_time (μs)",
"compiled_composite_mha_time (μs)",
]

View File

@ -28,7 +28,7 @@ import keyword
import warnings
from typing import Collection, List, Mapping, Optional, Set, Tuple, Union
_ellipsis: str = "\u2026" # NB, this is a single unicode symbol. String is used as it is not a list, but can be iterated
_ellipsis: str = "" # NB, this is a single unicode symbol. String is used as it is not a list, but can be iterated
class AnonymousAxis:

View File

@ -3752,11 +3752,11 @@ class TestDistributions(DistributionsTestCase):
@unittest.skipIf(not TEST_NUMPY, "NumPy not found")
def test_dirichlet_log_prob_zero(self):
# Specifically test the special case where x=0 and alpha=1. The PDF is
# proportional to x**(alpha-1), which in this case works out to 0**0=1.
# Specifically test the special case where x=0 and α=1. The PDF is
# proportional to x**(α-1), which in this case works out to 0**0=1.
# The log PDF of this term should therefore be 0. However, it's easy
# to accidentally introduce NaNs by calculating log(x) without regard
# for the value of alpha-1.
# for the value of α-1.
alpha = torch.tensor([1, 2])
dist = Dirichlet(alpha)
x = torch.tensor([0, 1])
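A quick eager-mode illustration of the special case this comment describes (a sketch, separate from the test):

import torch
from torch.distributions import Dirichlet

# With alpha = [1, 2] the density is proportional to x2, so the log-density
# is finite (log 2) even though x1 is exactly 0 - no NaN from log(0).
d = Dirichlet(torch.tensor([1.0, 2.0]))
print(d.log_prob(torch.tensor([0.0, 1.0])))  # ~0.6931, not NaN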

View File

@ -107,7 +107,7 @@ class TestParsedExpression(TestCase):
ParsedExpression("(a) ((b c) (d ...))")
# invalid identifiers
ParsedExpression("camelCase under_scored cApiTaLs \u00DF ...")
ParsedExpression("camelCase under_scored cApiTaLs ß ...")
with self.assertRaises(ValueError):
ParsedExpression("1a")
with self.assertRaises(ValueError):

View File

@ -308,8 +308,8 @@ class TestTemplatedSDPA(InductorTestCase):
# this means that the base for the LSE computed by ref is e while for the compiled
# version it is 2. To compare we use the change of base formula
# log_2(x_compiled) = log_e(x_ref) * log_2(e) where
# x_ref = sum(_i e^(scores[i]))
# x_compiled = sum(_i 2^(log2(e) * scores[i]))
# x_ref = ∑_i e^(scores[i])
# x_compiled = ∑_i 2^(log2(e) * scores[i])
self.assertTrue(ref_lse.dtype == torch.float32)
self.assertTrue(compiled_lse.dtype == torch.float32)
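The change-of-base relationship is easy to verify numerically; a minimal sketch (illustrative, not from the test):

import math
import torch

# log2(sum_i 2^(log2(e) * s_i)) equals log_e(sum_i e^(s_i)) * log2(e)
scores = torch.randn(8)
lse_ref = torch.logsumexp(scores, dim=0)  # base e
lse_base2 = torch.log2(torch.sum(torch.exp2(scores * math.log2(math.e))))
assert torch.allclose(lse_base2, lse_ref * math.log2(math.e))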

View File

@ -111,16 +111,16 @@ class DirectoryReaderTest(PackageTestCase):
with PackageExporter(filename) as pe:
# Layout looks like:
# package
# |-- one/
# | |-- a.txt
# | |-- b.txt
# | |-- c.txt
# | +-- three/
# | |-- d.txt
# | +-- e.txt
# +-- two/
# |-- f.txt
# +-- g.txt
# ├── one/
# │ ├── a.txt
# │ ├── b.txt
# │ ├── c.txt
# │ └── three/
# │ ├── d.txt
# │ └── e.txt
# └── two/
# ├── f.txt
# └── g.txt
pe.save_text("one", "a.txt", "hello, a!")
pe.save_text("one", "b.txt", "hello, b!")
pe.save_text("one", "c.txt", "hello, c!")

View File

@ -38,46 +38,46 @@ class TestMisc(PackageTestCase):
export_plain = dedent(
"""\
\u251c\u2500\u2500 .data
\u2502 \u251c\u2500\u2500 extern_modules
\u2502 \u251c\u2500\u2500 python_version
\u2502 \u251c\u2500\u2500 serialization_id
\u2502 \u2514\u2500\u2500 version
\u251c\u2500\u2500 main
\u2502 \u2514\u2500\u2500 main
\u251c\u2500\u2500 obj
\u2502 \u2514\u2500\u2500 obj.pkl
\u251c\u2500\u2500 package_a
\u2502 \u251c\u2500\u2500 __init__.py
\u2502 \u2514\u2500\u2500 subpackage.py
\u251c\u2500\u2500 byteorder
\u2514\u2500\u2500 module_a.py
├── .data
│   ├── extern_modules
│   ├── python_version
│   ├── serialization_id
│   └── version
├── main
│   └── main
├── obj
│   └── obj.pkl
├── package_a
│   ├── __init__.py
│   └── subpackage.py
├── byteorder
└── module_a.py
"""
)
export_include = dedent(
"""\
\u251c\u2500\u2500 obj
\u2502 \u2514\u2500\u2500 obj.pkl
\u2514\u2500\u2500 package_a
\u2514\u2500\u2500 subpackage.py
├── obj
│   └── obj.pkl
└── package_a
    └── subpackage.py
"""
)
import_exclude = dedent(
"""\
\u251c\u2500\u2500 .data
\u2502 \u251c\u2500\u2500 extern_modules
\u2502 \u251c\u2500\u2500 python_version
\u2502 \u251c\u2500\u2500 serialization_id
\u2502 \u2514\u2500\u2500 version
\u251c\u2500\u2500 main
\u2502 \u2514\u2500\u2500 main
\u251c\u2500\u2500 obj
\u2502 \u2514\u2500\u2500 obj.pkl
\u251c\u2500\u2500 package_a
\u2502 \u251c\u2500\u2500 __init__.py
\u2502 \u2514\u2500\u2500 subpackage.py
\u251c\u2500\u2500 byteorder
\u2514\u2500\u2500 module_a.py
├── .data
│   ├── extern_modules
│   ├── python_version
│   ├── serialization_id
│   └── version
├── main
│   └── main
├── obj
│   └── obj.pkl
├── package_a
│   ├── __init__.py
│   └── subpackage.py
├── byteorder
└── module_a.py
"""
)

View File

@ -25,16 +25,16 @@ class TestResources(PackageTestCase):
with PackageExporter(buffer) as pe:
# Layout looks like:
# package
# |-- one/
# | |-- a.txt
# | |-- b.txt
# | |-- c.txt
# | +-- three/
# | |-- d.txt
# | +-- e.txt
# +-- two/
# |-- f.txt
# +-- g.txt
# ├── one/
# │ ├── a.txt
# │ ├── b.txt
# │ ├── c.txt
# │ └── three/
# │ ├── d.txt
# │ └── e.txt
# └── two/
# ├── f.txt
# └── g.txt
pe.save_text("one", "a.txt", "hello, a!")
pe.save_text("one", "b.txt", "hello, b!")
pe.save_text("one", "c.txt", "hello, c!")

View File

@ -15679,7 +15679,7 @@ dedent """
def test_unicode_comments(self):
@torch.jit.script
def test(self, a):
# shrug
# 🤷🤷🤷🤷
return torch.nn.functional.relu(a)
def test_get_set_state_with_tensors(self):

View File

@ -70,7 +70,7 @@ class TestFuser(JitTestCase):
@unittest.skipIf(IS_SANDCASTLE, "NYI: fuser CPU support for Sandcastle")
@enable_cpu_fuser
def test_abs_cpu_unicode_temp_dir(self):
with TemporaryDirectoryName(suffix='\u4e2d\u6587') as dname:
with TemporaryDirectoryName(suffix='中文') as dname:
shell_env = os.environ.copy()
shell_env['TMP'] = dname
cmd = [sys.executable, os.path.basename(__file__), type(self).__name__ + '.test_abs_cpu']

View File

@ -1950,7 +1950,7 @@ class TestLinalg(TestCase):
# if out tensor with floating dtype is passed for complex output an error is thrown
if not dtype.is_complex:
# The characteristic equation is p(lambda) = lambda^2 - 2lambda + 5 = 0, with roots lambda = 1[+-]2i
# The characteristic equation is p(λ) = λ^2 − 2λ + 5 = 0, with roots λ = 1±2i
a = torch.tensor([[3., -2.], [4., -1.]], dtype=dtype, device=device)
out0 = torch.empty(0, device=device, dtype=dtype)
out1 = torch.empty(0, device=device, dtype=dtype)
@ -2117,7 +2117,7 @@ class TestLinalg(TestCase):
# if out tensor with floating dtype is passed for complex output an error is thrown
if not dtype.is_complex:
# The characteristic equation is p(lambda) = lambda^2 - 2lambda + 5 = 0, with roots lambda = 1[+-]2i
# The characteristic equation is p(λ) = λ^2 − 2λ + 5 = 0, with roots λ = 1±2i
a = torch.tensor([[3., -2.], [4., -1.]], dtype=dtype, device=device)
out = torch.empty(0, device=device, dtype=dtype)
with self.assertRaisesRegex(RuntimeError, "Expected eigenvalues to be safely castable"):
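The quoted roots are easy to confirm with the public API (a quick check, independent of the test):

import torch

a = torch.tensor([[3., -2.], [4., -1.]])
# det(a - lambda*I) = lambda^2 - 2*lambda + 5, so the eigenvalues are complex
print(torch.linalg.eigvals(a))  # tensor([1.+2.j, 1.-2.j]) up to ordering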

View File

@ -428,7 +428,7 @@ class TestPublicBindings(TestCase):
def test_correct_module_names(self):
'''
An API is considered public, if its `__module__` starts with `torch.`
and there is no name in `__module__` or the object itself that starts with "_".
and there is no name in `__module__` or the object itself that starts with “_”.
Each public package should either:
- (preferred) Define `__all__` and all callables and classes in there must have their
`__module__` start with the current submodule's path. Things not in `__all__` should

View File

@ -924,7 +924,7 @@ class TestSerialization(TestCase, SerializationMixin):
test(fname)
if IS_FILESYSTEM_UTF8_ENCODING:
with TemporaryDirectoryName(suffix='\u975eASCII\u30d1\u30b9') as dname:
with TemporaryDirectoryName(suffix='非ASCIIパス') as dname:
with TemporaryFileName(dir=dname) as fname:
test(fname)

View File

@ -8046,7 +8046,7 @@ class TestTorch(TestCase):
assert_with_filename(fname)
if IS_FILESYSTEM_UTF8_ENCODING:
with TemporaryDirectoryName(suffix='\u4e2d\u6587') as dname, TemporaryFileName(dir=dname) as fname:
with TemporaryDirectoryName(suffix='中文') as dname, TemporaryFileName(dir=dname) as fname:
assert_with_filename(fname)
def test_torch_from_file(self):
@ -8077,7 +8077,7 @@ class TestTorch(TestCase):
assert_with_filename(fname)
if IS_FILESYSTEM_UTF8_ENCODING:
with TemporaryDirectoryName(suffix='\u4e2d\u6587') as dname, TemporaryFileName(dir=dname) as fname:
with TemporaryDirectoryName(suffix='中文') as dname, TemporaryFileName(dir=dname) as fname:
assert_with_filename(fname)
def test_print(self):

View File

@ -744,7 +744,7 @@ def slice_forward(
raise RuntimeError("slice step must be positive")
start_val = start if start is not None else 0
end_val = end if end is not None else sys.maxsize # 2^63 - 1
end_val = end if end is not None else sys.maxsize # 2^63 − 1
if start_val < 0:
start_val += sizes[dim]

View File

@ -57,7 +57,7 @@ def list_cmp(op: Callable[[Any, Any], bool], left: Sequence[Any], right: Sequenc
def dropwhile(predicate, iterable):
# dropwhile(lambda x: x<5, [1,4,6,4,1]) -> 6 4 1
# dropwhile(lambda x: x<5, [1,4,6,4,1]) → 6 4 1
iterable = iter(iterable)
for x in iterable:
if not predicate(x):
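The doc-comment matches the standard itertools semantics, e.g.:

from itertools import dropwhile

# Drops the leading run of items where the predicate holds, keeps the rest.
assert list(dropwhile(lambda x: x < 5, [1, 4, 6, 4, 1])) == [6, 4, 1]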

View File

@ -5,13 +5,13 @@ class ExportErrorType(Enum):
# User providing invalid inputs to either tracer, or other public facing APIs
INVALID_INPUT_TYPE = 1
# User returning values from their models that we don't support.
# User returning values from their models that we don’t support.
INVALID_OUTPUT_TYPE = 2
# Generated IR does not conform to Export IR Specification.
VIOLATION_OF_SPEC = 3
# User's code contains types and functionalities we don't support.
# User’s code contains types and functionalities we don’t support.
NOT_SUPPORTED = 4
# User's code didn't provide necessary details for us to successfully trace and export.

View File

@ -498,7 +498,7 @@ def get_tangents_in_dims(input_dims, tangents):
# in_dims = 0
# vmap(Sum.apply, in_dims)(x)
#
# Let's assume for a moment that we didn't vmap setup_context in VmappedSum:
# Let’s assume for a moment that we didn’t vmap setup_context in VmappedSum:
#
# class VmappedSum(torch.autograd.Function):
# @staticmethod
@ -519,7 +519,7 @@ def get_tangents_in_dims(input_dims, tangents):
# return gx
#
# We end up saving [B, 4] as x_shape. In the backward, gy has shape [B],
# and we're doing:
# and we’re doing:
#
# def backward_no_context(gy):
# return gy.expand([B, 4])

View File

@ -62,8 +62,8 @@ class LiveRange:
Invariant: begin <= end
"""
begin: float # int | +/-inf
end: float # int | +/-inf
begin: float # int | ±inf
end: float # int | ±inf
def contains(self, other: LiveRange):
"""Is other entirely within self"""

View File

@ -5373,7 +5373,7 @@ def meta__scaled_dot_product_flash_attention_for_cpu_backward(
scale: Optional[float] = None,
):
# cpus's grad layout is different from cuda's,
# i.e. (batch_size, seq_len,num_heads, head_dim)
# i.e. (batch_size, seq_len，num_heads, head_dim）
batch_size = query.size(0)
num_heads = query.size(1)
head_dim = query.size(3)

View File

@ -2008,7 +2008,7 @@ def min_scalar_type(a: ArrayLike, /):
from ._dtypes import DType
if a.numel() > 1:
# numpy docs: "For non-scalar array a, returns the vector's dtype unmodified."
# numpy docs: "For non-scalar array a, returns the vectors dtype unmodified."
return DType(a.dtype)
if a.dtype == torch.bool:

View File

@ -485,7 +485,7 @@ def _make_alias(fn, name):
"""
This function defines an alias of another function and sets its __name__ argument.
It also sets its __module__ argument to the module of the caller.
Note that when naively doing `alias = fn`, we have that `alias.__name__ == "fn"`, and
Note that when naïvely doing `alias = fn`, we have that `alias.__name__ == "fn"`, and
`alias.__module__ == fn.__module__`.
"""

View File

@ -600,7 +600,7 @@ def margin_ranking_loss(
margin: float = 0.0,
reduction: str = "mean",
) -> TensorLikeType:
# loss_without_reduction = max(0, -target * (input1 - input2) + margin)
# loss_without_reduction = max(0, −target * (input1 − input2) + margin)
if input1.ndim != input2.ndim or input1.ndim != target.ndim:
raise RuntimeError(
"margin_ranking_loss : All input tensors should have same dimension but got sizes: "

View File

@ -116,7 +116,7 @@ def i1e(a: TensorLikeType) -> TensorLikeType:
type_promotion_kind=utils.ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
)
def log_ndtr(a: TensorLikeType) -> TensorLikeType:
# Note: M_SQRT1_2 is the value of 1 / sqrt(2)
# Note: M_SQRT1_2 is the value of 1 / √2
M_SQRT1_2 = 0.707106781186547524400844362104849039
t = a * M_SQRT1_2
return torch.where(
@ -185,7 +185,7 @@ def multigammaln(a: TensorLikeType, p: int) -> TensorLikeType:
type_promotion_kind=utils.ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
)
def ndtr(a: TensorLikeType) -> TensorLikeType:
# Note: M_SQRT1_2 is the value of 1 / sqrt(2)
# Note: M_SQRT1_2 is the value of 1 / √2
M_SQRT1_2 = 0.707106781186547524400844362104849039
a_sqrt_2 = a * M_SQRT1_2
return (1 + torch.erf(a_sqrt_2)) * 0.5
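A numeric sanity check of the erf-based formula (a sketch):

import math
import torch

a = torch.tensor([-1.0, 0.0, 1.0])
manual = (1 + torch.erf(a / math.sqrt(2))) * 0.5
assert torch.allclose(manual, torch.special.ndtr(a))  # ndtr(0) == 0.5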

View File

@ -2305,8 +2305,8 @@ Keyword Args:
times each observation should be repeated. Its numel must equal the number of columns of :attr:`input`.
Must have integral dtype. Ignored if ``None``. Defaults to ``None``.
aweights (tensor, optional): A Scalar or 1D array of observation vector weights.
These relative weights are typically large for observations considered "important" and smaller for
observations considered less "important". Its numel must equal the number of columns of :attr:`input`.
These relative weights are typically large for observations considered “important” and smaller for
observations considered less “important”. Its numel must equal the number of columns of :attr:`input`.
Must have floating point dtype. Ignored if ``None``. Defaults to ``None``.
Returns:
@ -4773,7 +4773,7 @@ This is detailed in the "Keyword Arguments" section below.
The gradient is estimated by estimating each partial derivative of :math:`g` independently. This estimation is
accurate if :math:`g` is in :math:`C^3` (it has at least 3 continuous derivatives), and the estimation can be
improved by providing closer samples. Mathematically, the value at each interior point of a partial derivative
is estimated using `Taylor's theorem with remainder <https://en.wikipedia.org/wiki/Taylor%27s_theorem>`_.
is estimated using `Taylor’s theorem with remainder <https://en.wikipedia.org/wiki/Taylor%27s_theorem>`_.
Letting :math:`x` be an interior point with :math:`x-h_l` and :math:`x+h_r` be points neighboring
it to the left and right respectively, :math:`f(x+h_r)` and :math:`f(x-h_l)` can be estimated using:
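A small illustration of this estimation with the public API (grid values chosen arbitrarily):

import torch

x = torch.tensor([0.0, 0.5, 1.5, 3.0])  # non-uniform sample points
y = x ** 2
(dy_dx,) = torch.gradient(y, spacing=(x,))
# Interior estimates are exact for a quadratic: close to 2*x there.
print(dy_dx)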

View File

@ -79,12 +79,12 @@ class DTypeWithConstraints:
* `quant_min_lower_bound` and `quant_max_upper_bound`: Lower and upper
bounds for the minimum and maximum quantized values respectively. If
the QConfig's `quant_min` and `quant_max` fall outside this range,
the QConfig’s `quant_min` and `quant_max` fall outside this range,
then the QConfig will be ignored.
* `scale_min_lower_bound` and `scale_max_upper_bound`: Lower and upper
bounds for the minimum and maximum scale values respectively. If the
QConfig's minimum scale value (currently exposed as `eps`) falls below
QConfig’s minimum scale value (currently exposed as `eps`) falls below
the lower bound, then the QConfig will be ignored. Note that the upper
bound is currently not enforced.
@ -130,7 +130,7 @@ class DTypeConfig:
dtypes here are the same as the semantics of the dtypes specified in
the observers.
These dtypes are matched against the ones specified in the user's
These dtypes are matched against the ones specified in the user’s
QConfig. If there is a match, and the QConfig satisfies the constraints
specified in the DTypeConfig (if any), then we will quantize the given
pattern using this DTypeConfig. Otherwise, the QConfig is ignored and

View File

@ -187,7 +187,7 @@ def full(sharding_spec: ShardingSpec,
process_group=None,
init_rrefs=False) -> ShardedTensor:
"""
Creates a :class:`ShardedTensor` filled with fill_value. The tensor's dtype
Creates a :class:`ShardedTensor` filled with fill_value. The tensor’s dtype
is inferred from fill_value. If dtype is specified, it will override the
inferred type from fill_value. Needs to be called on all ranks in an SPMD fashion.
Args:
@ -195,7 +195,7 @@ def full(sharding_spec: ShardingSpec,
describing how to shard the Tensor.
size (int...): a list, tuple, or `torch.Size` of integers defining the shape of the
output tensor.
fill_value (Scalar) - the value to fill the output tensor with.
fill_value (Scalar) – the value to fill the output tensor with.
Keyword args:
dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`).

View File

@ -117,7 +117,7 @@ def _handle_col_wise_sharding_base(
padding_idx: If specified, the entries at padding_idx do
not contribute to the gradient; therefore, the embedding
vector at padding_idx is not updated during training,
i.e. it remains as a fixed "pad".
i.e. it remains as a fixed “pad”.
Note that the embedding vector at padding_idx is
excluded from the reduction.
@ -312,7 +312,7 @@ def _handle_row_wise_mask(gather_inp, padding_idx, weight, world_size, rank):
padding_idx: If specified, the entries at padding_idx do
not contribute to the gradient; therefore, the embedding
vector at padding_idx is not updated during training,
i.e. it remains as a fixed "pad".
i.e. it remains as a fixed “pad”.
Note that the embedding vector at padding_idx is
excluded from the reduction.
weight: weight tensor of Embedding look-up table.

View File

@ -202,7 +202,7 @@ def _handle_col_wise_sharding(
padding_idx: If specified, the entries at padding_idx do
not contribute to the gradient; therefore, the embedding
vector at padding_idx is not updated during training,
i.e. it remains as a fixed "pad".
i.e. it remains as a fixed “pad”.
pg: process group.
Returns: final result of lookup.
@ -250,7 +250,7 @@ def _handle_row_wise_sharding(
padding_idx: If specified, the entries at padding_idx do
not contribute to the gradient; therefore, the embedding
vector at padding_idx is not updated during training,
i.e. it remains as a fixed "pad".
i.e. it remains as a fixed “pad”.
rank: # of cuda process.
pg: process group.

View File

@ -268,7 +268,7 @@ def _handle_col_wise_sharding(
padding_idx: If specified, the entries at padding_idx do
not contribute to the gradient; therefore, the embedding
vector at padding_idx is not updated during training,
i.e. it remains as a fixed "pad".
i.e. it remains as a fixed “pad”.
Note that the embedding vector at padding_idx is
excluded from the reduction.
pg: process group.
@ -342,7 +342,7 @@ def _handle_row_wise_sharding(
padding_idx: If specified, the entries at padding_idx do
not contribute to the gradient; therefore, the embedding
vector at padding_idx is not updated during training,
i.e. it remains as a fixed "pad".
i.e. it remains as a fixed “pad”.
Note that the embedding vector at padding_idx is
excluded from the reduction.
rank: # of cuda process.

View File

@ -124,7 +124,7 @@ class EtcdRendezvousHandler(RendezvousHandler):
| | (default 600s) |
+--------------------------------------------+--------------------------+
| last_call_timeout | additional wait amount |
| | ("last call") after min |
| | (“last call”) after min |
| | number of workers has |
| | been reached (defaults |
| | to 30s) |

View File

@ -4,7 +4,7 @@
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
"""Implements "Block Partitions of Sequences" by Imre B\u00e1r\u00e1ny et al.
"""Implements "Block Partitions of Sequences" by Imre Bárány et al.
Paper: https://arxiv.org/pdf/1308.2452.pdf
@ -18,7 +18,7 @@ def solve(sequence: List[int], partitions: int = 1) -> List[List[int]]:
"""Splits a sequence into several partitions to minimize variance for each
partition.
The result might not be optimal. However, it can be done only in O(kn\u00b3),
The result might not be optimal. However, it can be done only in O(kn³),
where k is the number of partitions and n is the length of the sequence.
"""
@ -51,14 +51,14 @@ def solve(sequence: List[int], partitions: int = 1) -> List[List[int]]:
while True:
"""
(1) Fix p element-of [k] with M(P) = bp. So Bp is a maximal block of P.
(1) Fix p ∈ [k] with M(P) = bp. So Bp is a maximal block of P.
"""
# max_size: M(P)
max_size, p = max(leaderboard())
while True:
"""
(2) If M(P) <= m(P) + 1, then stop.
(2) If M(P) ≤ m(P) + 1, then stop.
"""
# min_size: m(P)
min_size, q = min(leaderboard())
@ -67,7 +67,7 @@ def solve(sequence: List[int], partitions: int = 1) -> List[List[int]]:
return [sequence[i:j] for i, j in zip([0] + splits[:-1], splits)]
"""
(3) If M(P) > m(P) + 1, then let m(P) = bq for the q element-of [k] which is
(3) If M(P) > m(P) + 1, then let m(P) = bq for the q ∈ [k] which is
closest to p (ties broken arbitrarily). Thus Bq is a minimal block
of P. Let Bh be the block next to Bq between Bp and Bq. (Note that
Bh is a non-empty block: if it were, then m(P) = 0 and we should
@ -75,21 +75,21 @@ def solve(sequence: List[int], partitions: int = 1) -> List[List[int]]:
"""
if p < q:
"""
So either p < q and then h = q-1 and we define P * by moving
the last element from Bh = Bq-1 to Bq,
So either p < q and then h = q−1 and we define P ∗ by moving
the last element from Bh = Bq−1 to Bq,
"""
h = q - 1
splits[h] -= 1
else:
"""
or q < p, and then h = q + 1 and P * is obtained by moving the
or q < p, and then h = q + 1 and P ∗ is obtained by moving the
first element of Bh = Bq+1 to Bq.
"""
h = q + 1
splits[q] += 1
"""
Set P = P * . If p = h, then go to (1), else go to (2).
Set P = P ∗ . If p = h, then go to (1), else go to (2).
"""
if p == h:
break
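A usage sketch of the solver this docstring describes (the import path is assumed from this file's location in the tree):

from torch.distributed.pipeline.sync._balance.blockpartition import solve

# Partition six block sizes into two contiguous groups with balanced sums:
print(solve([1, 2, 3, 4, 5, 6], partitions=2))  # e.g. [[1, 2, 3, 4], [5, 6]]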

View File

@ -157,30 +157,30 @@ class Pipeline:
exc_info: Optional[ExcInfo] = None
# With checkpointing, the autograd graph looks like this diagram:
# +-----+------+
# | Copy |
# +-----+------+ (fence)
# - - - + - - - - - - - - -
# | (compute)
# +-----+------+
# | Wait | [1] Synchronize the current stream with the copy stream.
# +-----+------+
# +-----+------+
# | Checkpoint | [2] Compute a partition within checkpointing.
# +-----+------+
# +-----+------+
# | Wait | [3] Synchronize the copy stream with the current stream.
# +-----+------+
# + - - - +
# | +-----+-----+
# | | Recompute | [4] Schedule the recomputation at backpropagation.
# | +-----+-----+
# + - - - +
# |
# - - - + - - - - - - - - -
# +-----+------+ (fence)
# | Copy |
# +-----+------+
# ┌─────┸──────┐
# │ Copy │
# └─────┰──────┘ (fence)
# ─ ─ ─ ╂ ─ ─ ─ ─ ─ ─ ─ ─ ─
# ┃ (compute)
# ┌─────┸──────┐
# │ Wait │ [1] Synchronize the current stream with the copy stream.
# └─────┰──────┘
# ┌─────┸──────┐
# │ Checkpoint │ [2] Compute a partition within checkpointing.
# └─────┰──────┘
# ┌─────┸──────┐
# │ Wait │ [3] Synchronize the copy stream with the current stream.
# └─────┰──────┘
# ┠ ─ ─ ─ ┐
# ┃ ┌─────┴─────┐
# ┃ │ Recompute │ [4] Schedule the recomputation at backpropagation.
# ┃ └─────┬─────┘
# ┠ ─ ─ ─ ┘
# ┃
# ─ ─ ─ ╂ ─ ─ ─ ─ ─ ─ ─ ─ ─
# ┌─────┸──────┐ (fence)
# │ Copy │
# └─────┰──────┘
for i, j in schedule:
batch = batches[i]
partition = partitions[j]

View File

@ -9,7 +9,7 @@ autograd engine. The shared context of three functions (:class:`PortalBlue`,
:class:`PortalOrange`, and :class:`PortalCopy`) out of the computation graph is
one of the most important feature of :mod:`torchpipe.skip`.
The metaphor is inspired by Portal(tm) from Valve.
The metaphor is inspired by Portal™ from Valve.
"""
from typing import List, Optional, Tuple

View File

@ -362,16 +362,16 @@ def verify_skippables(module: nn.Sequential) -> None:
# Layer3 pops "1to3".
nn.Sequential(Layer1(), Layer2())
# +---- ?
# └──── ?
nn.Sequential(Layer2(), Layer3())
# ? ----+
# ? ────┘
nn.Sequential(Layer1(), Layer2(), Layer3(), Layer3())
# +-------------------+ ^^^^^^
# └───────────────────┘ ^^^^^^
nn.Sequential(Layer1(), Layer1(), Layer2(), Layer3())
# ^^^^^^ +-------------------+
# ^^^^^^ └───────────────────┘
To use the same name for multiple skip tensors, they must be isolated by
different namespaces. See :meth:`isolate()

View File

@ -152,7 +152,7 @@ class TGreatestUpperBound(Constraint):
self.rhs2 = rhs2
def __repr__(self):
return f'{self.res} = {self.rhs1}\u2294*{self.rhs2}'
return f'{self.res} = {self.rhs1}⊔*{self.rhs2}'
def __eq__(self, other):
if isinstance(other, TGreatestUpperBound):
@ -180,7 +180,7 @@ class DGreatestUpperBound(Constraint):
self.rhs2 = rhs2
def __repr__(self):
return f'{self.res} = {self.rhs1}\u2294{self.rhs2}'
return f'{self.res} = {self.rhs1}⊔{self.rhs2}'
def __eq__(self, other):
if isinstance(other, DGreatestUpperBound):

View File

@ -5,10 +5,10 @@ op_div = '/'
op_eq = '='
op_neq = '!='
op_imp = '=>'
op_matching = '\u22b3' # (contains)
op_matching = '⊳'
op_consistency = '~'
op_precision = '\u2291' # (square image of or equal to)
op_leq = '\u2264' # less-than or equal to
op_precision = '⊑'
op_leq = '≤'
op_lt = '<'
op_gt = '>'
op_mod = '%'

View File

@ -1450,7 +1450,7 @@ Keyword args:
out (Tensor, optional): output tensor. Ignored if `None`. Default: `None`.
dtype (:class:`torch.dtype`, optional): type used to perform the accumulation and the return.
If specified, :attr:`x` is cast to :attr:`dtype` before performing the operation,
and the returned tensor's type will be :attr:`dtype` if real and of its real counterpart if complex.
and the returned tensor’s type will be :attr:`dtype` if real and of its real counterpart if complex.
:attr:`dtype` may be complex if :attr:`x` is complex, otherwise it must be real.
:attr:`x` should be convertible without narrowing to :attr:`dtype`. Default: None

View File

@ -1012,7 +1012,7 @@ Args:
input (Tensor): the input tensor
dim (int or tuple of ints, optional): the dimension or dimensions to reduce.
Default: None that is equivalent to ``tuple(range(input.ndim))``.
unbiased (bool): when True, use Bessel's correction, otherwise, compute
unbiased (bool): when True, use Bessel’s correction, otherwise, compute
the uncorrected sample variance.
Keyword args:
@ -1148,7 +1148,7 @@ Args:
input (Tensor): the input tensor
dim (int or tuple of ints, optional): the dimension or dimensions to reduce.
Default: None that is equivalent to ``tuple(range(input.ndim))``.
unbiased (bool): when True, use Bessel's correction, otherwise, compute
unbiased (bool): when True, use Bessel’s correction, otherwise, compute
the uncorrected sample variance.
Keyword args:

View File

@ -210,7 +210,7 @@ ord (int, float, optional): the order of vector norm. Default: 2.
ord (int, float): the order of vector norm. Default: 2.
See :func:`torch.linalg.vector_norm` for a list of supported norms.""",
unbiased="""\
unbiased (bool): when True, use Bessel's correction, otherwise, compute
unbiased (bool): when True, use Bessel’s correction, otherwise, compute
the uncorrected sample variance.""",
eps="""\
eps (float, optional): small value to avoid division by zero. Default: {default}.""",

View File

@ -186,7 +186,7 @@ Example::
def nested_tensor(tensor_list, *, dtype=None, layout=None, device=None, requires_grad=False, pin_memory=False) -> Tensor:
r"""
Constructs a nested tensor with no autograd history (also known as a "leaf tensor", see
Constructs a nested tensor with no autograd history (also known as a “leaf tensor”, see
:ref:`Autograd mechanics <autograd-mechanics>`) from :attr:`tensor_list` a list of tensors.
Args:

View File

@ -20,7 +20,7 @@ class AdaptiveLogSoftmaxWithLoss(Module):
As described in
`Efficient softmax approximation for GPUs by Edouard Grave, Armand Joulin,
Moustapha Ciss\u00e9, David Grangier, and Herv\u00e9 J\u00e9gou
Moustapha Cissé, David Grangier, and Hervé Jégou
<https://arxiv.org/abs/1609.04309>`__.
Adaptive softmax is an approximate strategy for training models with large

View File

@ -204,7 +204,7 @@ class Conv1d(_ConvNd):
amount of implicit padding applied on both sides.
* :attr:`dilation` controls the spacing between the kernel points; also
known as the \u00e0 trous algorithm. It is harder to describe, but this `link`_
known as the à trous algorithm. It is harder to describe, but this `link`_
has a nice visualization of what :attr:`dilation` does.
{groups_note}
@ -341,7 +341,7 @@ class Conv2d(_ConvNd):
amount of implicit padding applied on both sides.
* :attr:`dilation` controls the spacing between the kernel points; also
known as the \u00e0 trous algorithm. It is harder to describe, but this `link`_
known as the à trous algorithm. It is harder to describe, but this `link`_
has a nice visualization of what :attr:`dilation` does.
{groups_note}
@ -483,7 +483,7 @@ class Conv3d(_ConvNd):
can be either a string {{'valid', 'same'}} or a tuple of ints giving the
amount of implicit padding applied on both sides.
* :attr:`dilation` controls the spacing between the kernel points; also known as the \u00e0 trous algorithm.
* :attr:`dilation` controls the spacing between the kernel points; also known as the à trous algorithm.
It is harder to describe, but this `link`_ has a nice visualization of what :attr:`dilation` does.
{groups_note}
@ -690,7 +690,7 @@ class ConvTranspose1d(_ConvTransposeNd):
* :attr:`output_padding` controls the additional size added to one side
of the output shape. See note below for details.
* :attr:`dilation` controls the spacing between the kernel points; also known as the \u00e0 trous algorithm.
* :attr:`dilation` controls the spacing between the kernel points; also known as the à trous algorithm.
It is harder to describe, but the link `here`_ has a nice visualization of what :attr:`dilation` does.
{groups_note}
@ -821,7 +821,7 @@ class ConvTranspose2d(_ConvTransposeNd):
* :attr:`output_padding` controls the additional size added to one side
of the output shape. See note below for details.
* :attr:`dilation` controls the spacing between the kernel points; also known as the \u00e0 trous algorithm.
* :attr:`dilation` controls the spacing between the kernel points; also known as the à trous algorithm.
It is harder to describe, but the link `here`_ has a nice visualization of what :attr:`dilation` does.
{groups_note}
@ -978,7 +978,7 @@ class ConvTranspose3d(_ConvTransposeNd):
* :attr:`output_padding` controls the additional size added to one side
of the output shape. See note below for details.
* :attr:`dilation` controls the spacing between the kernel points; also known as the \u00e0 trous algorithm.
* :attr:`dilation` controls the spacing between the kernel points; also known as the à trous algorithm.
It is harder to describe, but the link `here`_ has a nice visualization of what :attr:`dilation` does.
{groups_note}

View File

@ -41,7 +41,7 @@ class Fold(Module):
sides for :attr:`padding` number of points for each dimension before
reshaping.
* :attr:`dilation` controls the spacing between the kernel points; also known as the \u00e0 trous algorithm.
* :attr:`dilation` controls the spacing between the kernel points; also known as the à trous algorithm.
It is harder to describe, but this `link`_ has a nice visualization of what :attr:`dilation` does.
Args:
@ -186,7 +186,7 @@ class Unfold(Module):
sides for :attr:`padding` number of points for each dimension before
reshaping.
* :attr:`dilation` controls the spacing between the kernel points; also known as the \u00e0 trous algorithm.
* :attr:`dilation` controls the spacing between the kernel points; also known as the à trous algorithm.
It is harder to describe, but this `link`_ has a nice visualization of what :attr:`dilation` does.
Args:

View File

@ -49,7 +49,7 @@ class Tag(enum.Enum):
class PatchedPropertyBag(sarif.PropertyBag):
"""Key/value pairs that provide additional information about the object.
The definition of PropertyBag via SARIF spec is "A property bag is an object (section 3.6)
The definition of PropertyBag via SARIF spec is "A property bag is an object (§3.6)
containing an unordered set of properties with arbitrary names." However it is not
reflected in the json file, and therefore not captured by the python representation.
This patch adds additional **kwargs to the `__init__` method to allow recording

View File

@ -26,13 +26,13 @@ def export_as_test_case(
is as follows:
dir
\u251c\u2500\u2500 test_<name>
\u2502 \u251c\u2500\u2500 model.onnx
\u2502 \u2514\u2500\u2500 test_data_set_0
\u2502 \u251c\u2500\u2500 input_0.pb
\u2502 \u251c\u2500\u2500 input_1.pb
\u2502 \u251c\u2500\u2500 output_0.pb
\u2502 \u2514\u2500\u2500 output_1.pb
├── test_<name>
│   ├── model.onnx
│   └── test_data_set_0
│       ├── input_0.pb
│       ├── input_1.pb
│       ├── output_0.pb
│       └── output_1.pb
Args:
model_bytes: The ONNX model in bytes.
@ -80,13 +80,13 @@ def load_test_case(dir: str) -> Tuple[bytes, Any, Any]:
should be as follows:
dir
\u251c\u2500\u2500 test_<name>
\u2502 \u251c\u2500\u2500 model.onnx
\u2502 \u2514\u2500\u2500 test_data_set_0
\u2502 \u251c\u2500\u2500 input_0.pb
\u2502 \u251c\u2500\u2500 input_1.pb
\u2502 \u251c\u2500\u2500 output_0.pb
\u2502 \u2514\u2500\u2500 output_1.pb
├── test_<name>
│   ├── model.onnx
│   └── test_data_set_0
│       ├── input_0.pb
│       ├── input_1.pb
│       ├── output_0.pb
│       └── output_1.pb
Args:
dir: The directory containing the test case.

View File

@ -785,7 +785,7 @@ def nan_to_num(g: jit_utils.GraphContext, input, nan, posinf, neginf):
)
# For None values of posinf, neginf we use the greatest/lowest finite
# value representable by input's dtype.
# value representable by input’s dtype.
finfo = torch.finfo(input_dtype)
if posinf is None:
posinf = finfo.max
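The same defaults, shown with the eager op for comparison:

import torch

t = torch.tensor([float("nan"), float("inf"), -float("inf")])
# nan -> 0.0, +inf -> finfo.max, -inf -> finfo.min for the input's dtype
print(torch.nan_to_num(t))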

View File

@ -1379,10 +1379,10 @@ def normal(
pin_memory=None,
):
# If you can sample from a given distribution with mean 0 and variance 1, then you can easily sample from a
# scale-location transformation of that distribution, which has mean mu and variance sigma's square. If x is a sample
# scale-location transformation of that distribution, which has mean μ and variance σ's square. If x is a sample
# from a mean 0 and variance 1 distribution then
# sigma x+mu
# is a sample with mean mu and variance sigma's square.
# σx+μ
# is a sample with mean μ and variance σ's square.
if sizes is not None and not symbolic_helper._is_none(sizes):
mean = opset9.expand(g, mean, sizes, None)
result = opset9.mul(g, std, g.op("RandomNormalLike", mean))
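The scale-location transform in the comment, checked empirically (a sketch):

import torch

mu, sigma = 3.0, 2.0
x = torch.randn(100_000)         # x ~ N(0, 1)
sample = sigma * x + mu          # sample ~ N(mu, sigma^2)
print(sample.mean().item(), sample.std().item())  # ~3.0, ~2.0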

View File

@ -1020,7 +1020,7 @@ class GraphInfoPrettyPrinter:
else ""
)
return f"{node_count} {'X' if has_mismatch else chr(0x2713)} {error_node_kind}"
return f"{node_count} {'X' if has_mismatch else ''} {error_node_kind}"
@_beartype.beartype
def _graph_id_segment_str(self) -> str:
@ -1148,13 +1148,13 @@ class OnnxTestCaseRepro:
structure is as follows:
dir
\u251c\u2500\u2500 test_<name>
\u2502 \u251c\u2500\u2500 model.onnx
\u2502 \u2514\u2500\u2500 test_data_set_0
\u2502 \u251c\u2500\u2500 input_0.pb
\u2502 \u251c\u2500\u2500 input_1.pb
\u2502 \u251c\u2500\u2500 output_0.pb
\u2502 \u2514\u2500\u2500 output_1.pb
├── test_<name>
│   ├── model.onnx
│   └── test_data_set_0
│       ├── input_0.pb
│       ├── input_1.pb
│       ├── output_0.pb
│       └── output_1.pb
Args:
proto: ONNX model proto.
@ -1244,19 +1244,19 @@ class GraphInfo:
Example::
==================================== Tree: =====================================
5 X __2 X __1 \u2713
5 X __2 X __1 ✓
id: | id: 0 | id: 00
| |
| |__1 X (aten::relu)
| id: 01
|
|__3 X __1 \u2713
|__3 X __1 ✓
id: 1 | id: 10
|
|__2 X __1 X (aten::relu)
id: 11 | id: 110
|
|__1 \u2713
|__1 ✓
id: 111
=========================== Mismatch leaf subgraphs: ===========================
['01', '110']
@ -1354,13 +1354,13 @@ class GraphInfo:
The repro directory will contain the following files::
dir
\u251c\u2500\u2500 test_<name>
\u2502 \u251c\u2500\u2500 model.onnx
\u2502 \u2514\u2500\u2500 test_data_set_0
\u2502 \u251c\u2500\u2500 input_0.pb
\u2502 \u251c\u2500\u2500 input_1.pb
\u2502 \u251c\u2500\u2500 output_0.pb
\u2502 \u2514\u2500\u2500 output_1.pb
├── test_<name>
│   ├── model.onnx
│   └── test_data_set_0
│       ├── input_0.pb
│       ├── input_1.pb
│       ├── output_0.pb
│       └── output_1.pb
Args:
repro_dir: The directory to export the repro files to. Defaults to current
@ -1825,19 +1825,19 @@ def find_mismatch(
Greatest absolute difference: 0.2328854203224182 at index (1, 2) (up to 1e-07 allowed)
Greatest relative difference: 0.699536174352349 at index (1, 3) (up to 0.001 allowed)
==================================== Tree: =====================================
5 X __2 X __1 \u2713
5 X __2 X __1 ✓
id: | id: 0 | id: 00
| |
| |__1 X (aten::relu)
| id: 01
|
|__3 X __1 \u2713
|__3 X __1 ✓
id: 1 | id: 10
|
|__2 X __1 X (aten::relu)
id: 11 | id: 110
|
|__1 \u2713
|__1 ✓
id: 111
=========================== Mismatch leaf subgraphs: ===========================
['01', '110']

View File

@ -67,16 +67,13 @@ class Directory:
return "".join(str_list)
def _stringify_tree(
self,
str_list: List[str],
preamble: str = "",
dir_ptr: str = "\u2500\u2500\u2500 ",
self, str_list: List[str], preamble: str = "", dir_ptr: str = "─── "
):
"""Recursive method to generate print-friendly version of a Directory."""
space = " "
branch = "\u2502 "
tee = "\u251c\u2500\u2500 "
last = "\u2514\u2500\u2500 "
branch = " "
tee = "├── "
last = "└── "
# add this directory's representation
str_list.append(f"{preamble}{dir_ptr}{self.name}\n")

View File

@ -748,7 +748,7 @@ Computes the minimum 4-term Blackman-Harris window according to Nuttall.
.. math::
w_n = 1 - 0.36358 \cos{(z_n)} + 0.48917 \cos{(2z_n)} - 0.13659 \cos{(3z_n)} + 0.01064 \cos{(4z_n)}
where ``z_n = 2 \u03c0 n/ M``.
where ``z_n = 2 π n/ M``.
""",
"""
@ -766,12 +766,12 @@ Keyword args:
References::
- A. Nuttall, "Some windows with very good sidelobe behavior,"
- A. Nuttall, “Some windows with very good sidelobe behavior,”
IEEE Transactions on Acoustics, Speech, and Signal Processing, vol. 29, no. 1, pp. 84-91,
Feb 1981. https://doi.org/10.1109/TASSP.1981.1163506
- Heinzel G. et al., "Spectrum and spectral density estimation by the Discrete Fourier transform (DFT),
including a comprehensive list of window functions and some new flat-top windows",
- Heinzel G. et al., “Spectrum and spectral density estimation by the Discrete Fourier transform (DFT),
including a comprehensive list of window functions and some new flat-top windows”,
February 15, 2002 https://holometer.fnal.gov/GH_FFT.pdf
Examples::

View File

@ -1036,7 +1036,7 @@ hermite_polynomial_h = _add_docstr(_special.special_hermite_polynomial_h,
r"""
hermite_polynomial_h(input, n, *, out=None) -> Tensor
Physicist's Hermite polynomial :math:`H_{n}(\text{input})`.
Physicist’s Hermite polynomial :math:`H_{n}(\text{input})`.
If :math:`n = 0`, :math:`1` is returned. If :math:`n = 1`, :math:`\text{input}`
is returned. Otherwise, the recursion:
@ -1059,7 +1059,7 @@ hermite_polynomial_he = _add_docstr(_special.special_hermite_polynomial_he,
r"""
hermite_polynomial_he(input, n, *, out=None) -> Tensor
Probabilist's Hermite polynomial :math:`He_{n}(\text{input})`.
Probabilist’s Hermite polynomial :math:`He_{n}(\text{input})`.
If :math:`n = 0`, :math:`1` is returned. If :math:`n = 1`, :math:`\text{input}`
is returned. Otherwise, the recursion:

View File

@ -0,0 +1 @@