Mirror of https://github.com/zebrajr/pytorch.git (synced 2025-12-06 12:20:52 +01:00)
[BE][13/16] fix typos in torch/ (torch/ao/) (#156603)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/156603
Approved by: https://github.com/msaroufim
This commit is contained in: parent 1913c915e0, commit f8293116f5
@@ -1173,7 +1173,6 @@ exclude_patterns = [
 'test/distributed/**',
 'torch/**',
 'torch/_*/**',
-'torch/ao/**',
 'torch/fx/**',
 'torch/distributed/tensor/**',
 'torch/[j-o]*/**',
@@ -18,6 +18,7 @@ NowNs
 optins
 OT
 overrideable
 padD
 ptd
 rebuild
 rebuilt
@@ -214,7 +214,7 @@ class MultiheadAttention(nn.MultiheadAttention):
 fp.bias_v = nn.Parameter(self.bias_v.dequantize())

 # Set the linear weights
-# Note: Because the linear layers are quantized, mypy does not nkow how
+# Note: Because the linear layers are quantized, mypy does not know how
 # to deal with them -- might need to ignore the typing checks.
 # for the type: ignore[has-type], see https://github.com/pytorch/pytorch/issues/58969
 w, b = self.out_proj._weight_bias() # type: ignore[operator, has-type]
@@ -15,7 +15,7 @@ def _is_valid_linear_block_sparse_pattern(

 # This is a stop-gap measure as current flow does not allow module
 # specific block sparse pattern.
-# Infact there is no way to convey sparse pattern via module config
+# In fact there is no way to convey sparse pattern via module config
 # of quantization flow. Thus using the global context to convey
 # sparsity pattern.
 # Once the flow supports it, this should be removed.
@@ -1124,7 +1124,7 @@ def create_a_shadows_b(
 # (prev_node_c+) -> (logger_c_input)? -> node_start_c -> ... -> node_end_c -> logger_c
 #
 # Note: node_start_c may be the same node as node_end_c, or they
-# may have nodes inbetween.
+# may have nodes in between.

 else:
 env_c[node_b.name] = graph_c.node_copy(node_b, load_arg)
@@ -109,7 +109,7 @@ class QConfigMultiMapping:
 target_qconfigs_dict[key] = None
 break

-# insert copies of this new QConfigMapping until all entires
+# insert copies of this new QConfigMapping until all entries
 # in qconfig_list can fit among the QConfigMappings
 while len(qconfig_list) > len(self.qconfig_mappings_list):
 self.qconfig_mappings_list.append(copy.deepcopy(new_qconfig_mapping))
@@ -159,7 +159,7 @@ class ActivationSparsifier:
 if data is None:
 out_data = [
 0 for _ in range(0, len(features))
-] # create one incase of 1st forward
+] # create one in case of 1st forward
 self.state[name]["mask"] = [0 for _ in range(0, len(features))]
 else:
 out_data = data # a list
@@ -14,7 +14,7 @@ The [DataNormSparsifier](https://github.com/pytorch/pytorch/blob/main/torch/ao/p
 3. Norm: L1 and L2

 ## Dataset
-The benchmarks are created for the dlrm model on the Kaggle CriteoDataset which can be downloaded from [here](https://ailab.criteo.com/ressources/) or [here](https://figshare.com/articles/dataset/Kaggle_Display_Advertising_Challenge_dataset/5732310/1).
+The benchmarks are created for the dlrm model on the Kaggle CriteoDataset which can be downloaded from [here](https://ailab.criteo.com/ressources/) or [here](https://figshare.com/articles/dataset/Kaggle_Display_Advertising_Challenge_dataset/5732310/1). <!-- codespell:ignore -->

 ## Results
 1. **Disk Usage**: Introducing sparsity in the embeddings reduces file size after compression. The compressed model size goes down from 1.9 GB to 150 MB after 100% sparsity.
@@ -34,7 +34,7 @@ The takeaway is that the dlrm model with sparse coo tensor is slower (roughly 2x
 ## Setup
 The benchmark codes depend on the [DLRM codebase](https://github.com/facebookresearch/dlrm).
 1. Clone the dlrm git repository
-2. Download the dataset from [here](https://ailab.criteo.com/ressources/) or [here](https://figshare.com/articles/dataset/Kaggle_Display_Advertising_Challenge_dataset/5732310/1)
+2. Download the dataset from [here](https://ailab.criteo.com/ressources/) or [here](https://figshare.com/articles/dataset/Kaggle_Display_Advertising_Challenge_dataset/5732310/1) <!-- codespell:ignore -->
 3. The DLRM model can be trained using the following script
 ```
 # Make sure you go into the file and make sure that the path to dataset is correct.
@@ -199,7 +199,7 @@ class TestTrainingAwareCallback(TestCase):
 do not want as the config of each layer changes after
 .step()

-Hence, we need to dump and restore the state_dict() everytime because we're
+Hence, we need to dump and restore the state_dict() every time because we're
 copying the model after each epoch.
 Hence, it is essential to make sure that the sparsifier's state_dict() is being
 correctly dumped and restored.
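The dump-and-restore pattern the docstring describes looks roughly like the sketch below; the model, tensor fqn, and sparsifier settings are made-up placeholders, assuming the `BaseSparsifier` `state_dict()`/`load_state_dict()` API:

```python
import copy

import torch
from torch.ao.pruning import WeightNormSparsifier

model = torch.nn.Sequential(torch.nn.Linear(16, 16))
sparsifier = WeightNormSparsifier(sparsity_level=0.5)
sparsifier.prepare(model, config=[{"tensor_fqn": "0.weight"}])
sparsifier.step()  # mutates each layer's config, hence the dump/restore

# dump before copying the model, restore afterwards, so the sparsifier
# state stays in sync with the copy made after each epoch
saved_state = copy.deepcopy(sparsifier.state_dict())
model_copy = copy.deepcopy(model)
sparsifier.load_state_dict(saved_state)
```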
@@ -11,7 +11,7 @@ __all__ = ["FPGMPruner"]

 class FPGMPruner(BaseStructuredSparsifier):
 r"""Filter Pruning via Geometric Median (FPGM) Structured Pruner
-This sparsifier prune fliter (row) in a tensor according to distances among filters according to
+This sparsifier prunes filters (rows) in a tensor according to their distances from the other filters, following
 `Filter Pruning via Geometric Median for Deep Convolutional Neural Networks Acceleration <https://arxiv.org/abs/1811.00250>`_.

 This sparsifier is controlled by three variables:
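The geometric-median criterion can be illustrated with a short sketch (an illustration of the idea, not the FPGMPruner code): filters whose total distance to all other filters is smallest are the most redundant, so they are pruned first. The L2 distance and the 50% pruning ratio here are arbitrary choices:

```python
import torch

def fpgm_scores(weight: torch.Tensor) -> torch.Tensor:
    flat = weight.flatten(1)              # one flattened filter per row
    dists = torch.cdist(flat, flat, p=2)  # pairwise distances between filters
    return dists.sum(dim=1)               # small sum => near the geometric median

conv = torch.nn.Conv2d(3, 8, kernel_size=3)
scores = fpgm_scores(conv.weight.detach())
num_to_prune = conv.out_channels // 2
pruned = scores.argsort()[:num_to_prune]  # most redundant filters first
mask = torch.ones(conv.out_channels, dtype=torch.bool)
mask[pruned] = False
```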
@@ -7,7 +7,7 @@ class SaliencyPruner(BaseStructuredSparsifier):
 Prune rows based on the saliency (L1 norm) of each row.

 This pruner works on N-Dimensional weight tensors.
-For each row, we will calculate the saliency, whic is the sum the L1 norm of all weights in that row.
+For each row, we will calculate the saliency, which is the sum of the L1 norms of all weights in that row.
 We expect that the resulting saliency vector has the same shape as our mask.
 We then pick elements to remove until we reach the target sparsity_level.
 """
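As a concrete sketch of the saliency computation described above (shapes and sparsity level are arbitrary; this is not the pruner's actual mask-update code):

```python
import torch

weight = torch.randn(8, 16)            # one row per output unit
saliency = weight.abs().sum(dim=1)     # per-row L1 norm, same shape as the mask
sparsity_level = 0.5
num_to_remove = int(sparsity_level * weight.shape[0])
to_remove = saliency.argsort()[:num_to_remove]  # lowest-saliency rows first
mask = torch.ones(weight.shape[0], dtype=torch.bool)
mask[to_remove] = False
```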
@@ -54,7 +54,7 @@ class AdaptiveRoundingLoss(torch.nn.Module):
 1 + np.cos(rel_iter * np.pi)
 )

-# A rectified sigmoid for soft-quantization as formualted [23] in https://arxiv.org/pdf/2004.10568.pdf
+# A rectified sigmoid for soft-quantization as formulated [23] in https://arxiv.org/pdf/2004.10568.pdf
 h_alpha = torch.clamp(
 torch.sigmoid(V) * (ADAROUND_ZETA - ADAROUND_GAMMA) + ADAROUND_GAMMA,
 min=0,
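For reference, the rectified sigmoid this comment describes can be written as a standalone sketch; the stretch constants ζ = 1.1 and γ = −0.1 and the regularizer follow the AdaRound paper, while the function names here are illustrative, not the file's actual implementation:

```python
import torch

ADAROUND_ZETA = 1.1    # stretch constants from the AdaRound paper
ADAROUND_GAMMA = -0.1

def rectified_sigmoid(V: torch.Tensor) -> torch.Tensor:
    # h(V) = clip(sigmoid(V) * (zeta - gamma) + gamma, 0, 1), eq. (23)
    return torch.clamp(
        torch.sigmoid(V) * (ADAROUND_ZETA - ADAROUND_GAMMA) + ADAROUND_GAMMA,
        min=0,
        max=1,
    )

def round_regularizer(V: torch.Tensor, beta: float) -> torch.Tensor:
    # pushes each h(V) toward exactly 0 or 1 as beta is annealed downward
    return (1 - (2 * rectified_sigmoid(V) - 1).abs().pow(beta)).sum()
```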
@@ -107,7 +107,7 @@ class AdaptiveRoundingOptimizer:
 )
 if torch.cuda.is_available():
 # Somehow, we need to move the model continuously
-# Otherwise, the model will be lowered to CPU misteriously
+# Otherwise, the model will be lowered to CPU mysteriously
 self.model = self.model.cuda()
 self.q_model = self.q_model.cuda()
 for data_ in data:
@@ -296,7 +296,7 @@ BackendConfig(nniqat.LinearReLU)

 Pattern in this case is the same as before, it defines the pattern for the subgraph we are dealing with

-`set_observation_type`: sets the observation type for the patter, currently only two types:
+`set_observation_type`: sets the observation type for the pattern, currently only two types:

 `OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT` means the output observer instance will be different from the input, which is the most common type of observer placement.

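To make the description concrete, here is a minimal sketch of setting an observation type on a pattern config; the `nn.Linear` pattern and the dtype values are illustrative assumptions, not this backend's actual configuration:

```python
import torch
import torch.nn as nn
from torch.ao.quantization.backend_config import (
    BackendPatternConfig,
    DTypeConfig,
    ObservationType,
)

dtype_config = DTypeConfig(
    input_dtype=torch.quint8,
    output_dtype=torch.quint8,
    weight_dtype=torch.qint8,
    bias_dtype=torch.float,
)

linear_config = (
    BackendPatternConfig(nn.Linear)
    # output gets its own observer instance, the most common placement
    .set_observation_type(ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT)
    .add_dtype_config(dtype_config)
)
```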
@@ -8,10 +8,10 @@ ModelReport
 Most detectors require a **traceable GraphModule**, but some (ex. `PerChannelDetector`) require just an `nn.Module`.

 #### Typical Fx Workflow
-- Initialize model → Prepare model → Callibrate model → Convert model → ...
+- Initialize model → Prepare model → Calibrate model → Convert model → ...

 #### Fx Workflow with ModelReport
-- Initialize model → Prepare model → **Add detector observers** → Callibrate model → **Generate report** → **Remove detector observers** → Convert model → ...
+- Initialize model → Prepare model → **Add detector observers** → Calibrate model → **Generate report** → **Remove detector observers** → Convert model → ...

 > ⚠️ **You can only prepare and remove observers once with a given ModelReport Instance**: Be very careful here!

@@ -23,7 +23,7 @@ This snippet should be ready to copy, paste, and use with the exception of a few
 # prep model
 qconfig_mapping = torch.ao.quantization.get_default_qconfig_mapping()
 model = Model() # TODO define model
-example_input = torch.randn((*args)) # TODO get example data for callibration
+example_input = torch.randn((*args)) # TODO get example data for calibration
 prepared_model = quantize_fx.prepare_fx(model, qconfig_mapping, example_input)

 # create ModelReport instance and insert observers
@@ -31,8 +31,8 @@ detector_set = set([DynamicStaticDetector()]) # TODO add all desired detectors
 model_report = ModelReport(model, detector_set)
 ready_for_callibrate = model_report.prepare_detailed_callibration()

-# callibrate model and generate report
-ready_for_callibrate(example_input) # TODO run callibration of model with relevant data
+# calibrate model and generate report
+ready_for_callibrate(example_input) # TODO run calibration of model with relevant data
 reports = model_report.generate_model_report(remove_inserted_observers=True)
 for report_name in reports.keys():
 text_report, report_dict = reports[report_name]
@@ -46,7 +46,7 @@ mod_rep_visualizer.generate_table_visualization() # shows collected data as a ta
 ```

 There is a tutorial in the works that will walk through a full usage of the ModelReport API.
-This tutorial will show the ModelReport API being used on toy model in both an Fx Graph Mode workflow and an alterative workflow with just a traceable model.
+This tutorial will show the ModelReport API being used on a toy model in both an Fx Graph Mode workflow and an alternative workflow with just a traceable model.
 This README will be updated with a link to the tutorial upon completion of the tutorial.

 # Key Modules Overview
@@ -60,7 +60,7 @@ There are three primary methods to be familiar with when using the ModelReport c
 This is so that we can keep track of where we want to insert observers on a detector by detector basis and also keep track of which detectors to generate reports for.
 - `prepare_detailed_calibration(self)` → `GraphModule` inserts observers into the locations specified by each detector in the model.
 It then returns the GraphModule with the detectors inserted into both the regular module structure as well as the node structure.
-- `generate_model_report(self, remove_inserted_observers: bool)` → `Dict[str, Tuple[str, Dict]]` uses callibrated GraphModule to optionally removes inserted observers, and generate, for each detector the ModelReport instance was initialized with:
+- `generate_model_report(self, remove_inserted_observers: bool)` → `Dict[str, Tuple[str, Dict]]` uses the calibrated GraphModule to optionally remove inserted observers and generate, for each detector the ModelReport instance was initialized with:
 - A string-based report that is easily digestible and actionable explaining the data collected by relevant observers for that detector
 - A dictionary containing statistics collected by the relevant observers and values calculated by the detector for further analysis or plotting

@@ -107,7 +107,7 @@ For both of the two things listed above, you can filter the data by either `modu
 To get a list of all the modules or features, you can call `mod_rep_visualizer.get_all_unique_module_fqns()`
 and `mod_rep_visualizer.get_all_unique_feature_names()` respectively.
 For the features, because some features are not plottable, you can set the flag to only get plottable features
-in the aformentioned `get_all_unique_feature_names` method.
+in the aforementioned `get_all_unique_feature_names` method.

 ## Detector Overview

@@ -152,7 +152,7 @@ The statistics collected by the `ModelReportObserver` include:
 - Ratio of 100th percentile to some *n*th percentile
 - Number of constant value batches to pass through each channel

-After the `ModelReportObserver` collects the statistics above during the callibration process, the detectors then extract the information they need to generate their reports from the relevant observers.
+After the `ModelReportObserver` collects the statistics above during the calibration process, the detectors then extract the information they need to generate their reports from the relevant observers.

 ### Using Your Own Observer

@@ -36,7 +36,7 @@ class ModelReport:
 - Suggestions for outlier detection for all layers (Graph Modules)

 The ModelReport class has the primary functionality of inserting observers (primarily the ModelReportObserver)
-where needed for each detector to gather the information it needs, and then after callibration, the ModelReport
+where needed for each detector to gather the information it needs, and then after calibration, the ModelReport
 class compiles the report generated by each Detector class into a single report to return to the user. It also
 has the capability to remove all the observers it inserted as well.

@@ -70,7 +70,7 @@ class ModelReport:
 1.) Initialize ModelReport object with reports of interest by passing in initialized detector objects and model
 2.) Prepare your model with prepare_fx
 3.) Call model_report.prepare_detailed_calibration to add relevant observers
-4.) Callibrate your model with data
+4.) Calibrate your model with data
 5.) Call model_report.generate_report on your model to generate report and optionally remove added observers
 Optional
 6.) Call model_report.generate_visualizer to get a ModelReportVisualizer instance
@@ -102,7 +102,7 @@ class ModelReport:
 ... )
 >>> tracer_reporter = ModelReport(graph_module, tracer_detector_set)

->>> # now we insert the observers and callibrate the model
+>>> # now we insert the observers and calibrate the model
 >>> tracer_model_with_observers = tracer_reporter.prepare_detailed_calibration()
 >>> for i in range(num_callibration_batches):
 >>> example_input = get_callibration_input()
@@ -179,7 +179,7 @@ class ModelReport:
 # if already prepared once, cannot prepare again
 if self._prepared_flag:
 raise ValueError(
-"Already ran preparing detailed callibration. Run the report generation next after callibration."
+"Already ran preparing detailed calibration. Run the report generation next after calibration."
 )

 # loop through each detector, find where placements should be, and keep track
@@ -271,7 +271,7 @@ class ModelReport:
 Generates all the requested reports.

 Note:
-You should have callibrated the model with relevant data before calling this
+You should have calibrated the model with relevant data before calling this

 The reports generated are specified by the desired_reports specified in desired_reports

@@ -286,12 +286,12 @@ class ModelReport:

 Note:
 Throws exception if we try to generate report on model we already removed observers from
-Throws exception if we try to generate report without preparing for callibration
+Throws exception if we try to generate report without preparing for calibration
 """
-# if we haven't prepped model for callibration, then we shouldn't generate report yet
+# if we haven't prepped model for calibration, then we shouldn't generate report yet
 if not self._prepared_flag:
 raise Exception( # noqa: TRY002
-"Cannot generate report without preparing model for callibration"
+"Cannot generate report without preparing model for calibration"
 )

 # if we already removed the observers, we cannot generate report
|
@ -546,12 +546,12 @@ class ModelReport:
|
|||
|
||||
Note:
|
||||
Throws exception if we try to generate mapping on model we already removed observers from
|
||||
Throws exception if we try to generate mapping without preparing for callibration
|
||||
Throws exception if we try to generate mapping without preparing for calibration
|
||||
"""
|
||||
# if we haven't prepped model for callibration, then we shouldn't generate mapping yet
|
||||
# if we haven't prepped model for calibration, then we shouldn't generate mapping yet
|
||||
if not self._prepared_flag:
|
||||
raise Exception( # noqa: TRY002
|
||||
"Cannot generate report without preparing model for callibration"
|
||||
"Cannot generate report without preparing model for calibration"
|
||||
)
|
||||
|
||||
# if we already removed the observers, we cannot mapping
|
||||
|
|
@@ -600,7 +600,7 @@ class ModelReport:

 Note:
 Throws exception if we try to generate mapping on model we already removed observers from
-Throws exception if we try to generate mapping without preparing for callibration
+Throws exception if we try to generate mapping without preparing for calibration
 """
 # get the mapping info
 detector_qconfig_info_combined = (
@@ -63,7 +63,7 @@ class ModelReportVisualizer:
 1.) Initialize ModelReport object with reports of interest by passing in initialized detector objects
 2.) Prepare your model with prepare_fx
 3.) Call model_report.prepare_detailed_calibration on your model to add relevant observers
-4.) Callibrate your model with data
+4.) Calibrate your model with data
 5.) Call model_report.generate_report on your model to generate report and optionally remove added observers
 6.) Use output of model_report.generate_report to initialize ModelReportVisualizer instance
 7.) Use instance to view different views of data as desired, applying filters as needed
@@ -1107,7 +1107,7 @@ def _maybe_insert_output_observer_for_node(
 )
 target_dtype, target_is_dynamic = _get_dtype_and_is_dynamic(output_act_obs_or_fq)
 # uncomment after we support reuse_input_obs_or_fq properly by having separate
-# implemntations for this key instead of reusing the input_output_share_observers
+# implementations for this key instead of reusing the input_output_share_observers
 # code
 # reuse_input_obs_or_fq = node.meta["target_dtype_info"].get("reuse_input_obs_or_fq", False)
 # for now we set this to False since reuse_input_obs_or_fq for
@@ -1117,7 +1117,7 @@ def _maybe_insert_output_observer_for_node(
 reuse_input_obs_or_fq = False

 # Note: prev_output_dtype = torch.float and prev_output_is_dynamic=False
-# because the prev_output is the output of an fp32 op, althought technically
+# because the prev_output is the output of an fp32 op, although technically
 # we should get the dtype of the output from node.meta["val"] in the future
 # if we deprecate fx graph mode quantization
 needs_obs_or_fq = _needs_obs_or_fq(
@@ -2002,7 +2002,7 @@ def prepare(
 same as input_quantized_idxs configuration provided
 for the standalone module
 standalone_module_output_quantized_idxs(List[Int]): a list of
-indexs for the graph output that is quantized
+indices for the graph output that is quantized
 same as input_quantized_idxs configuration provided
 for the standalone module
 """
@@ -190,7 +190,7 @@ def get_new_attr_name_with_prefix(prefix: str) -> Callable:


 def collect_producer_nodes(node: Node) -> Optional[list[Node]]:
-r"""Starting from a target node, trace back until we hit inpu or
+r"""Starting from a target node, trace back until we hit input or
 getattr node. This is used to extract the chain of operators
 starting from getattr to the target node, for example
 def forward(self, x):
@@ -358,7 +358,7 @@ class UniformQuantizationObserverBase(ObserverBase):
 # Functionally equivalent to 'determine_qparams' in utils.py. Observers must be torchscriptable however and qscheme
 # as far as I can tell is not allowed to be passed as a parameter in torchscript functions. This makes refactoring observer
 # to use this utility a massive pain and very gross. For now Im opting just to duplicate as this code
-# seems unlikey to change (last update over 1 year ago) and when torchscript is fully deprecated we can refactor.
+# seems unlikely to change (last update over 1 year ago) and when torchscript is fully deprecated we can refactor.
 # TODO(jakeszwe, jerryzh168)
 if not check_min_max_valid(min_val, max_val):
 return torch.tensor([1.0], device=min_val.device.type), torch.tensor(
@@ -1866,7 +1866,7 @@ class AffineQuantizedObserverBase(ABC, torch.nn.Module):
 Converts the observer node in the graph into its quantized representation

 Args:
-model: graph module to conver the observer node in
+model: graph module to convert the observer node in
 observer_node: the observer node to convert
 """
 from torch.ao.quantization.fx.utils import create_getattr_from_value
@@ -1,6 +1,6 @@
 # copied from https://github.com/pytorch/ao/blob/main/torchao/quantization/observer.py
 # and https://github.com/pytorch/ao/blob/main/torchao/quantization/quant_primitives.py
-# PLESE DON'T MODIFY THIS FILE SO THAT WE DON'T GET OUT OF SYNC
+# PLEASE DON'T MODIFY THIS FILE SO THAT WE DON'T GET OUT OF SYNC
 import logging
 from abc import ABCMeta
 from typing import Any, Optional, Union
@@ -469,7 +469,7 @@ def _quantize_affine_no_dtype_cast(
 1. figure out the dimension for reduction based on block_size, also reshape the input to align with
 the shape after reduction
 2. quantize the input based on the quantization parameters scale and zero_point and args like zero_point_domain
-3. reshape the quantized result to origianl shape
+3. reshape the quantized result to original shape
 """
 # TODO: validations
 # TODO: validate scale/zero_point dimensions are compatible with block_size
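Ignoring the block-size reshaping (step 1), steps 2 and 3 amount to the usual affine mapping. A simplified per-tensor sketch, where the helper names and the int8 range are assumptions for illustration:

```python
import torch

def quantize_affine_sketch(x, scale, zero_point, quant_min=-128, quant_max=127):
    # step 2: divide by scale, shift by zero_point, round, clamp to the range
    return torch.clamp(torch.round(x / scale) + zero_point, quant_min, quant_max)

def dequantize_affine_sketch(q, scale, zero_point):
    return (q - zero_point) * scale

x = torch.randn(4, 8)
scale = x.abs().max() / 127
q = quantize_affine_sketch(x, scale, 0)
x_hat = dequantize_affine_sketch(q, scale, 0)
assert (x - x_hat).abs().max() <= scale  # round-trip error bounded by one step
```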
@@ -619,7 +619,7 @@ def _dequantize_affine_no_dtype_check(
 1. figure out the dimension for reduction based on block_size, also reshape the input to align with
 the shape after reduction
 2. dequantize the input based on the quantization parameters scale and zero_point and args like zero_point_domain
-3. reshape the quantized result to origianl shape and change dtype to the output_dtype
+3. reshape the quantized result to original shape and change dtype to the output_dtype
 """
 assert len(block_size) == input.dim(), (
 f"Got input dim:{input.dim()}, block_size: {block_size}"
@@ -177,19 +177,19 @@ class PortNodeMetaForQDQ(PassBase):
 - Example 1:
 - Original: [Conv -> AvgPool -> Linear]
 - Quantized [Q-> DQ -> Conv -> Q -> DQ -> AvgPool -> Q -> DQ -> Linear -> Q -> DQ]
-- Inner brackets specify which nodes Q/DQ inherit metdata from
+- Inner brackets specify which nodes Q/DQ inherit metadata from
 - [Q-> [DQ -> Conv -> Q] -> [DQ -> AvgPool -> Q] -> [DQ -> Linear -> Q] -> DQ]
 - Note first Q and last DQ do not inherit metadata from any nodes
 - Example 2:
 - Original: [Conv -> AvgPool -> Linear]
 - AvgPool is not quantized
 - Quantized [Q-> DQ -> Conv -> Q -> DQ -> AvgPool -> Q -> DQ -> Linear -> Q -> DQ]
-- Inner brackets specify which nodes Q/DQ inherit metdata from
+- Inner brackets specify which nodes Q/DQ inherit metadata from
 - [Q-> [DQ -> Conv -> Q] -> DQ -> [AvgPool] -> Q -> [DQ -> Linear -> Q] -> DQ]
 - Note DQ and Q nodes around AvgPool do not inherit metadata from AvgPool because
 AvgPool was not supposed to be quantized. Metadata porting relies on quantization_annotation
-on the nodes (in this case AvgPool node) to conclude if the node or patter was
-supposed to be quantized. And subsequntly decide if the preceding Q, if any, should
+on the nodes (in this case AvgPool node) to conclude if the node or pattern was
+supposed to be quantized. And subsequently decide if the preceding Q, if any, should
 inherit metadata from AvgPool.
 - Dynamically quantized patterns:
 - Input that are dynamically quantized have choose_qparams, quantize and dequantize nodes
@@ -275,7 +275,7 @@ def _get_edge_or_node_to_group_id(

 _update_shared_with(input_edge, qspec, shared_with_map)

-# now that we get the sharing relations between all edges and nodes, we can assingn group ids
+# now that we get the sharing relations between all edges and nodes, we can assign group ids
 cur_group_id = 0
 edge_or_node_to_group_id: dict[EdgeOrNode, int] = {}
 for edge_or_node in shared_with_map.keys():
@@ -876,7 +876,7 @@ def _fold_conv_bn_qat(m: GraphModule) -> GraphModule:
 m, F.conv_transpose2d, _quantized_conv2d_bn_example_inputs, is_cuda=is_cuda
 )

-# remove in place add from batchnorm tracking traning stats
+# remove in place add from batchnorm tracking training stats
 for node in m.graph.nodes:
 if (
 node.target == torch.ops.aten.add_.Tensor
@@ -300,7 +300,7 @@ def _reference_quantized_conv2d(
 # Out_(i, j)_fp32 = ((X_scale * W_scale) * Sum_(over k)[(X_(i, k)_fp32 - X_zp) * (W_(i, k)_fp32 - W_zp)]) + bias_(i)_fp32
 # In order to fold the addition of bias_(i)_fp32 inside, we must do
 # Out_(i, j)_fp32 = (X_scale * W_scale) * (Sum_(over k)[(X_(i, k)_fp32 - X_zp) * (W_(i, k)_fp32 - W_zp)] + (1 / (X_scale * W_scale)) * bias_(i)_fp32) # noqa: B950
-# Note we had to multiply bias_fp32 qith X_scale * W_scale = bias_scale
+# Note we had to multiply bias_fp32 with X_scale * W_scale = bias_scale
 # Thus bias quantization to int32 must be with X_scale * W_scale

 bias_i32 = out_dtype(torch.ops.aten.div.Tensor, torch.int32, bias_fp32, bias_scale)
@@ -436,7 +436,7 @@ def _reference_quantized_add(
 x_fp32 = (x_i8 - x_zero_point) * x_scale (3)
 y_fp32 = (y_i8 - y_zero_point) * y_scale (4)

-# applying the above fomula to the out_i8 equation we can get the following:
+# applying the above formula to the out_i8 equation we can get the following:
 out_i8 = out_fp32 / out_scale + out_zero_point # (1)
 = (x_f32 + y_f32) / out_scale + out_zero_point # applying (2) to substitute out_fp32 with x_fp32 + y_fp32
 = ((x_i8 - x_zero_point) * x_scale + (y_i8 - y_zero_point) * y_scale) / out_scale + out_zero_point # apply (3) and (4)
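The substitution can be sanity-checked numerically; the scales and zero points below are arbitrary example values:

```python
import torch

x_scale, x_zp = 0.05, 10
y_scale, y_zp = 0.03, -4
out_scale, out_zp = 0.07, 2

x_i8 = torch.randint(-128, 128, (16,)).float()
y_i8 = torch.randint(-128, 128, (16,)).float()

x_fp32 = (x_i8 - x_zp) * x_scale                # (3)
y_fp32 = (y_i8 - y_zp) * y_scale                # (4)
out_a = (x_fp32 + y_fp32) / out_scale + out_zp  # (1) with (2) substituted

# fully substituted form from the last line of the derivation
out_b = ((x_i8 - x_zp) * x_scale + (y_i8 - y_zp) * y_scale) / out_scale + out_zp
assert torch.allclose(out_a, out_b)
```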
@@ -185,7 +185,7 @@ def _prepare_standalone_module_fx(
 same as input_quantized_idxs configuration provided
 for the standalone module
 * `standalone_module_output_quantized_idxs(List[Int])`: a list of
-indexs for the graph output that is quantized
+indices for the graph output that is quantized
 same as input_quantized_idxs configuration provided
 for the standalone module

@@ -76,7 +76,7 @@ def prepare_pt2e(

 # Step 1. program capture
 # NOTE: this API will be updated to torch.export API in the future, but the captured
-# result shoud mostly stay the same
+# result should mostly stay the same
 m = torch.export.export_for_training(m, *example_inputs).module()
 # we get a model with aten ops

@@ -153,7 +153,7 @@ def prepare_qat_pt2e(

 # Step 1. program capture
 # NOTE: this API will be updated to torch.export API in the future, but the captured
-# result shoud mostly stay the same
+# result should mostly stay the same
 m = torch.export.export_for_training(m, *example_inputs).module()
 # we get a model with aten ops

@@ -218,7 +218,7 @@ def convert_pt2e(

 Args:
 * `model` (torch.fx.GraphModule): calibrated/trained model
-* `use_reference_representation` (bool): boolean flag to indicate whether to produce referece representation or not
+* `use_reference_representation` (bool): boolean flag to indicate whether to produce reference representation or not
 * `fold_quantize` (bool): boolean flag for whether fold the quantize op or not

 Returns:
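Putting capture, prepare, calibration, and convert together, the documented PT2E flow reads roughly as below; the toy model, single calibration batch, and choice of XNNPACKQuantizer (deprecated, see further down) are placeholders:

```python
import torch
from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e
from torch.ao.quantization.quantizer.xnnpack_quantizer import (
    XNNPACKQuantizer,
    get_symmetric_quantization_config,
)

model = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.ReLU()).eval()
example_inputs = (torch.randn(1, 8),)

# Step 1: program capture -> GraphModule with aten ops
m = torch.export.export_for_training(model, example_inputs).module()

# Step 2: insert observers where the quantizer's annotations ask for them
quantizer = XNNPACKQuantizer().set_global(get_symmetric_quantization_config())
m = prepare_pt2e(m, quantizer)

# Step 3: calibrate with representative data
m(*example_inputs)

# Step 4: convert to the reference quantized model
m = convert_pt2e(m)
```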
@@ -111,7 +111,7 @@ class DerivedQuantizationSpec(QuantizationSpecBase):

 @dataclass
 class QuantizationAnnotation:
-"""How are input arguemnt or output should be quantized,
+"""How an input argument or output should be quantized,
 expressed as QuantizationSpec, this corresponds to how a Tensor in the
 operator Graph is observed (PTQ) or fake quantized (QAT)
 """
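For context, a sketch of how a quantizer typically attaches this annotation to an FX node; the observer choice and qparams are illustrative assumptions:

```python
import torch
from torch.ao.quantization.observer import MinMaxObserver
from torch.ao.quantization.quantizer import (
    QuantizationAnnotation,
    QuantizationSpec,
)

act_qspec = QuantizationSpec(
    dtype=torch.int8,
    quant_min=-128,
    quant_max=127,
    qscheme=torch.per_tensor_affine,
    observer_or_fake_quant_ctr=MinMaxObserver,
)

def annotate_node(node) -> None:
    # how each input and the output of `node` should be observed (PTQ)
    # or fake quantized (QAT)
    node.meta["quantization_annotation"] = QuantizationAnnotation(
        input_qspec_map={node.args[0]: act_qspec},
        output_qspec=act_qspec,
        _annotated=True,
    )
```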
@@ -28,7 +28,7 @@ def _node_only_used_for_sym_size(node: Node, partition_nodes: list[Node]):
 This utility is used to handle cases when dynamic_shape=True tracing leads
 to symint nodes in the pattern of linear module. In those cases, we need to
 distinguish between the nodes that are in input for just extracting value of
-some dimentions (and symint nodes) vs. the one that is activation.
+some dimensions (and symint nodes) vs. the one that is activation.
 For example:
 graph(x, y, weight):
 size_0 = torch.ops.aten.sym_size([x], [0])
@@ -245,7 +245,7 @@ def _get_not_module_type_or_name_filter(
 class XNNPACKQuantizer(Quantizer):
 """
 !!! DEPRECATED !!!
-XNNPACKQuantizer is a marked as deprected. It will be removed in the future.
+XNNPACKQuantizer is marked as deprecated. It will be removed in the future.
 It has been moved to executorch.backends.xnnpack.quantizer.xnnpack_quantizer.XNNPACKQuantizer.
 Please use the new quantizer instead.
 """
@@ -422,7 +422,7 @@ def _annotate_conv_bn(
 filter_fn: Optional[Callable[[Node], bool]] = None,
 ) -> Optional[list[list[Node]]]:
 """
-Find conv + batchnorm parititions
+Find conv + batchnorm partitions
 Note: This is only used for QAT. In PTQ, batchnorm should already be fused into the conv.
 """
 return _do_annotate_conv_bn(gm, quantization_config, filter_fn, has_relu=False)
|
@ -435,7 +435,7 @@ def _annotate_conv_bn_relu(
|
|||
filter_fn: Optional[Callable[[Node], bool]] = None,
|
||||
) -> Optional[list[list[Node]]]:
|
||||
"""
|
||||
Find conv + batchnorm + relu parititions
|
||||
Find conv + batchnorm + relu partitions
|
||||
Note: This is only used for QAT. In PTQ, batchnorm should already be fused into the conv.
|
||||
"""
|
||||
return _do_annotate_conv_bn(gm, quantization_config, filter_fn, has_relu=True)
|
||||
|
|
@ -448,7 +448,7 @@ def _annotate_conv_transpose_bn(
|
|||
filter_fn: Optional[Callable[[Node], bool]] = None,
|
||||
) -> Optional[list[list[Node]]]:
|
||||
"""
|
||||
Find conv_transpose + batchnorm parititions
|
||||
Find conv_transpose + batchnorm partitions
|
||||
Note: This is only used for QAT. In PTQ, batchnorm should already be fused into the conv.
|
||||
"""
|
||||
return _do_annotate_conv_bn(
|
||||
|
|
@ -463,7 +463,7 @@ def _annotate_conv_transpose_bn_relu(
|
|||
filter_fn: Optional[Callable[[Node], bool]] = None,
|
||||
) -> Optional[list[list[Node]]]:
|
||||
"""
|
||||
Find conv_transpose + batchnorm + relu parititions
|
||||
Find conv_transpose + batchnorm + relu partitions
|
||||
Note: This is only used for QAT. In PTQ, batchnorm should already be fused into the conv.
|
||||
"""
|
||||
return _do_annotate_conv_bn(
|
||||
|
|
|
|||
|
|
@@ -85,7 +85,7 @@ class XPUInductorQuantizer(X86InductorQuantizer):
 overrides. We keep the annotate methods but make the function
 body empty, aiming to let `_generate_qdq_quantized_model`
 generate qdq around op and graph execute on fp32 dtype for
-unspported operators.
+unsupported operators.
 """

 def _annotate_qat_conv2d_fusion_pattern(
@@ -616,7 +616,7 @@ def validate_qmin_qmax(quant_min: int, quant_max: int) -> None:

 # Functionally equivalent to '_calculate_qparams' in observer.py. Observers must be torchscriptable however and qscheme
 # as far as I can tell is not allowed to be passed as a parameter in torchscript functions. This makes refactoring observer
-# to use this utility a massive pain and very gross. For now Im opting just to duplicate as this code seems unlikey to change
+# to use this utility a massive pain and very gross. For now Im opting just to duplicate as this code seems unlikely to change
 # (last update over 1 year ago) and when torchscript is fully deprecated we can refactor. TODO(jakeszwe, jerryzh168)
 def determine_qparams(
 min_val: torch.Tensor,