diff --git a/test/dynamo/test_minifier.py b/test/dynamo/test_minifier.py
index 2af7e6acb91..1b7c460c707 100644
--- a/test/dynamo/test_minifier.py
+++ b/test/dynamo/test_minifier.py
@@ -119,7 +119,7 @@ inner(torch.randn(20, 20, requires_grad=True) + 1)
         backend_name = "relu_compile_error_TESTING_ONLY"
         run_code = f"""\
 class CpuCudaModule(torch.nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.m_x = torch.nn.Linear(20, 20).cuda()
         self.m_y = torch.nn.Linear(20, 20)
@@ -149,7 +149,7 @@ inner(torch.randn(20, 20).cuda(), torch.randn(20, 20))
             res.minifier_module(),
             """\
 class Repro(torch.nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.G__mod___m_x = Linear(in_features=20, out_features=20, bias=True).cuda()
         self.G__mod___m_y = Linear(in_features=20, out_features=20, bias=True)
@@ -204,7 +204,7 @@ inner(torch.randn(20, 20))
             res.repro_module(),
             """\
 class Repro(torch.nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
 
     def forward(self, x_19):
diff --git a/test/inductor/test_minifier.py b/test/inductor/test_minifier.py
index d7e8e530648..45d4a79decf 100644
--- a/test/inductor/test_minifier.py
+++ b/test/inductor/test_minifier.py
@@ -122,7 +122,7 @@ inner(torch.randn(20))
             res.repro_module(),
             """\
 class Repro(torch.nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
 
     def forward(self, arg0_1):
@@ -138,7 +138,7 @@ class Repro(torch.nn.Module):
             res.repro_module(),
             """\
 class Repro(torch.nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
 
     def forward(self, arg0_1):
diff --git a/torch/_classes.py b/torch/_classes.py
index 58b34745352..069f13dcb67 100644
--- a/torch/_classes.py
+++ b/torch/_classes.py
@@ -19,7 +19,7 @@ class _ClassNamespace(types.ModuleType):
 class _Classes(types.ModuleType):
     __file__ = "_classes.py"
 
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__("torch.classes")
 
     def __getattr__(self, name):
diff --git a/torch/_decomp/decompositions_for_rng.py b/torch/_decomp/decompositions_for_rng.py
index 66bd33075a5..a62a28f783b 100644
--- a/torch/_decomp/decompositions_for_rng.py
+++ b/torch/_decomp/decompositions_for_rng.py
@@ -71,7 +71,7 @@ class PhiloxState:
     trace time.
     """
 
-    def __init__(self):
+    def __init__(self) -> None:
         self.reset()
 
     def reset(self):
diff --git a/torch/_dynamo/backends/distributed.py b/torch/_dynamo/backends/distributed.py
index a58571c77c9..8d65f1670ae 100644
--- a/torch/_dynamo/backends/distributed.py
+++ b/torch/_dynamo/backends/distributed.py
@@ -247,7 +247,7 @@ class SubmodCompiler(torch.fx.interpreter.Interpreter):
             # This gives us the appropriately strided outputs here which will reflect runtime strides.
 
             class FakeifyFirstAOTInvocationGuard:
-                def __init__(self):
+                def __init__(self) -> None:
                     self.tc = torch._guards.TracingContext.try_get()
                     assert self.tc
                     torch._guards.TracingContext.try_get().fakify_first_call = True
diff --git a/torch/_dynamo/code_context.py b/torch/_dynamo/code_context.py
index 59c912bd30f..727aad93495 100644
--- a/torch/_dynamo/code_context.py
+++ b/torch/_dynamo/code_context.py
@@ -5,7 +5,7 @@ from .utils import ExactWeakKeyDictionary
 
 
 class CodeContextDict:
-    def __init__(self):
+    def __init__(self) -> None:
         self.code_context = ExactWeakKeyDictionary()
 
     def has_context(self, code: types.CodeType):
diff --git a/torch/_dynamo/debug_utils.py b/torch/_dynamo/debug_utils.py
index 5e9656f2068..49d9b302fae 100644
--- a/torch/_dynamo/debug_utils.py
+++ b/torch/_dynamo/debug_utils.py
@@ -170,7 +170,7 @@ class NNModuleToString:
             """
             from torch.nn import *
             class Repro(torch.nn.Module):
-                def __init__(self):
+                def __init__(self) -> None:
                     super().__init__()
             """
         )
@@ -491,7 +491,7 @@ _is_leaf_or_default = _mk_defaulter(False)
 
 
 class NopInputReader:
-    def __init__(self):
+    def __init__(self) -> None:
         self.total = 0
 
     def storage(self, storage_hash, nbytes, *, device=None, dtype_hint=None):
diff --git a/torch/_dynamo/eval_frame.py b/torch/_dynamo/eval_frame.py
index 2f7f1c243f4..797c64eaf97 100644
--- a/torch/_dynamo/eval_frame.py
+++ b/torch/_dynamo/eval_frame.py
@@ -497,7 +497,7 @@ class _TorchDynamoContext:
                         wrapper function.
 
                         >> class CallableClass:
-                        >>     def __init__(self):
+                        >>     def __init__(self) -> None:
                         >>         super().__init__()
                         >>         self.relu = torch.nn.ReLU()
                         >>
@@ -578,7 +578,7 @@ class OptimizeContext(_TorchDynamoContext):
 
 
 class RunOnlyContext(_TorchDynamoContext):
-    def __init__(self):
+    def __init__(self) -> None:
         # cudagraph trees relies on generation increment
         def on_enter():
             torch._dynamo.mutation_guard.GenerationTracker.generation += 1
@@ -590,7 +590,7 @@ class RunOnlyContext(_TorchDynamoContext):
 
 
 class DisableContext(_TorchDynamoContext):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__(callback=None)
 
     def __call__(self, fn):
diff --git a/torch/_dynamo/exc.py b/torch/_dynamo/exc.py
index 2ca862c0087..5a0915a9727 100644
--- a/torch/_dynamo/exc.py
+++ b/torch/_dynamo/exc.py
@@ -74,7 +74,7 @@ class InvalidBackend(TorchDynamoException):
 
 
 class ResetRequired(TorchDynamoException):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__(
             textwrap.dedent(
                 """
diff --git a/torch/_dynamo/profiler.py b/torch/_dynamo/profiler.py
index b7e9553ce21..841ab87cdf6 100644
--- a/torch/_dynamo/profiler.py
+++ b/torch/_dynamo/profiler.py
@@ -92,7 +92,7 @@ def print_missing(stack):
 class Profiler:
     unique_graphs = 0
 
-    def __init__(self):
+    def __init__(self) -> None:
         self.prof = torch.profiler.profile(
             activities=[torch.profiler.ProfilerActivity.CPU],
             with_stack=should_print_missing(),
diff --git a/torch/_dynamo/variables/base.py b/torch/_dynamo/variables/base.py
index 09752822dd8..5353327d98f 100644
--- a/torch/_dynamo/variables/base.py
+++ b/torch/_dynamo/variables/base.py
@@ -70,7 +70,7 @@ class MutableLocal(MutableLocalBase):
     state.
     """
 
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__(MutableLocalSource.Local)
 
     def __hash__(self):
diff --git a/torch/_dynamo/variables/builder.py b/torch/_dynamo/variables/builder.py
index b1bb7b515dc..7f22f787d52 100644
--- a/torch/_dynamo/variables/builder.py
+++ b/torch/_dynamo/variables/builder.py
@@ -274,7 +274,7 @@ class GraphArg:
 
 
 class BackwardStateGraphArg(GraphArg):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__(
             source=None,
             _example=BackwardState(),
@@ -2646,7 +2646,7 @@ class SourcelessBuilder:
     if/else type->VariableTracker trees that were cropping up all over dynamo.
     """
 
-    def __init__(self):
+    def __init__(self) -> None:
         raise AssertionError("Use SourcelessBuilder.create()")
 
     @staticmethod
diff --git a/torch/_export/db/examples/class_method.py b/torch/_export/db/examples/class_method.py
index 5d7f8b5b705..f701f54d4f4 100644
--- a/torch/_export/db/examples/class_method.py
+++ b/torch/_export/db/examples/class_method.py
@@ -10,7 +10,7 @@ class ClassMethod(torch.nn.Module):
     def method(cls, x):
         return x + 1
 
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.linear = torch.nn.Linear(4, 2)
 
diff --git a/torch/_export/db/examples/cond_branch_class_method.py b/torch/_export/db/examples/cond_branch_class_method.py
index 9ce4a9d6f34..22600cc5043 100644
--- a/torch/_export/db/examples/cond_branch_class_method.py
+++ b/torch/_export/db/examples/cond_branch_class_method.py
@@ -26,7 +26,7 @@ class CondBranchClassMethod(torch.nn.Module):
     NOTE: If the `pred` is test on a dim with batch size < 2, it will be specialized.
     """
 
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.subm = MySubModule()
 
diff --git a/torch/_export/db/examples/model_attr_mutation.py b/torch/_export/db/examples/model_attr_mutation.py
index dfebbebd8b1..4aa623c7dc3 100644
--- a/torch/_export/db/examples/model_attr_mutation.py
+++ b/torch/_export/db/examples/model_attr_mutation.py
@@ -8,7 +8,7 @@ class ModelAttrMutation(torch.nn.Module):
     Attribute mutation is not supported.
     """
 
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.attr_list = [torch.randn(3, 2), torch.randn(3, 2)]
 
diff --git a/torch/_export/db/examples/scalar_output.py b/torch/_export/db/examples/scalar_output.py
index 83dd3637967..86d3b464533 100644
--- a/torch/_export/db/examples/scalar_output.py
+++ b/torch/_export/db/examples/scalar_output.py
@@ -11,7 +11,7 @@ class ScalarOutput(torch.nn.Module):
     Returning scalar values from the graph is supported, in addition to Tensor
     outputs. Symbolic shapes are captured and rank is specialized.
     """
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
 
     def forward(self, x):
diff --git a/torch/_export/db/examples/specialized_attribute.py b/torch/_export/db/examples/specialized_attribute.py
index 39f7314bec7..f17092f9afc 100644
--- a/torch/_export/db/examples/specialized_attribute.py
+++ b/torch/_export/db/examples/specialized_attribute.py
@@ -11,7 +11,7 @@ class SpecializedAttribute(torch.nn.Module):
     Model attributes are specialized.
     """
 
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.a = "moo"
         self.b = 4
diff --git a/torch/_export/passes/lift_constants_pass.py b/torch/_export/passes/lift_constants_pass.py
index 823c66d2bc0..08d93287d32 100644
--- a/torch/_export/passes/lift_constants_pass.py
+++ b/torch/_export/passes/lift_constants_pass.py
@@ -24,7 +24,7 @@ class ConstantAttrMap(collections.abc.MutableMapping):
     if that's the case).
     """
 
-    def __init__(self):
+    def __init__(self) -> None:
         # Underlying dict that we use to implement this mapping.
         self._constant_attrs: Dict[
             Union[int, torch.Tensor, FakeScriptObject], List[Any]
diff --git a/torch/_export/serde/serialize.py b/torch/_export/serde/serialize.py
index ae0f6e39f23..28509b83416 100644
--- a/torch/_export/serde/serialize.py
+++ b/torch/_export/serde/serialize.py
@@ -1413,7 +1413,7 @@ class GraphModuleDeserializer(metaclass=Final):
         constants: Dict[str, Union[torch.Tensor, FakeScriptObject, torch.ScriptObject]]
         example_inputs: Optional[Tuple[Tuple[torch.Tensor, ...], Dict[str, Any]]]
 
-    def __init__(self):
+    def __init__(self) -> None:
         self.serialized_name_to_node: Dict[str, torch.fx.Node] = {}
         self.serialized_name_to_meta: Dict[str, MetaType] = {}
         self.graph = torch.fx.Graph()
diff --git a/torch/_functorch/_aot_autograd/schemas.py b/torch/_functorch/_aot_autograd/schemas.py
index 77f1cf22014..9b1b40b4830 100644
--- a/torch/_functorch/_aot_autograd/schemas.py
+++ b/torch/_functorch/_aot_autograd/schemas.py
@@ -602,7 +602,7 @@ class SubclassMeta:
     # Optional field because we don't compute for inference graphs
     grad_input_metas: Optional[List[Union[int, SubclassCreationMeta]]] = None
 
-    def __init__(self):
+    def __init__(self) -> None:
         # The fields in this class get set after its construction.
         pass
 
diff --git a/torch/_functorch/aot_autograd.py b/torch/_functorch/aot_autograd.py
index b7bd95a3ed4..e9fedb3d53c 100644
--- a/torch/_functorch/aot_autograd.py
+++ b/torch/_functorch/aot_autograd.py
@@ -878,7 +878,7 @@ def aot_module(mod: nn.Module, *args, **kwargs) -> nn.Module:
     )
 
     class AOTModule(nn.Module):
-        def __init__(self):
+        def __init__(self) -> None:
             super().__init__()
             self.orig_module = mod
 
diff --git a/torch/_functorch/autograd_function.py b/torch/_functorch/autograd_function.py
index f80b7dee55b..270c1895f6f 100644
--- a/torch/_functorch/autograd_function.py
+++ b/torch/_functorch/autograd_function.py
@@ -30,7 +30,7 @@ from torch.autograd.forward_ad import _set_fwd_grad_enabled
 # We do this by using creating a custom HigherOrderOperator that only functorch
 # dispatches specially.
 class CustomFunctionHigherOrderOperator(HigherOrderOperator):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__("custom_function_call")
 
     def __call__(self, autograd_function, *args, **kwargs):
@@ -713,7 +713,7 @@ def autograd_function_forward_rewritten(original_forward, original_setup_context
 
 
 class AutogradFunctionApply(HigherOrderOperator):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__("autograd_function_apply")
 
     def __call__(self, fwd, bwd, *fwd_args, **fwd_kwargs):
diff --git a/torch/_guards.py b/torch/_guards.py
index 3465f6e62ee..3fa9b57d300 100644
--- a/torch/_guards.py
+++ b/torch/_guards.py
@@ -427,7 +427,7 @@ class ModuleContextCheckpointState:
 
 
 class ModuleContext(Checkpointable[ModuleContextCheckpointState]):
-    def __init__(self):
+    def __init__(self) -> None:
         self.nn_modules: Dict[str, Any] = {}
 
     def copy_graphstate(self):
@@ -476,7 +476,7 @@ class GlobalContext(Checkpointable[GlobalContextCheckpointState]):
         "autocast_cache_enabled",
     }
 
-    def __init__(self):
+    def __init__(self) -> None:
         self.global_state: Dict[str, Tuple[Callable, ...]] = {}
 
     def copy_graphstate(self):
@@ -544,7 +544,7 @@ class GuardsSet:
 
 
 class GuardsContext(Checkpointable[GuardsCheckpointState]):
-    def __init__(self):
+    def __init__(self) -> None:
         self.dynamo_guards: GuardsSet = GuardsSet()
         self.aotautograd_guards: List[GuardEnvExpr] = []
 
diff --git a/torch/_higher_order_ops/auto_functionalize.py b/torch/_higher_order_ops/auto_functionalize.py
index 40178fa750f..00f43e6acde 100644
--- a/torch/_higher_order_ops/auto_functionalize.py
+++ b/torch/_higher_order_ops/auto_functionalize.py
@@ -54,7 +54,7 @@ class AutoFunctionalized(HigherOrderOperator):
     underscore is to prevent collisions with kwarg names in **kwargs.
     """
 
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__("auto_functionalized")
 
     def __call__(
diff --git a/torch/_higher_order_ops/effects.py b/torch/_higher_order_ops/effects.py
index f20c87c7e58..3bba77a5cfc 100644
--- a/torch/_higher_order_ops/effects.py
+++ b/torch/_higher_order_ops/effects.py
@@ -55,7 +55,7 @@ class WithEffects(HigherOrderOperator):
     per "effect type", which are enumerated in the _EffectType enum.
     """
 
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__("with_effects")
 
     def __call__(
diff --git a/torch/_higher_order_ops/flex_attention.py b/torch/_higher_order_ops/flex_attention.py
index e3c9d718b2f..992c7398b2a 100644
--- a/torch/_higher_order_ops/flex_attention.py
+++ b/torch/_higher_order_ops/flex_attention.py
@@ -38,7 +38,7 @@ class TransformGetItemToIndex(TorchFunctionMode):
 
 
 class FlexAttentionHOP(HigherOrderOperator):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__("flex_attention")
 
     def __call__(
@@ -74,7 +74,7 @@ flex_attention.__module__ = "torch.ops.higher_order"
 
 
 class FlexAttentionBackwardHOP(HigherOrderOperator):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__("flex_attention_backward")
 
     def __call__(
diff --git a/torch/_higher_order_ops/out_dtype.py b/torch/_higher_order_ops/out_dtype.py
index 5c9ca4f3f16..d1557909427 100644
--- a/torch/_higher_order_ops/out_dtype.py
+++ b/torch/_higher_order_ops/out_dtype.py
@@ -45,7 +45,7 @@ class OutDtypeOperator(HigherOrderOperator):
         3. Cast the output to `out_dtype`
     """
 
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__("out_dtype")
         # TODO(ydwu4): Subclassing HigherOrderOperator causes __module__ to
         # become different (torch._higher_order_ops.out_dtype) which will result
diff --git a/torch/_higher_order_ops/triton_kernel_wrap.py b/torch/_higher_order_ops/triton_kernel_wrap.py
index 779ab2838b3..ff01b0c0124 100644
--- a/torch/_higher_order_ops/triton_kernel_wrap.py
+++ b/torch/_higher_order_ops/triton_kernel_wrap.py
@@ -519,7 +519,7 @@ def identify_mutated_tensors(kernel, kwargs):
 
 # Used for wrapping a Triton Kernel
 class TritonKernelWrapperMutation(HigherOrderOperator):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__("triton_kernel_wrapper_mutation")
 
 
@@ -528,7 +528,7 @@ triton_kernel_wrapper_mutation = TritonKernelWrapperMutation()
 
 # Used for wrapping a Triton Kernel in a functional manner
 class TritonKernelWrapperFunctional(HigherOrderOperator):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__("triton_kernel_wrapper_functional")
 
 
diff --git a/torch/_higher_order_ops/while_loop.py b/torch/_higher_order_ops/while_loop.py
index 4924e1f3d44..e19fa162105 100644
--- a/torch/_higher_order_ops/while_loop.py
+++ b/torch/_higher_order_ops/while_loop.py
@@ -18,7 +18,7 @@ from torch.fx.experimental.proxy_tensor import ProxyTorchDispatchMode, track_ten
 
 
 class WhileLoopOp(HigherOrderOperator):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__("while_loop")
 
     def __call__(
diff --git a/torch/_higher_order_ops/wrap.py b/torch/_higher_order_ops/wrap.py
index a26253405c4..d6faef20661 100644
--- a/torch/_higher_order_ops/wrap.py
+++ b/torch/_higher_order_ops/wrap.py
@@ -15,7 +15,7 @@ uid = itertools.count(1)
 
 # Used for testing the HigherOrderOperator mechanism
 class Wrap(HigherOrderOperator):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__("wrap")
 
     def __call__(self, func, *args, **kwargs):
@@ -36,7 +36,7 @@ wrap = Wrap()
 
 
 class WrapWithSetGradEnabled(HigherOrderOperator):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__("wrap_with_set_grad_enabled")
 
     def __call__(self, enable_grad, wrapped_func, *args, **kwargs):
@@ -74,7 +74,7 @@ class WrapActivationCheckpoint(HigherOrderOperator):
     partitioners. See TagActivationCheckpoint for more information.
     """
 
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__("wrap_activation_checkpoint")
 
     def __call__(self, function, *args, **kwargs):
@@ -113,7 +113,7 @@ class TagActivationCheckpoint(HigherOrderOperator):
     the forward and recomputed forward in backward.
     """
 
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__("tag_activation_checkpoint")
 
     @staticmethod
diff --git a/torch/_inductor/codegen/common.py b/torch/_inductor/codegen/common.py
index 9b4830159cf..b9159435d4a 100644
--- a/torch/_inductor/codegen/common.py
+++ b/torch/_inductor/codegen/common.py
@@ -1560,7 +1560,7 @@ class CSE:
 
 
 class CodeGen:
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.exit_stack = contextlib.ExitStack()
 
diff --git a/torch/_inductor/codegen/cpp_wrapper_cuda.py b/torch/_inductor/codegen/cpp_wrapper_cuda.py
index 8eed428de07..3def5af40af 100644
--- a/torch/_inductor/codegen/cpp_wrapper_cuda.py
+++ b/torch/_inductor/codegen/cpp_wrapper_cuda.py
@@ -29,7 +29,7 @@ class CppWrapperCuda(CppWrapperCpu):
     Generates cpp wrapper for running on GPU and calls CUDA kernels
     """
 
-    def __init__(self):
+    def __init__(self) -> None:
         self.device = "cuda"
         super().__init__()
         self.grid_id = count()
diff --git a/torch/_inductor/codegen/triton.py b/torch/_inductor/codegen/triton.py
index 711d749b650..3516ba3b0e6 100644
--- a/torch/_inductor/codegen/triton.py
+++ b/torch/_inductor/codegen/triton.py
@@ -1113,7 +1113,7 @@ class HelperFunctions:
     _templates_seen: Dict[str, str]  # Template code to function name
     finalized_helpers: List[str]
 
-    def __init__(self):
+    def __init__(self) -> None:
         self._templates_seen = {}
         self.finalized_helpers = []
 
diff --git a/torch/_inductor/dependencies.py b/torch/_inductor/dependencies.py
index 335f437af08..36dedd878f8 100644
--- a/torch/_inductor/dependencies.py
+++ b/torch/_inductor/dependencies.py
@@ -589,7 +589,7 @@ def canonicalization_prefix():
 class FreeUnbackedSymbolsOpsHandler:
     symbols: OrderedSet[sympy.Symbol]
 
-    def __init__(self):
+    def __init__(self) -> None:
         self.symbols = OrderedSet()
 
     def __getattr__(self, name: str) -> Callable[..., Any]:
diff --git a/torch/_inductor/exc.py b/torch/_inductor/exc.py
index 07c1eebf99b..2505c8a3119 100644
--- a/torch/_inductor/exc.py
+++ b/torch/_inductor/exc.py
@@ -65,7 +65,7 @@ class SubgraphLoweringException(RuntimeError):
 
 
 class InvalidCxxCompiler(RuntimeError):
-    def __init__(self):
+    def __init__(self) -> None:
         from . import config
 
         super().__init__(
diff --git a/torch/_inductor/fx_passes/misc_patterns.py b/torch/_inductor/fx_passes/misc_patterns.py
index d7873fede3c..0f608952a2f 100644
--- a/torch/_inductor/fx_passes/misc_patterns.py
+++ b/torch/_inductor/fx_passes/misc_patterns.py
@@ -79,7 +79,7 @@ class NumpyCompatNormalization:
     inverse_mapping: Dict[str, str]
     cache: Dict["torch.fx.graph.Target", Set[str]]
 
-    def __init__(self):
+    def __init__(self) -> None:
         self.cache = {}  # callable -> tuple of replaceable args e.g. ["axis"]
         self.inverse_mapping = {}
         for actual_kwarg, numpy_kwargs in self.numpy_compat.items():
diff --git a/torch/_inductor/fx_passes/mkldnn_fusion.py b/torch/_inductor/fx_passes/mkldnn_fusion.py
index 34ddbf90b7f..c930608c766 100644
--- a/torch/_inductor/fx_passes/mkldnn_fusion.py
+++ b/torch/_inductor/fx_passes/mkldnn_fusion.py
@@ -1207,7 +1207,7 @@ if torch._C._has_mkldnn:
         Combine packed weight nodes with the same inputs to reduce memory usage.
         for example:
         class Model(nn.Module):
-            def __init__(self):
+            def __init__(self) -> None:
                 super().__init__()
                 self.linear = nn.Linear(32, 32, bias=True)
 
diff --git a/torch/_inductor/metrics.py b/torch/_inductor/metrics.py
index 18e00b090ce..5c26e322f12 100644
--- a/torch/_inductor/metrics.py
+++ b/torch/_inductor/metrics.py
@@ -99,7 +99,7 @@ class CachedMetricsHelper:
     apply on a cache hit.
     """
 
-    def __init__(self):
+    def __init__(self) -> None:
         self.cached_metrics = {}
         for metric in get_metric_fields():
             self.cached_metrics[metric] = globals()[metric]
diff --git a/torch/_inductor/utils.py b/torch/_inductor/utils.py
index 380fbe515c3..d5475b8e14e 100644
--- a/torch/_inductor/utils.py
+++ b/torch/_inductor/utils.py
@@ -940,7 +940,7 @@ class IndentedBuffer:
 
 
 class FakeIndentedBuffer(IndentedBuffer):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
 
     def __getattribute__(self, name):
@@ -1219,7 +1219,7 @@ class DebugDirManager:
     counter = itertools.count(0)
     prev_debug_name: str
 
-    def __init__(self):
+    def __init__(self) -> None:
         self.id = next(DebugDirManager.counter)
 
     def __enter__(self):
@@ -1268,7 +1268,7 @@ def get_code(fn, *args, **kwargs):
         class DummyModule:
             """This is empty to replace the generated triton module"""
 
-            def __init__(self):
+            def __init__(self) -> None:
                 pass
 
             def call(self, *args, **kwargs):
diff --git a/torch/_lazy/closure.py b/torch/_lazy/closure.py
index 32b2c58ba2b..94c12c075a0 100644
--- a/torch/_lazy/closure.py
+++ b/torch/_lazy/closure.py
@@ -7,7 +7,7 @@ from torch._lazy.device_context import get_device_context
 
 
 class ClosureHandler:
-    def __init__(self):
+    def __init__(self) -> None:
         pass
 
     def run(self, closure):
diff --git a/torch/_library/fake_class_registry.py b/torch/_library/fake_class_registry.py
index a56f138f4b0..213e88ac3e5 100644
--- a/torch/_library/fake_class_registry.py
+++ b/torch/_library/fake_class_registry.py
@@ -42,7 +42,7 @@ class HasStaticMethodFromReal(Protocol):
 
 
 class FakeClassRegistry:
-    def __init__(self):
+    def __init__(self) -> None:
         self._registered_class: Dict[str, Any] = {}
 
     def has_impl(self, full_qualname: str) -> bool:
diff --git a/torch/_python_dispatcher.py b/torch/_python_dispatcher.py
index 644cf92fda2..2dfdbb296a4 100644
--- a/torch/_python_dispatcher.py
+++ b/torch/_python_dispatcher.py
@@ -70,7 +70,7 @@ class PythonDispatcher:
     ]
     supported_keys = runtime_keys + alias_keys
 
-    def __init__(self):
+    def __init__(self) -> None:
         C._dispatch_check_invariants(self.name)  # type: ignore[attr-defined]
         self.ref = C._dispatch_library("FRAGMENT", self.namespace, "")
         self.ref.def_("foo(Tensor x) -> Tensor")
diff --git a/torch/_subclasses/schema_check_mode.py b/torch/_subclasses/schema_check_mode.py
index d8843eec810..d7ad9ebd281 100644
--- a/torch/_subclasses/schema_check_mode.py
+++ b/torch/_subclasses/schema_check_mode.py
@@ -60,7 +60,7 @@ def clone_inputs(args):
 
 
 class SchemaCheckMode(TorchDispatchMode):
-    def __init__(self):
+    def __init__(self) -> None:
         # Information recorded for testing purposes. For example:
         #  - incorrect schemas
         #  - overly conservative schemas
diff --git a/torch/ao/nn/quantized/modules/functional_modules.py b/torch/ao/nn/quantized/modules/functional_modules.py
index b707a1f681c..45dc7fc0444 100644
--- a/torch/ao/nn/quantized/modules/functional_modules.py
+++ b/torch/ao/nn/quantized/modules/functional_modules.py
@@ -36,7 +36,7 @@ class FloatFunctional(torch.nn.Module):
         - mul_scalar
     """
 
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.activation_post_process = torch.nn.Identity()
 
@@ -190,7 +190,7 @@ class QFunctional(torch.nn.Module):
         - mul_scalar
     """
 
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.scale = 1.0
         self.zero_point = 0
diff --git a/torch/ao/ns/fx/qconfig_multi_mapping.py b/torch/ao/ns/fx/qconfig_multi_mapping.py
index a7c0f0a27f6..8cd4190110f 100644
--- a/torch/ao/ns/fx/qconfig_multi_mapping.py
+++ b/torch/ao/ns/fx/qconfig_multi_mapping.py
@@ -72,7 +72,7 @@ class QConfigMultiMapping:
 
     """
 
-    def __init__(self):
+    def __init__(self) -> None:
         # initialize this with 1 QConfigMapping to avoid corner cases
         self.qconfig_mappings_list: List[QConfigMapping] = [QConfigMapping()]
 
diff --git a/torch/ao/pruning/_experimental/pruner/README.md b/torch/ao/pruning/_experimental/pruner/README.md
index 026fd33b287..2885dff0402 100644
--- a/torch/ao/pruning/_experimental/pruner/README.md
+++ b/torch/ao/pruning/_experimental/pruner/README.md
@@ -99,7 +99,7 @@ from torch.ao.pruning._experimental.pruner import SaliencyPruner
 
 # Define model
 class Model(nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.seq = nn.Sequential(
             nn.Linear(700, 500, bias=True),
diff --git a/torch/ao/quantization/fake_quantize.py b/torch/ao/quantization/fake_quantize.py
index 57cc1df04d5..8ef266ebe47 100644
--- a/torch/ao/quantization/fake_quantize.py
+++ b/torch/ao/quantization/fake_quantize.py
@@ -85,7 +85,7 @@ class FakeQuantizeBase(ABC, Module):
     fake_quant_enabled: torch.Tensor
     observer_enabled: torch.Tensor
 
-    def __init__(self):
+    def __init__(self) -> None:
         """Set fake_quant_enabled and observer_enabled."""
         super().__init__()
         # fake_quant_enabled and observer_enabled are buffers to support their
diff --git a/torch/ao/quantization/fx/README.md b/torch/ao/quantization/fx/README.md
index a8bd154791b..ca116b282e7 100644
--- a/torch/ao/quantization/fx/README.md
+++ b/torch/ao/quantization/fx/README.md
@@ -70,7 +70,7 @@ In the following, I’ll first have a detailed description for each step, and th
 
 ```
 class LinearReLUModule(torch.nn.Module):
-   def __init__(self):
+   def __init__(self) -> None:
        super().__init__()
        self.linear = torch.nn.Linear(5, 10).float()
        self.relu = torch.nn.ReLU()
diff --git a/torch/ao/quantization/fx/_model_report/detector.py b/torch/ao/quantization/fx/_model_report/detector.py
index 534e73bfb0a..9db118a3365 100644
--- a/torch/ao/quantization/fx/_model_report/detector.py
+++ b/torch/ao/quantization/fx/_model_report/detector.py
@@ -137,7 +137,7 @@ class DetectorBase(ABC):
         - Should return a str-based report and dict info in Tuple[str,Dict] format
     """
 
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.detector_config_info = None
 
diff --git a/torch/ao/quantization/fx/custom_config.py b/torch/ao/quantization/fx/custom_config.py
index 7aa408f0ceb..cb00c95fdee 100644
--- a/torch/ao/quantization/fx/custom_config.py
+++ b/torch/ao/quantization/fx/custom_config.py
@@ -63,7 +63,7 @@ class PrepareCustomConfig:
             .set_preserved_attributes(["attr1", "attr2"])
     """
 
-    def __init__(self):
+    def __init__(self) -> None:
         self.standalone_module_names: Dict[str, StandaloneModuleConfigEntry] = {}
         self.standalone_module_classes: Dict[Type, StandaloneModuleConfigEntry] = {}
         self.float_to_observed_mapping: Dict[QuantType, Dict[Type, Type]] = {}
@@ -382,7 +382,7 @@ class ConvertCustomConfig:
             .set_preserved_attributes(["attr1", "attr2"])
     """
 
-    def __init__(self):
+    def __init__(self) -> None:
         self.observed_to_quantized_mapping: Dict[QuantType, Dict[Type, Type]] = {}
         self.preserved_attributes: List[str] = []
 
@@ -477,7 +477,7 @@ class FuseCustomConfig:
         fuse_custom_config = FuseCustomConfig().set_preserved_attributes(["attr1", "attr2"])
     """
 
-    def __init__(self):
+    def __init__(self) -> None:
         self.preserved_attributes: List[str] = []
 
     def __repr__(self):
diff --git a/torch/ao/quantization/observer.py b/torch/ao/quantization/observer.py
index 64b14b50614..e26f0302711 100644
--- a/torch/ao/quantization/observer.py
+++ b/torch/ao/quantization/observer.py
@@ -1568,7 +1568,7 @@ class ReuseInputObserver(ObserverBase):
     Note: this is only enabled in FX Graph Mode Quantization
     """
 
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__(torch.quint8, is_dynamic=False)
 
     def forward(self, x):
diff --git a/torch/ao/quantization/qconfig_mapping.py b/torch/ao/quantization/qconfig_mapping.py
index 1b4d9cecbf3..2c12be74ce6 100644
--- a/torch/ao/quantization/qconfig_mapping.py
+++ b/torch/ao/quantization/qconfig_mapping.py
@@ -229,7 +229,7 @@ class QConfigMapping:
 
     """
 
-    def __init__(self):
+    def __init__(self) -> None:
         # In increasing match priority:
         self.global_qconfig: QConfigAny = None
         self.object_type_qconfigs: OrderedDict[
diff --git a/torch/ao/quantization/quantize_fx.py b/torch/ao/quantization/quantize_fx.py
index f5949d985f9..dd8f3e811a3 100644
--- a/torch/ao/quantization/quantize_fx.py
+++ b/torch/ao/quantization/quantize_fx.py
@@ -289,7 +289,7 @@ def prepare_fx(
         from torch.ao.quantization.quantize_fx import prepare_fx
 
         class Submodule(torch.nn.Module):
-            def __init__(self):
+            def __init__(self) -> None:
                 super().__init__()
                 self.linear = torch.nn.Linear(5, 5)
             def forward(self, x):
@@ -297,7 +297,7 @@ def prepare_fx(
                 return x
 
         class M(torch.nn.Module):
-            def __init__(self):
+            def __init__(self) -> None:
                 super().__init__()
                 self.linear = torch.nn.Linear(5, 5)
                 self.sub = Submodule()
@@ -427,7 +427,7 @@ def prepare_qat_fx(
         from torch.ao.quantization.quantize_fx import prepare_qat_fx
 
         class Submodule(torch.nn.Module):
-            def __init__(self):
+            def __init__(self) -> None:
                 super().__init__()
                 self.linear = torch.nn.Linear(5, 5)
             def forward(self, x):
@@ -435,7 +435,7 @@ def prepare_qat_fx(
                 return x
 
         class M(torch.nn.Module):
-            def __init__(self):
+            def __init__(self) -> None:
                 super().__init__()
                 self.linear = torch.nn.Linear(5, 5)
                 self.sub = Submodule()
diff --git a/torch/ao/quantization/quantize_pt2e.py b/torch/ao/quantization/quantize_pt2e.py
index 41676934440..1e1848a6ff0 100644
--- a/torch/ao/quantization/quantize_pt2e.py
+++ b/torch/ao/quantization/quantize_pt2e.py
@@ -56,7 +56,7 @@ def prepare_pt2e(
         )
 
         class M(torch.nn.Module):
-            def __init__(self):
+            def __init__(self) -> None:
                 super().__init__()
                 self.linear = torch.nn.Linear(5, 10)
 
@@ -129,7 +129,7 @@ def prepare_qat_pt2e(
         )
 
         class M(torch.nn.Module):
-            def __init__(self):
+            def __init__(self) -> None:
                 super().__init__()
                 self.linear = torch.nn.Linear(5, 10)
 
diff --git a/torch/ao/quantization/quantizer/embedding_quantizer.py b/torch/ao/quantization/quantizer/embedding_quantizer.py
index 6c93c0b88a1..32ec3814637 100644
--- a/torch/ao/quantization/quantizer/embedding_quantizer.py
+++ b/torch/ao/quantization/quantizer/embedding_quantizer.py
@@ -42,7 +42,7 @@ def get_embedding_operators_config() -> OperatorConfig:
 
 
 class EmbeddingQuantizer(Quantizer):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
 
     @classmethod
diff --git a/torch/ao/quantization/quantizer/x86_inductor_quantizer.py b/torch/ao/quantization/quantizer/x86_inductor_quantizer.py
index 09db71a191b..574af30a715 100644
--- a/torch/ao/quantization/quantizer/x86_inductor_quantizer.py
+++ b/torch/ao/quantization/quantizer/x86_inductor_quantizer.py
@@ -436,7 +436,7 @@ class X86InductorQuantizer(Quantizer):
     supported_config_and_operators = _get_supported_config_and_operators()
     module_function_to_aten_operator_type = _map_module_function_to_aten_operator_type()
 
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.global_config: Optional[QuantizationConfig] = None
         self.operator_type_qconfig: Dict[
diff --git a/torch/ao/quantization/quantizer/xnnpack_quantizer.py b/torch/ao/quantization/quantizer/xnnpack_quantizer.py
index 93712ded503..cc17057c82a 100644
--- a/torch/ao/quantization/quantizer/xnnpack_quantizer.py
+++ b/torch/ao/quantization/quantizer/xnnpack_quantizer.py
@@ -268,7 +268,7 @@ class XNNPACKQuantizer(Quantizer):
         "linear",
     ]
 
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.global_config: Optional[QuantizationConfig] = None
         self.operator_type_config: Dict[
diff --git a/torch/ao/quantization/utils.py b/torch/ao/quantization/utils.py
index dad16df5b93..ff22da04a22 100644
--- a/torch/ao/quantization/utils.py
+++ b/torch/ao/quantization/utils.py
@@ -513,7 +513,7 @@ def _get_path_of_module(
     Example::
 
     >> class M(torch.nn.Module):
-           def __init__(self):
+           def __init__(self) -> None:
                self.linear = torch.nn.Linear(5, 5)
            def forward(self, x):
                return self.linear(x)
diff --git a/torch/autograd/profiler_util.py b/torch/autograd/profiler_util.py
index e3fc95580c5..67eb989f57c 100644
--- a/torch/autograd/profiler_util.py
+++ b/torch/autograd/profiler_util.py
@@ -645,7 +645,7 @@ class FunctionEvent(FormattedTimesMixin):
 class FunctionEventAvg(FormattedTimesMixin):
     """Used to average stats over multiple FunctionEvent objects."""
 
-    def __init__(self):
+    def __init__(self) -> None:
         self.key: Optional[str] = None
         self.count: int = 0
         self.node_id: int = 0
diff --git a/torch/backends/xeon/run_cpu.py b/torch/backends/xeon/run_cpu.py
index bdf07e28617..634c50da4db 100644
--- a/torch/backends/xeon/run_cpu.py
+++ b/torch/backends/xeon/run_cpu.py
@@ -266,7 +266,7 @@ class _Launcher:
 or /.local/lib/ or /usr/local/lib/ or /usr/local/lib64/ or /usr/lib or /usr/lib64 or \
 {expanduser('~')}/.local/lib/ so the LD_PRELOAD environment variable will not be set."
 
-    def __init__(self):
+    def __init__(self) -> None:
         self.cpuinfo = _CPUinfo()
 
     def add_lib_preload(self, lib_type):
diff --git a/torch/csrc/jit/backends/backend_debug_handler.h b/torch/csrc/jit/backends/backend_debug_handler.h
index d25ce2f8cb0..d4b00fe340f 100644
--- a/torch/csrc/jit/backends/backend_debug_handler.h
+++ b/torch/csrc/jit/backends/backend_debug_handler.h
@@ -77,17 +77,17 @@ namespace jit {
  *
  *  So why does debug handle map to DebugInfoTuple = {source range and inlined
  *  cs}? {debug_handle, source_range_tag, serialized_callstack} Take this
- *  example: class L(nn.Module): def __init__(self):
+ *  example: class L(nn.Module): def __init__(self) -> None:
  *      ...
  *    def forward(self, x):
  *      return x * 5
  *  class M(nn.Module):
- *    def __init__(self):
+ *    def __init__(self) -> None:
  *      ...
  *    def forward(self, x):
  *      return x - 2
  *  class N(nn.Module):
- *    def __init__(self):
+ *    def __init__(self) -> None:
  *      self.m = M()
  *    def forward(self, x):
  *      return self.m(x) + 3
diff --git a/torch/csrc/jit/docs/serialization.md b/torch/csrc/jit/docs/serialization.md
index 106cea55478..3fb463c7e7f 100644
--- a/torch/csrc/jit/docs/serialization.md
+++ b/torch/csrc/jit/docs/serialization.md
@@ -328,7 +328,7 @@ For example:
 
 ```
 class M(torch.nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         self.a = torch.rand(2, 3)
         self.b = torch.nn.Linear(10, 10)
 
diff --git a/torch/csrc/jit/operator_upgraders/README.md b/torch/csrc/jit/operator_upgraders/README.md
index 61679972073..ce995276d28 100644
--- a/torch/csrc/jit/operator_upgraders/README.md
+++ b/torch/csrc/jit/operator_upgraders/README.md
@@ -37,7 +37,7 @@ When making changes to the operators, the first thing to identify is if it's BC/
     1. Add a test module in `test/jit/fixtures_srcs/fixtures_src.py`. In `test/jit/fixtures_srcs/generate_models.py`,
   ```
   class TestVersionedLinspaceV7(torch.nn.Module):
-      def __init__(self):
+      def __init__(self) -> None:
           super().__init__()
 
       def forward(self, a: Union[int, float, complex], b: Union[int, float, complex]):
@@ -163,7 +163,7 @@ When making changes to the operators, the first thing to identify is if it's BC/
 
             # Step 2. Write down how current module should look like
             class MyModuleFloat(torch.nn.Module):
-                def __init__(self):
+                def __init__(self) -> None:
                     super().__init__()
 
                 def forward(self, a, b: float):
diff --git a/torch/csrc/jit/passes/onnx/function_extraction.h b/torch/csrc/jit/passes/onnx/function_extraction.h
index 3a90967e2f1..40555f8e356 100644
--- a/torch/csrc/jit/passes/onnx/function_extraction.h
+++ b/torch/csrc/jit/passes/onnx/function_extraction.h
@@ -25,7 +25,7 @@ namespace onnx {
 //
 // clang-format off
 // class M(torch.nn.Module):
-//     def __init__(self):
+//     def __init__(self) -> None:
 //         super().__init__()
 //         self.lns = torch.nn.ModuleList([torch.nn.LayerNorm(3, eps = i) for i in range(2)])
 //         self.celu1 = torch.nn.CELU(1.0)
diff --git a/torch/csrc/lazy/test_mnist.py b/torch/csrc/lazy/test_mnist.py
index a3a03d9844d..762620fcc62 100644
--- a/torch/csrc/lazy/test_mnist.py
+++ b/torch/csrc/lazy/test_mnist.py
@@ -17,7 +17,7 @@ torch._lazy.ts_backend.init()
 
 
 class Net(nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.conv1 = nn.Conv2d(1, 32, 3, 1)
         self.conv2 = nn.Conv2d(32, 64, 3, 1)
diff --git a/torch/csrc/lazy/tutorial.md b/torch/csrc/lazy/tutorial.md
index 155e8adfdd8..b72ae13eca7 100644
--- a/torch/csrc/lazy/tutorial.md
+++ b/torch/csrc/lazy/tutorial.md
@@ -135,7 +135,7 @@ Here's our model definition:
 
 ```python
 class Net(nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.conv1 = nn.Conv2d(1, 32, 3, 1)
         self.conv2 = nn.Conv2d(32, 64, 3, 1)
diff --git a/torch/cuda/_sanitizer.py b/torch/cuda/_sanitizer.py
index f9ce311725e..34cd7bacee0 100644
--- a/torch/cuda/_sanitizer.py
+++ b/torch/cuda/_sanitizer.py
@@ -163,7 +163,7 @@ class TensorInfo:
 
 
 class _TensorsAccessed:
-    def __init__(self):
+    def __init__(self) -> None:
         self.accesses: Dict[DataPtr, TensorInfo] = {}
 
     def ensure_tensor_exists(self, data_ptr: DataPtr) -> None:
@@ -218,7 +218,7 @@ class _TensorsAccessed:
 
 
 class StreamSynchronizations:
-    def __init__(self):
+    def __init__(self) -> None:
         self.current_sync_states: Dict[StreamId, Dict[StreamId, SeqNum]] = {}
         self.recorded_sync_states: Dict[EventId, Dict[StreamId, SeqNum]] = {}
         self.host_sync_state: Dict[StreamId, SeqNum] = {}
@@ -338,7 +338,7 @@ class EventHandler:
     data race.
     """
 
-    def __init__(self):
+    def __init__(self) -> None:
         self.tensors_accessed = _TensorsAccessed()
         self.syncs = StreamSynchronizations()
         self.seq_num: SeqNum = 0
@@ -478,7 +478,7 @@ def zip_arguments(
 
 
 class ArgumentHandler:
-    def __init__(self):
+    def __init__(self) -> None:
         self.dataptrs_read: Set[DataPtr] = set()
         self.dataptrs_written: Set[DataPtr] = set()
         self.tensor_aliases: Dict[DataPtr, List[str]] = {}
@@ -527,7 +527,7 @@ class ArgumentHandler:
 
 
 class CUDASanitizerDispatchMode(TorchDispatchMode):
-    def __init__(self):
+    def __init__(self) -> None:
         self.event_handler = EventHandler()
         torch._C._activate_gpu_trace()
         gpu_trace.register_callback_for_event_creation(
@@ -596,7 +596,7 @@ class CUDASanitizer:
     This approach was deemed more elegant than using the atexit module.
     """
 
-    def __init__(self):
+    def __init__(self) -> None:
         self.dispatch = CUDASanitizerDispatchMode()
         self.enabled = False
 
diff --git a/torch/distributed/_composable/checkpoint_activation.py b/torch/distributed/_composable/checkpoint_activation.py
index fcee2a57a07..88253abb4b9 100644
--- a/torch/distributed/_composable/checkpoint_activation.py
+++ b/torch/distributed/_composable/checkpoint_activation.py
@@ -49,7 +49,7 @@ def checkpoint(module: nn.Module, **kwargs) -> nn.Module:
         >>> import torch.nn as nn
         >>>
         >>> class MyModel(nn.Module):
-        >>>     def __init__(self):
+        >>>     def __init__(self) -> None:
         >>>         super().__init__()
         >>>         self.l1 = nn.Linear(10, 10)
         >>>         self.l2 = nn.Linear(10, 10)
diff --git a/torch/distributed/_composable/contract.py b/torch/distributed/_composable/contract.py
index 850659fc2c0..e7cd1713fae 100644
--- a/torch/distributed/_composable/contract.py
+++ b/torch/distributed/_composable/contract.py
@@ -47,7 +47,7 @@ def contract(state_cls: Type[_State] = _State):
         >>> import torch.nn as nn
         >>>
         >>> class MyModel(nn.Module):
-        >>>     def __init__(self):
+        >>>     def __init__(self) -> None:
         >>>         super().__init__()
         >>>         self.l1 = nn.Linear(10, 10)
         >>>         self.l2 = nn.Linear(10, 10)
diff --git a/torch/distributed/_composable/fsdp/_fsdp_state.py b/torch/distributed/_composable/fsdp/_fsdp_state.py
index 6f62c01600d..6dde573d848 100644
--- a/torch/distributed/_composable/fsdp/_fsdp_state.py
+++ b/torch/distributed/_composable/fsdp/_fsdp_state.py
@@ -43,7 +43,7 @@ logger = logging.getLogger("torch.distributed._composable.fsdp")
 class FSDPStateContext:
     """This has state shared across FSDP states."""
 
-    def __init__(self):
+    def __init__(self) -> None:
         # All FSDP states in the root state's module tree
         self.all_states: List[FSDPState] = []
         # Iteration's forward root runs the once-per-forward logic; this root
@@ -71,7 +71,7 @@ def disable_if_config_true(func):
 
 
 class FSDPState(_State):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self._fsdp_param_group: Optional[FSDPParamGroup] = None
         self._is_root: Optional[bool] = None  # root set during lazy init
diff --git a/torch/distributed/_shard/sharding_plan/api.py b/torch/distributed/_shard/sharding_plan/api.py
index a7552c5a68f..d141df1a521 100644
--- a/torch/distributed/_shard/sharding_plan/api.py
+++ b/torch/distributed/_shard/sharding_plan/api.py
@@ -38,7 +38,7 @@ class ShardingPlan:
 
         >>> # xdoctest: +REQUIRES(module:torch._C._distributed_c10d)
         >>> class MyModule(nn.Module):
-        >>>     def __init__(self):
+        >>>     def __init__(self) -> None:
         >>>        super().__init__()
         >>>        self.fc1 = nn.Linear()
         >>>        self.gelu = nn.GELU()
diff --git a/torch/distributed/_tensor/README.md b/torch/distributed/_tensor/README.md
index 80fcc2eb41f..2fedb7cc3b4 100644
--- a/torch/distributed/_tensor/README.md
+++ b/torch/distributed/_tensor/README.md
@@ -117,7 +117,7 @@ import torch.nn as nn
 from torch.distributed._tensor import Shard, distribute_tensor, distribute_module, init_device_mesh
 
 class MyModule(nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.fc1 = nn.Linear(8, 8)
         self.fc2 = nn.Linear(8, 8)
diff --git a/torch/distributed/_tensor/examples/checkpoint_example.py b/torch/distributed/_tensor/examples/checkpoint_example.py
index 1701e28ac2c..fe8585c2a23 100644
--- a/torch/distributed/_tensor/examples/checkpoint_example.py
+++ b/torch/distributed/_tensor/examples/checkpoint_example.py
@@ -25,7 +25,7 @@ from torch.distributed.tensor.parallel import ColwiseParallel, parallelize_modul
 
 
 class SimpleMLP(torch.nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.net1 = torch.nn.Linear(5, 128)
         self.relu = torch.nn.ReLU()
diff --git a/torch/distributed/algorithms/join.py b/torch/distributed/algorithms/join.py
index 14084485193..f7c95100b1b 100644
--- a/torch/distributed/algorithms/join.py
+++ b/torch/distributed/algorithms/join.py
@@ -55,7 +55,7 @@ class Joinable(ABC):
     """
 
     @abstractmethod
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self._join_config = _JoinConfig.construct_disabled_join_config()
 
diff --git a/torch/distributed/checkpoint/examples/async_checkpointing_example.py b/torch/distributed/checkpoint/examples/async_checkpointing_example.py
index 5eaba9a6722..589f9b93544 100644
--- a/torch/distributed/checkpoint/examples/async_checkpointing_example.py
+++ b/torch/distributed/checkpoint/examples/async_checkpointing_example.py
@@ -31,7 +31,7 @@ class InjectedException(Exception):
 
 
 class Model(torch.nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.net1 = nn.Linear(8, 32)
         self.net2 = nn.Linear(32, 128)
diff --git a/torch/distributed/checkpoint/examples/stateful_example.py b/torch/distributed/checkpoint/examples/stateful_example.py
index 6c76ec43636..f6e0d11801d 100644
--- a/torch/distributed/checkpoint/examples/stateful_example.py
+++ b/torch/distributed/checkpoint/examples/stateful_example.py
@@ -22,7 +22,7 @@ CHECKPOINT_DIR = f"~/{os.environ['LOGNAME']}/checkpoint"
 
 
 class Model(torch.nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         torch.manual_seed(0)
         self.net1 = nn.Sequential(nn.Linear(8, 16), nn.ReLU())
diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py
index 161eade7af6..a78ab550575 100644
--- a/torch/distributed/distributed_c10d.py
+++ b/torch/distributed/distributed_c10d.py
@@ -434,7 +434,7 @@ class _reduce_op:
     :class:`~torch.distributed.ReduceOp` is recommended to use instead.
     """
 
-    def __init__(self):
+    def __init__(self) -> None:
         # __members__ is a dict storing key-value pairs for enum classes
         for k, v in ReduceOp.RedOpType.__members__.items():
             setattr(self, k, v)
@@ -568,7 +568,7 @@ class _World:
        of c10d and is subject to change..
     """
 
-    def __init__(self):
+    def __init__(self) -> None:
         self._default_pg = None
         self._pg_coalesce_state: Dict[ProcessGroup, List[_CollOp]] = {}
         self._pg_default_device: Dict[ProcessGroup, torch.device] = {}
@@ -2194,7 +2194,7 @@ class _IllegalWork(Work):
 
 
 class _CoalescingManager:
-    def __init__(self):
+    def __init__(self) -> None:
         self.works: List[Work] = []
 
     def append(self, work: Work):
diff --git a/torch/distributed/fsdp/_common_utils.py b/torch/distributed/fsdp/_common_utils.py
index 10d0f821265..d722d5b9825 100644
--- a/torch/distributed/fsdp/_common_utils.py
+++ b/torch/distributed/fsdp/_common_utils.py
@@ -106,7 +106,7 @@ class _FSDPDeviceHandle:
 
 
 class _UninitializedDeviceHandle(_FSDPDeviceHandle):
-    def __init__(self):
+    def __init__(self) -> None:
         pass
 
     def __getattribute__(self, __name: str) -> Any:
diff --git a/torch/distributed/nn/api/remote_module.py b/torch/distributed/nn/api/remote_module.py
index 5583da8c3e8..4e18fe3245e 100644
--- a/torch/distributed/nn/api/remote_module.py
+++ b/torch/distributed/nn/api/remote_module.py
@@ -156,7 +156,7 @@ class _RemoteModule(nn.Module):
         created outside of remote modules, rather than as submodules of any remote module (by calling ``add_module``).
         Hybrid Example:
                 >>> class HybridModel(nn.Module):
-                >>>     def __init__(self):
+                >>>     def __init__(self) -> None:
                 >>>         nn.Module.__init__(self)
                 >>>         self.remote_embedding = RemoteModule(...)
                 >>>         self.local_linear = nn.Linear(...)
diff --git a/torch/export/graph_signature.py b/torch/export/graph_signature.py
index c36941ee02e..0d93957d77c 100644
--- a/torch/export/graph_signature.py
+++ b/torch/export/graph_signature.py
@@ -248,7 +248,7 @@ class ExportGraphSignature:
     e.g. If following module is exported::
 
         class CustomModule(nn.Module):
-            def __init__(self):
+            def __init__(self) -> None:
                 super(CustomModule, self).__init__()
 
                 # Define a parameter
diff --git a/torch/fx/README.md b/torch/fx/README.md
index a69a6ed1f65..4c799da7bc4 100644
--- a/torch/fx/README.md
+++ b/torch/fx/README.md
@@ -45,7 +45,7 @@ FX’s front-end makes use of the dynamic nature of Python to intercept call-sit
 import torch
 
 class MyModule(torch.nn.Module):
-  def __init__(self):
+  def __init__(self) -> None:
     super().__init__()
     self.param = torch.nn.Parameter(
         torch.rand(3, 4))
diff --git a/torch/fx/__init__.py b/torch/fx/__init__.py
index b9896390f12..dd04cdd09d7 100644
--- a/torch/fx/__init__.py
+++ b/torch/fx/__init__.py
@@ -9,7 +9,7 @@ demonstration of these components in action:
     import torch
     # Simple module for demonstration
     class MyModule(torch.nn.Module):
-        def __init__(self):
+        def __init__(self) -> None:
             super().__init__()
             self.param = torch.nn.Parameter(torch.rand(3, 4))
             self.linear = torch.nn.Linear(4, 5)
diff --git a/torch/fx/_symbolic_trace.py b/torch/fx/_symbolic_trace.py
index bd8d4f42669..92fb7b94948 100644
--- a/torch/fx/_symbolic_trace.py
+++ b/torch/fx/_symbolic_trace.py
@@ -1012,7 +1012,7 @@ class _PatchedFnSetAttr(_PatchedFn):
 
 
 class _Patcher:
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.patches_made: List[_PatchedFn] = []
         self.visited: Set[int] = set()
diff --git a/torch/fx/experimental/migrate_gradual_types/constraint.py b/torch/fx/experimental/migrate_gradual_types/constraint.py
index 45038837cae..4693a62de24 100644
--- a/torch/fx/experimental/migrate_gradual_types/constraint.py
+++ b/torch/fx/experimental/migrate_gradual_types/constraint.py
@@ -63,7 +63,7 @@ class T(Constraint):
     """
     True
     """
-    def __init__(self):
+    def __init__(self) -> None:
         pass
 
     def __eq__(self, other):
@@ -76,7 +76,7 @@ class F(Constraint):
     """
     False
     """
-    def __init__(self):
+    def __init__(self) -> None:
         pass
 
     def __eq__(self, other):
diff --git a/torch/fx/passes/graph_drawer.py b/torch/fx/passes/graph_drawer.py
index 726ab04539d..a577cf8736e 100644
--- a/torch/fx/passes/graph_drawer.py
+++ b/torch/fx/passes/graph_drawer.py
@@ -117,7 +117,7 @@ if HAS_PYDOT:
                 >>> # xdoctest: +REQUIRES(module:ubelt)
                 >>> # define module
                 >>> class MyModule(torch.nn.Module):
-                >>>     def __init__(self):
+                >>>     def __init__(self) -> None:
                 >>>         super().__init__()
                 >>>         self.linear = torch.nn.Linear(4, 5)
                 >>>     def forward(self, x):
diff --git a/torch/fx/passes/split_module.py b/torch/fx/passes/split_module.py
index fba516d74be..5984587f17c 100644
--- a/torch/fx/passes/split_module.py
+++ b/torch/fx/passes/split_module.py
@@ -83,7 +83,7 @@ def split_module(
             from torch.fx.passes.split_module import split_module
 
             class MyModule(torch.nn.Module):
-                def __init__(self):
+                def __init__(self) -> None:
                     super().__init__()
                     self.param = torch.nn.Parameter(torch.rand(3, 4))
                     self.linear = torch.nn.Linear(4, 5)
diff --git a/torch/fx/passes/split_utils.py b/torch/fx/passes/split_utils.py
index d8254bd474b..44b97471332 100644
--- a/torch/fx/passes/split_utils.py
+++ b/torch/fx/passes/split_utils.py
@@ -83,7 +83,7 @@ def split_by_tags(
     Given the following module def:
 
     class SimpleModule(torch.nn.Module):
-        def __init__(self):
+        def __init__(self) -> None:
             super().__init__()
             self.linear1 = torch.nn.Linear(...)
             self.linear2 = torch.nn.Linear(...)
diff --git a/torch/fx/proxy.py b/torch/fx/proxy.py
index 874ac51afff..05157f9ddb1 100644
--- a/torch/fx/proxy.py
+++ b/torch/fx/proxy.py
@@ -38,7 +38,7 @@ class Scope:
                 return x.transpose(1, 2)
 
         class M(torch.nn.Module):
-            def __init__(self):
+            def __init__(self) -> None:
                 self.sub = Sub()
 
             def forward(self, x):
diff --git a/torch/fx/subgraph_rewriter.py b/torch/fx/subgraph_rewriter.py
index 419337a1768..8a9e78c0af4 100644
--- a/torch/fx/subgraph_rewriter.py
+++ b/torch/fx/subgraph_rewriter.py
@@ -118,7 +118,7 @@ def replace_pattern(
         from torch.fx import symbolic_trace, subgraph_rewriter
 
         class M(torch.nn.Module):
-            def __init__(self):
+            def __init__(self) -> None:
                 super().__init__()
 
             def forward(self, x, w1, w2):
diff --git a/torch/fx/tensor_type.py b/torch/fx/tensor_type.py
index f59ed2d45ba..83b5a9f8faf 100644
--- a/torch/fx/tensor_type.py
+++ b/torch/fx/tensor_type.py
@@ -38,7 +38,7 @@ class _DynType:
     """
     _DynType defines a type which stands for the absence of type information.
     """
-    def __init__(self):
+    def __init__(self) -> None:
         self.__name__ = '_DynType'
 
     def __eq__(self, other):
diff --git a/torch/jit/__init__.py b/torch/jit/__init__.py
index 6d1760fb9f4..e80fa2932fc 100644
--- a/torch/jit/__init__.py
+++ b/torch/jit/__init__.py
@@ -219,7 +219,7 @@ def isinstance(obj, target_type):
         from typing import Any, Dict, List
 
         class MyModule(torch.nn.Module):
-            def __init__(self):
+            def __init__(self) -> None:
                 super().__init__()
 
             def forward(self, input: Any): # note the Any type
@@ -255,7 +255,7 @@ class strict_fusion:
 
     """
 
-    def __init__(self):
+    def __init__(self) -> None:
         if not torch._jit_internal.is_scripting():
             warnings.warn("Only works in script mode")
         pass
diff --git a/torch/jit/_async.py b/torch/jit/_async.py
index bdde55adf14..ceaef70b1fe 100644
--- a/torch/jit/_async.py
+++ b/torch/jit/_async.py
@@ -73,7 +73,7 @@ def fork(func, *args, **kwargs):
             def forward(self, a: Tensor, b : int):
                 return a + b
         class Mod(torch.nn.Module):
-            def __init__(self):
+            def __init__(self) -> None:
                 super(self).__init__()
                 self.mod = AddMod()
             def forward(self, input):
diff --git a/torch/jit/_check.py b/torch/jit/_check.py
index 8db5bb82ce3..f708ee87f30 100644
--- a/torch/jit/_check.py
+++ b/torch/jit/_check.py
@@ -39,7 +39,7 @@ class AttributeTypeIsSupportedChecker(ast.NodeVisitor):
                 def fn(self):
                     return []
 
-                def __init__(self):
+                def __init__(self) -> None:
                     super().__init__()
                     self.x: List[int] = []
 
diff --git a/torch/jit/_freeze.py b/torch/jit/_freeze.py
index 8f35fc471e6..e496bd74762 100644
--- a/torch/jit/_freeze.py
+++ b/torch/jit/_freeze.py
@@ -65,7 +65,7 @@ def freeze(
     .. testcode::
         import torch
         class MyModule2(torch.nn.Module):
-            def __init__(self):
+            def __init__(self) -> None:
                 super().__init__()
                 self.modified_tensor = torch.tensor(10.)
                 self.version = 1
diff --git a/torch/jit/_monkeytype_config.py b/torch/jit/_monkeytype_config.py
index ecf7cd865fd..366a58ac6af 100644
--- a/torch/jit/_monkeytype_config.py
+++ b/torch/jit/_monkeytype_config.py
@@ -89,7 +89,7 @@ if _IS_MONKEYTYPE_INSTALLED:
             self.traces.append(trace)
 
     class JitTypeTraceStore(CallTraceStore):
-        def __init__(self):
+        def __init__(self) -> None:
             super().__init__()
             # A dictionary keeping all collected CallTrace
             # key is fully qualified name of called function
@@ -159,15 +159,15 @@ else:
     # When MonkeyType is not installed, we provide dummy class definitions
     # for the below classes.
     class JitTypeTraceStoreLogger:  # type:  ignore[no-redef]
-        def __init__(self):
+        def __init__(self) -> None:
             pass
 
     class JitTypeTraceStore:  # type:  ignore[no-redef]
-        def __init__(self):
+        def __init__(self) -> None:
             self.trace_records = None
 
     class JitTypeTraceConfig:  # type:  ignore[no-redef]
-        def __init__(self):
+        def __init__(self) -> None:
             pass
 
     monkeytype_trace = None  # type: ignore[assignment]  # noqa: F811
diff --git a/torch/jit/_recursive.py b/torch/jit/_recursive.py
index b8dc0ecf2cd..e03540a7c75 100644
--- a/torch/jit/_recursive.py
+++ b/torch/jit/_recursive.py
@@ -426,7 +426,7 @@ class ConcreteTypeStore:
     type_store: Dict[Type[Module], List[torch._C.ConcreteModuleType]]
     methods_compiled: Set[torch._C.ConcreteModuleType]
 
-    def __init__(self):
+    def __init__(self) -> None:
         # Python module type => List[ConcreteModuleType)]
         self.type_store = {}
         # ConcreteTypes that have had their methods already compiled
diff --git a/torch/jit/_script.py b/torch/jit/_script.py
index 490e9e644e2..a7bc45fa5fc 100644
--- a/torch/jit/_script.py
+++ b/torch/jit/_script.py
@@ -107,7 +107,7 @@ Attribute.__doc__ = """
         from typing import Dict
 
         class AttributeModule(torch.jit.ScriptModule):
-            def __init__(self):
+            def __init__(self) -> None:
                 super().__init__()
                 self.foo = torch.jit.Attribute(0.1, float)
 
@@ -138,7 +138,7 @@ Attribute.__doc__ = """
         class AttributeModule(torch.nn.Module):
             names: Dict[str, int]
 
-            def __init__(self):
+            def __init__(self) -> None:
                 super().__init__()
                 self.names = {}
 
@@ -522,7 +522,7 @@ if _enabled:
             "original_name",
         ]
 
-        def __init__(self):
+        def __init__(self) -> None:
             super().__init__()
 
         forward: Callable[..., Any] = _CachedForward()  # type: ignore[assignment]
@@ -1351,7 +1351,7 @@ def script(
             import torch.nn.functional as F
 
             class MyModule(nn.Module):
-                def __init__(self):
+                def __init__(self) -> None:
                     super().__init__()
                     # torch.jit.trace produces a ScriptModule's conv1 and conv2
                     self.conv1 = torch.jit.trace(nn.Conv2d(1, 20, 5), torch.rand(1, 1, 16, 16))
@@ -1374,7 +1374,7 @@ def script(
             import torch.nn as nn
 
             class MyModule(nn.Module):
-                def __init__(self):
+                def __init__(self) -> None:
                     super().__init__()
 
                 @torch.jit.export
@@ -1547,7 +1547,7 @@ def interface(obj):
                 return x.relu()
 
         class Impl2(torch.nn.Module):
-            def __init__(self):
+            def __init__(self) -> None:
                 super().__init__()
                 self.val = torch.rand(())
 
@@ -1671,7 +1671,7 @@ class _ScriptProfileTable:
 
 
 class _ScriptProfile:
-    def __init__(self):
+    def __init__(self) -> None:
         self.profile = classes.profiling._ScriptProfile()
 
     def enable(self):
diff --git a/torch/jit/_state.py b/torch/jit/_state.py
index 63df2acfdf0..18456ebd386 100644
--- a/torch/jit/_state.py
+++ b/torch/jit/_state.py
@@ -19,7 +19,7 @@ class EnabledProxy:
     This is just a wrapper for a bool, so that we get reference semantics
     """
 
-    def __init__(self):
+    def __init__(self) -> None:
         self.enabled = self.parse_env(
             "PYTORCH_JIT", True, "> Using PyTorch JIT", "> PyTorch JIT DISABLED"
         )
diff --git a/torch/jit/_trace.py b/torch/jit/_trace.py
index 1c0372c7281..372fd72ddb2 100644
--- a/torch/jit/_trace.py
+++ b/torch/jit/_trace.py
@@ -966,7 +966,7 @@ def trace(
         import torch.nn as nn
 
         class Net(nn.Module):
-            def __init__(self):
+            def __init__(self) -> None:
                 super().__init__()
                 self.conv = nn.Conv2d(1, 1, 3)
 
@@ -1182,7 +1182,7 @@ def trace_module(
         import torch.nn as nn
 
         class Net(nn.Module):
-            def __init__(self):
+            def __init__(self) -> None:
                 super().__init__()
                 self.conv = nn.Conv2d(1, 1, 3)
 
diff --git a/torch/multiprocessing/reductions.py b/torch/multiprocessing/reductions.py
index 0a6d3c8a444..fa0818571a9 100644
--- a/torch/multiprocessing/reductions.py
+++ b/torch/multiprocessing/reductions.py
@@ -61,7 +61,7 @@ class StorageWeakRef:
 class SharedCache(dict):
     """Dictionary from multiprocessing handles to StorageWeakRef."""
 
-    def __init__(self):
+    def __init__(self) -> None:
         # free_dead_references() is called if the len exceeds the current
         # limit. The limit scales with the number of remaining live objects.
         self.limit = 128
diff --git a/torch/nn/modules/container.py b/torch/nn/modules/container.py
index 30992e394b0..585f4ef1658 100644
--- a/torch/nn/modules/container.py
+++ b/torch/nn/modules/container.py
@@ -291,7 +291,7 @@ class ModuleList(Module):
     Example::
 
         class MyModule(nn.Module):
-            def __init__(self):
+            def __init__(self) -> None:
                 super().__init__()
                 self.linears = nn.ModuleList([nn.Linear(10, 10) for i in range(10)])
 
@@ -465,7 +465,7 @@ class ModuleDict(Module):
     Example::
 
         class MyModule(nn.Module):
-            def __init__(self):
+            def __init__(self) -> None:
                 super().__init__()
                 self.choices = nn.ModuleDict({
                         'conv': nn.Conv2d(10, 10, 3),
@@ -597,7 +597,7 @@ class ParameterList(Module):
     Example::
 
         class MyModule(nn.Module):
-            def __init__(self):
+            def __init__(self) -> None:
                 super().__init__()
                 self.params = nn.ParameterList([nn.Parameter(torch.randn(10, 10)) for i in range(10)])
 
@@ -749,7 +749,7 @@ class ParameterDict(Module):
     Example::
 
         class MyModule(nn.Module):
-            def __init__(self):
+            def __init__(self) -> None:
                 super().__init__()
                 self.params = nn.ParameterDict({
                         'left': nn.Parameter(torch.randn(5, 10)),
diff --git a/torch/nn/modules/lazy.py b/torch/nn/modules/lazy.py
index 7a9a0161006..61cabd061ae 100644
--- a/torch/nn/modules/lazy.py
+++ b/torch/nn/modules/lazy.py
@@ -86,7 +86,7 @@ class LazyModuleMixin:
 
     >>> # xdoctest: +SKIP
     >>> class LazyMLP(torch.nn.Module):
-    ...    def __init__(self):
+    ...    def __init__(self) -> None:
     ...        super().__init__()
     ...        self.fc1 = torch.nn.LazyLinear(10)
     ...        self.relu1 = torch.nn.ReLU()
diff --git a/torch/nn/modules/module.py b/torch/nn/modules/module.py
index a6592655fd4..a15850553f1 100644
--- a/torch/nn/modules/module.py
+++ b/torch/nn/modules/module.py
@@ -408,7 +408,7 @@ class Module:
         import torch.nn.functional as F
 
         class Model(nn.Module):
-            def __init__(self):
+            def __init__(self) -> None:
                 super().__init__()
                 self.conv1 = nn.Conv2d(1, 20, 5)
                 self.conv2 = nn.Conv2d(20, 20, 5)
diff --git a/torch/onnx/_globals.py b/torch/onnx/_globals.py
index 22c05075dba..ebef6c331b2 100644
--- a/torch/onnx/_globals.py
+++ b/torch/onnx/_globals.py
@@ -20,7 +20,7 @@ class _InternalGlobals:
     global variables unless they are absolutely necessary.
     """
 
-    def __init__(self):
+    def __init__(self) -> None:
         self._export_onnx_opset_version = _constants.ONNX_DEFAULT_OPSET
         self._training_mode: _C_onnx.TrainingMode = _C_onnx.TrainingMode.EVAL
         self._in_onnx_export: bool = False
diff --git a/torch/onnx/_internal/exporter.py b/torch/onnx/_internal/exporter.py
index e53f906cd84..7c7203c8085 100644
--- a/torch/onnx/_internal/exporter.py
+++ b/torch/onnx/_internal/exporter.py
@@ -760,7 +760,7 @@ class ONNXProgram:
 
             >>> import pprint
             >>> class CustomModule(torch.nn.Module):
-            ...     def __init__(self):
+            ...     def __init__(self) -> None:
             ...         super().__init__()
             ...         self.my_parameter = torch.nn.Parameter(torch.tensor(2.0))
             ...         self.register_buffer("my_buffer1", torch.tensor(3.0))
diff --git a/torch/onnx/_internal/fx/dynamo_graph_extractor.py b/torch/onnx/_internal/fx/dynamo_graph_extractor.py
index a3b8a69f60d..5abf2bf2c63 100644
--- a/torch/onnx/_internal/fx/dynamo_graph_extractor.py
+++ b/torch/onnx/_internal/fx/dynamo_graph_extractor.py
@@ -24,7 +24,7 @@ class _PyTreeExtensionContext:
 
     _extensions: dict[type, tuple[pytree.FlattenFunc, pytree.UnflattenFunc]]
 
-    def __init__(self):
+    def __init__(self) -> None:
         self._extensions = {}
         # Register PyTree extension for HuggingFace model output.
         self._register_huggingface_model_output_extension()
diff --git a/torch/onnx/_internal/fx/passes/modularization.py b/torch/onnx/_internal/fx/passes/modularization.py
index 4f4d347401e..db74d52dda4 100644
--- a/torch/onnx/_internal/fx/passes/modularization.py
+++ b/torch/onnx/_internal/fx/passes/modularization.py
@@ -795,7 +795,7 @@ class Modularize(_pass.Transform):
         >>> from torch.onnx._internal.diagnostics import infra
         >>>
         >>> class CustomModule(torch.nn.Module):
-        >>>     def __init__(self):
+        >>>     def __init__(self) -> None:
         >>>         super().__init__()
         >>>         self.embedding = torch.nn.Embedding(10, 32)
         >>>         self.relu = torch.nn.ReLU()
@@ -806,7 +806,7 @@ class Modularize(_pass.Transform):
         >>>         return out
         >>>
         >>> class TestModule(torch.nn.Module):
-        >>>     def __init__(self):
+        >>>     def __init__(self) -> None:
         >>>         super().__init__()
         >>>         self.layer = CustomModule()
         >>>         self.linear = torch.nn.Linear(32, 10)
diff --git a/torch/onnx/_internal/fx/patcher.py b/torch/onnx/_internal/fx/patcher.py
index 3c0ee6c0714..239edb6dde6 100644
--- a/torch/onnx/_internal/fx/patcher.py
+++ b/torch/onnx/_internal/fx/patcher.py
@@ -53,7 +53,7 @@ class ONNXTorchPatcher:
         `torch.fx._symbolic_trace._wrapped_methods_to_patch`
     """
 
-    def __init__(self):
+    def __init__(self) -> None:
         # List of file paths processed by torch.load.
         self.paths: List[Union[str, io.BufferedIOBase]] = []
 
diff --git a/torch/onnx/_internal/onnxruntime.py b/torch/onnx/_internal/onnxruntime.py
index b9d6bce1651..59609866bae 100644
--- a/torch/onnx/_internal/onnxruntime.py
+++ b/torch/onnx/_internal/onnxruntime.py
@@ -602,7 +602,7 @@ class OrtExecutionInfoPerSession:
 
 @dataclasses.dataclass
 class OrtExecutionInfoForAllGraphModules:
-    def __init__(self):
+    def __init__(self) -> None:
         # All sessions (and their related information) created by exporting the same GraphModule
         # with different inputs.
         self.execution_info_per_graph_module: Dict[
diff --git a/torch/onnx/_internal/registration.py b/torch/onnx/_internal/registration.py
index c59ab11d4fa..95de41b3f03 100644
--- a/torch/onnx/_internal/registration.py
+++ b/torch/onnx/_internal/registration.py
@@ -69,7 +69,7 @@ class OverrideDict(Collection[_K], Generic[_K, _V]):
     ones.
     """
 
-    def __init__(self):
+    def __init__(self) -> None:
         self._base: Dict[_K, _V] = {}
         self._overrides: Dict[_K, _V] = {}
         self._merged: Dict[_K, _V] = {}
diff --git a/torch/onnx/verification.py b/torch/onnx/verification.py
index e8bcfe4ca9e..bcf1de6b643 100644
--- a/torch/onnx/verification.py
+++ b/torch/onnx/verification.py
@@ -1722,7 +1722,7 @@ def find_mismatch(
         ...     opset_version=opset_version,
         ... )
         >>> class Model(torch.nn.Module):
-        ...     def __init__(self):
+        ...     def __init__(self) -> None:
         ...         super().__init__()
         ...         self.layers = torch.nn.Sequential(
         ...             torch.nn.Linear(3, 4),
diff --git a/torch/overrides.py b/torch/overrides.py
index bbd055de447..ecb5613f80f 100644
--- a/torch/overrides.py
+++ b/torch/overrides.py
@@ -2025,7 +2025,7 @@ class TorchFunctionMode:
     inner: "TorchFunctionMode"
 
     # Force metaclass to generate constructor at the base of the hierarchy
-    def __init__(self):
+    def __init__(self) -> None:
         pass
 
     def __torch_function__(self, func, types, args=(), kwargs=None):
diff --git a/torch/package/_mangling.py b/torch/package/_mangling.py
index 7dcf3538631..700a9ad6a04 100644
--- a/torch/package/_mangling.py
+++ b/torch/package/_mangling.py
@@ -12,7 +12,7 @@ class PackageMangler:
     Used on import, to ensure that all modules imported have a shared mangle parent.
     """
 
-    def __init__(self):
+    def __init__(self) -> None:
         global _mangle_index
         self._mangle_index = _mangle_index
         # Increment the global index
diff --git a/torch/profiler/profiler.py b/torch/profiler/profiler.py
index 47faac8c176..98f1c1b6735 100644
--- a/torch/profiler/profiler.py
+++ b/torch/profiler/profiler.py
@@ -772,7 +772,7 @@ class ExecutionTraceObserver(_ITraceObserver):
     incurring any overheads.
     """
 
-    def __init__(self):
+    def __init__(self) -> None:
         """
         Initializes the default states.
         """
diff --git a/torch/testing/_internal/common_fsdp.py b/torch/testing/_internal/common_fsdp.py
index 51a3deac9c1..fe02eeeabb1 100644
--- a/torch/testing/_internal/common_fsdp.py
+++ b/torch/testing/_internal/common_fsdp.py
@@ -1498,7 +1498,7 @@ def test_compiled_fsdp(compile_compute_on_module: Optional[type] = None):
 
 
 class SkipModule(nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.lin = nn.Linear(10, 10, bias=False)
 
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 4011f6752de..8f63db32a07 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -3591,7 +3591,7 @@ def error_inputs_adaptive_max_pool3d(opinfo, device, **kwargs):
 
 class _TestParamsMaxPoolBase:
 
-    def __init__(self):
+    def __init__(self) -> None:
         self.kwargs = {
             'kernel_size': [3],
             'stride': [2, None],
@@ -3628,7 +3628,7 @@ class _TestParamsMaxPoolBase:
 
 class _TestParamsMaxPool1d(_TestParamsMaxPoolBase):
 
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.kwargs['kernel_size'] += [(3,)]
         self.kwargs['stride'] += [(2,)]
@@ -3637,7 +3637,7 @@ class _TestParamsMaxPool1d(_TestParamsMaxPoolBase):
 
 class _TestParamsMaxPool2d(_TestParamsMaxPoolBase):
 
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.kwargs['kernel_size'] += [(3, 2)]
         self.kwargs['stride'] += [(2, 1)]
@@ -3648,7 +3648,7 @@ class _TestParamsMaxPool2d(_TestParamsMaxPoolBase):
 
 class _TestParamsMaxPool3d(_TestParamsMaxPoolBase):
 
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.kwargs['kernel_size'] += [(3, 2, 3)]
         self.kwargs['stride'] += [(2, 1, 2)]
diff --git a/torch/testing/_internal/common_nn.py b/torch/testing/_internal/common_nn.py
index 0dd11312c04..7f53a72a7ef 100644
--- a/torch/testing/_internal/common_nn.py
+++ b/torch/testing/_internal/common_nn.py
@@ -3967,13 +3967,13 @@ def _test_module_empty_input(test_case, module, inp, check_size=True, inference=
 
 def _create_basic_net():
     class Layer(nn.Module):
-        def __init__(self):
+        def __init__(self) -> None:
             super().__init__()
             self.layer_dummy_param = nn.Parameter(torch.empty(3, 5))
             self.layer_dummy_buf = nn.Buffer(torch.zeros(1, 3, 3, 7))
 
     class Net(nn.Module):
-        def __init__(self):
+        def __init__(self) -> None:
             super().__init__()
             self.l1 = Layer()
             self.dummy_param = nn.Parameter(torch.empty(3, 5))
diff --git a/torch/testing/_internal/common_pruning.py b/torch/testing/_internal/common_pruning.py
index 031e4ad9efb..43dd716c288 100644
--- a/torch/testing/_internal/common_pruning.py
+++ b/torch/testing/_internal/common_pruning.py
@@ -52,7 +52,7 @@ class SimpleLinear(nn.Module):
     r"""Model with only Linear layers without biases, some wrapped in a Sequential,
     some following the Sequential. Used to test basic pruned Linear-Linear fusion."""
 
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.seq = nn.Sequential(
             nn.Linear(7, 5, bias=False),
@@ -73,7 +73,7 @@ class LinearBias(nn.Module):
     r"""Model with only Linear layers, alternating layers with biases,
     wrapped in a Sequential. Used to test pruned Linear-Bias-Linear fusion."""
 
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.seq = nn.Sequential(
             nn.Linear(7, 5, bias=True),
@@ -93,7 +93,7 @@ class LinearActivation(nn.Module):
     Activation functions modules in between each Linear in the Sequential, and each outside layer.
     Used to test pruned Linear(Bias)-Activation-Linear fusion."""
 
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.seq = nn.Sequential(
             nn.Linear(7, 5, bias=True),
@@ -122,7 +122,7 @@ class LinearActivationFunctional(nn.Module):
     activationals are called in between each outside layer.
     Used to test pruned Linear(Bias)-Activation-Linear fusion."""
 
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.seq = nn.Sequential(
             nn.Linear(7, 5, bias=True),
@@ -151,7 +151,7 @@ class SimpleConv2d(nn.Module):
     r"""Model with only Conv2d layers, all without bias, some in a Sequential and some following.
     Used to test pruned Conv2d-Conv2d fusion."""
 
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.seq = nn.Sequential(
             nn.Conv2d(1, 32, 3, 1, bias=False),
@@ -171,7 +171,7 @@ class Conv2dBias(nn.Module):
     r"""Model with only Conv2d layers, some with bias, some in a Sequential and some outside.
     Used to test pruned Conv2d-Bias-Conv2d fusion."""
 
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.seq = nn.Sequential(
             nn.Conv2d(1, 32, 3, 1, bias=True),
@@ -194,7 +194,7 @@ class Conv2dActivation(nn.Module):
     in-between each outside layer.
     Used to test pruned Conv2d-Bias-Activation-Conv2d fusion."""
 
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.seq = nn.Sequential(
             nn.Conv2d(1, 32, 3, 1, bias=True),
@@ -222,7 +222,7 @@ class Conv2dPadBias(nn.Module):
     Used to test that bias is propagated correctly in the special case of
     pruned Conv2d-Bias-(Activation)Conv2d fusion, when the second Conv2d layer has padding > 0."""
 
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.seq = nn.Sequential(
             nn.Conv2d(1, 32, 3, 1, padding=1, bias=True),
@@ -255,7 +255,7 @@ class Conv2dPool(nn.Module):
     Activation function modules in between each layer, Pool2d modules in between each layer.
     Used to test pruned Conv2d-Pool2d-Conv2d fusion."""
 
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.seq = nn.Sequential(
             nn.Conv2d(1, 32, kernel_size=3, padding=1, bias=True),
@@ -289,7 +289,7 @@ class Conv2dPoolFlattenFunctional(nn.Module):
     Activation functions and Pool2ds in between each layer also.
     Used to test pruned Conv2d-Pool2d-Flatten-Linear fusion."""
 
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.seq = nn.Sequential(
             nn.Conv2d(1, 3, kernel_size=3, padding=1, bias=True),
@@ -323,7 +323,7 @@ class Conv2dPoolFlatten(nn.Module):
     Activation functions and Pool2ds in between each layer also.
     Used to test pruned Conv2d-Pool2d-Flatten-Linear fusion."""
 
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.seq = nn.Sequential(
             nn.Conv2d(1, 3, kernel_size=3, padding=1, bias=True),
diff --git a/torch/testing/_internal/common_quantization.py b/torch/testing/_internal/common_quantization.py
index 553d483ab0a..2482629fe99 100644
--- a/torch/testing/_internal/common_quantization.py
+++ b/torch/testing/_internal/common_quantization.py
@@ -1326,7 +1326,7 @@ class PT2EQuantizationTestCase(QuantizationTestCase):
 
     def _get_pt2e_quantized_linear(self, is_per_channel=False) -> torch.fx.GraphModule:
         class M(torch.nn.Module):
-            def __init__(self):
+            def __init__(self) -> None:
                 super().__init__()
                 self.linear = torch.nn.Linear(2, 2)
 
@@ -1343,7 +1343,7 @@ class PT2EQuantizationTestCase(QuantizationTestCase):
 # Below are a series of toy models to use in testing quantization
 
 class SingleLayerLinearModel(torch.nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.fc1 = torch.nn.Linear(5, 5).to(dtype=torch.float)
 
@@ -1381,7 +1381,7 @@ class SingleLayerLinearDynamicModel(torch.nn.Module):
         return (torch.rand(1, 5),)
 
 class LinearAddModel(nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.fc1 = torch.nn.Linear(5, 8).to(dtype=torch.float)
         self.fc2 = torch.nn.Linear(8, 5).to(dtype=torch.float)
@@ -1436,7 +1436,7 @@ class LSTMwithHiddenDynamicModel(torch.nn.Module):
         return x, hid
 
 class ConvModel(torch.nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.conv = torch.nn.Conv2d(3, 5, 3, bias=False).to(dtype=torch.float)
 
@@ -1448,7 +1448,7 @@ class ConvModel(torch.nn.Module):
         return (torch.rand(1, 3, 5, 5),)
 
 class ConvTransposeModel(torch.nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.conv = torch.nn.ConvTranspose2d(3, 5, 3, bias=False).to(dtype=torch.float)
 
@@ -1494,7 +1494,7 @@ class AnnotatedConvTransposeModel(torch.nn.Module):
         return (torch.rand(1, 3, 5, 5),)
 
 class ConvBnModel(torch.nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.conv = torch.nn.Conv2d(3, 5, 3, bias=False).to(dtype=torch.float)
         self.bn = torch.nn.BatchNorm2d(5).to(dtype=torch.float)
@@ -1508,7 +1508,7 @@ class ConvBnModel(torch.nn.Module):
         return (torch.rand(1, 3, 5, 5),)
 
 class AnnotatedConvBnModel(torch.nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.qconfig = default_qconfig
         self.conv = torch.nn.Conv2d(3, 5, 3, bias=False).to(dtype=torch.float)
@@ -1527,7 +1527,7 @@ class AnnotatedConvBnModel(torch.nn.Module):
         return (torch.rand(1, 3, 5, 5),)
 
 class ConvBnReLUModel(torch.nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.conv = torch.nn.Conv2d(3, 5, 3, bias=False).to(dtype=torch.float)
         self.bn = torch.nn.BatchNorm2d(5).to(dtype=torch.float)
@@ -1571,7 +1571,7 @@ class AnnotatedConvBnReLUModel(torch.nn.Module):
         return (torch.rand(1, 3, 5, 5),)
 
 class TwoLayerConvModel(torch.nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.conv1 = torch.nn.Conv2d(3, 5, 3, bias=False).to(dtype=torch.float)
         self.conv2 = torch.nn.Conv2d(5, 5, 1, bias=False).to(dtype=torch.float)
@@ -1585,7 +1585,7 @@ class TwoLayerConvModel(torch.nn.Module):
         return (torch.rand(1, 3, 5, 5),)
 
 class TwoLayerLinearModel(torch.nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.fc1 = torch.nn.Linear(5, 8).to(dtype=torch.float)
         self.fc2 = torch.nn.Linear(8, 5).to(dtype=torch.float)
@@ -1599,7 +1599,7 @@ class TwoLayerLinearModel(torch.nn.Module):
         return (torch.rand(1, 5),)
 
 class LinearModelWithSubmodule(nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.subm = TwoLayerLinearModel()
         self.fc = nn.Linear(5, 5)
@@ -1613,7 +1613,7 @@ class LinearModelWithSubmodule(nn.Module):
         return self.subm.get_example_inputs()
 
 class AnnotatedTwoLayerLinearModel(torch.nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.fc1 = torch.nn.Linear(5, 8).to(dtype=torch.float)
         self.fc2 = QuantWrapper(torch.nn.Linear(8, 5).to(dtype=torch.float))
@@ -1628,7 +1628,7 @@ class AnnotatedTwoLayerLinearModel(torch.nn.Module):
         return (torch.rand(1, 5),)
 
 class ActivationsTestModel(torch.nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.qconfig = torch.ao.quantization.get_default_qconfig("fbgemm")
         self.quant = torch.ao.quantization.QuantStub()
@@ -1644,7 +1644,7 @@ class ActivationsTestModel(torch.nn.Module):
         return x
 
 class LinearReluModel(torch.nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.fc = torch.nn.Linear(5, 5).to(dtype=torch.float)
         self.relu = torch.nn.ReLU()
@@ -1658,7 +1658,7 @@ class LinearReluModel(torch.nn.Module):
 
 
 class LinearReluLinearModel(torch.nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.fc1 = torch.nn.Linear(5, 8).to(dtype=torch.float)
         self.relu = torch.nn.ReLU()
@@ -1674,7 +1674,7 @@ class LinearReluLinearModel(torch.nn.Module):
         return (torch.rand(1, 5),)
 
 class LinearReluAddModel(torch.nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.fc1 = torch.nn.Linear(5, 5).to(dtype=torch.float)
         self.relu = torch.nn.ReLU()
@@ -1710,7 +1710,7 @@ class LinearBnLeakyReluModel(torch.nn.Module):
         return (torch.rand(1, 5),)
 
 class LinearTanhModel(torch.nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.linear = nn.Linear(5, 5)
         self.tanh = nn.Tanh()
@@ -1785,7 +1785,7 @@ class ConvBnAddReluModel(torch.nn.Module):
 
 # TODO: self.fc should be self.conv
 class ConvReluModel(torch.nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.fc = torch.nn.Conv2d(3, 5, 3).to(dtype=torch.float)
         self.relu = torch.nn.ReLU()
@@ -1799,7 +1799,7 @@ class ConvReluModel(torch.nn.Module):
 
 # TODO: self.fc should be self.conv
 class ConvReluConvModel(torch.nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.fc1 = torch.nn.Conv2d(3, 5, 3).to(dtype=torch.float)
         self.relu = torch.nn.ReLU()
@@ -1816,7 +1816,7 @@ class ConvReluConvModel(torch.nn.Module):
 
 # TODO: self.fc should be self.conv
 class ConvReluAddModel(torch.nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.fc1 = torch.nn.Conv2d(3, 5, 3).to(dtype=torch.float)
         self.relu = torch.nn.ReLU()
@@ -1834,7 +1834,7 @@ class ConvReluAddModel(torch.nn.Module):
         return (torch.rand(1, 3, 5, 5),)
 
 class NormalizationTestModel(torch.nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.quant = torch.ao.quantization.QuantStub()
         self.fc1 = torch.nn.Linear(5, 8).to(dtype=torch.float)
@@ -1855,7 +1855,7 @@ class NormalizationTestModel(torch.nn.Module):
         return x
 
 class NestedModel(torch.nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.sub1 = LinearReluModel()
         self.sub2 = TwoLayerLinearModel()
@@ -1887,7 +1887,7 @@ class AnnotatedNestedModel(torch.nn.Module):
         return x
 
 class AnnotatedSubNestedModel(torch.nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.sub1 = LinearReluModel()
         self.sub2 = QuantWrapper(TwoLayerLinearModel())
@@ -1902,7 +1902,7 @@ class AnnotatedSubNestedModel(torch.nn.Module):
         return x
 
 class AnnotatedCustomConfigNestedModel(torch.nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.sub1 = LinearReluModel()
         self.sub2 = TwoLayerLinearModel()
@@ -1928,7 +1928,7 @@ class AnnotatedCustomConfigNestedModel(torch.nn.Module):
         return x
 
 class QuantSubModel(torch.nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.sub1 = LinearReluModel()
         self.sub2 = QuantWrapper(TwoLayerLinearModel())
@@ -1943,7 +1943,7 @@ class QuantSubModel(torch.nn.Module):
         return x
 
 class InnerModule(torch.nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.fc1 = torch.nn.Linear(5, 8).to(dtype=torch.float)
         self.relu1 = torch.nn.ReLU()
@@ -1970,7 +1970,7 @@ class InnerModule(torch.nn.Module):
             torch.ao.quantization.fuse_modules(self, fusable_layers, inplace=True)
 
 class FunctionalLinear(torch.nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.weight = torch.rand((5, 5))
         self.bias = torch.zeros(5)
@@ -1982,7 +1982,7 @@ class FunctionalLinear(torch.nn.Module):
         return (torch.rand(1, 5),)
 
 class SingleLayerFunctionalLinearModel(torch.nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.linear1 = FunctionalLinear()
 
@@ -1994,7 +1994,7 @@ class SingleLayerFunctionalLinearModel(torch.nn.Module):
         return self.linear1.get_example_inputs()
 
 class TwoLayerFunctionalLinearModel(torch.nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.linear1 = FunctionalLinear()
         self.linear2 = FunctionalLinear()
@@ -2008,7 +2008,7 @@ class TwoLayerFunctionalLinearModel(torch.nn.Module):
         return self.linear1.get_example_inputs()
 
 class FunctionalLinearAddModel(torch.nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.linear1 = FunctionalLinear()
         self.linear2 = FunctionalLinear()
@@ -2023,7 +2023,7 @@ class FunctionalLinearAddModel(torch.nn.Module):
         return self.linear1.get_example_inputs()
 
 class FunctionalLinearReluModel(nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.linear = FunctionalLinear()
 
@@ -2036,7 +2036,7 @@ class FunctionalLinearReluModel(nn.Module):
         return self.linear.get_example_inputs()
 
 class FunctionalLinearReluLinearModel(nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.linear1 = FunctionalLinear()
         self.relu = nn.ReLU()
@@ -2052,7 +2052,7 @@ class FunctionalLinearReluLinearModel(nn.Module):
         return self.linear1.get_example_inputs()
 
 class FunctionalConv2d(torch.nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.weight = torch.rand(3, 3, 3, 3)
         self.bias = torch.rand(3)
@@ -2068,7 +2068,7 @@ class FunctionalConv2d(torch.nn.Module):
         return (torch.rand(1, 3, 5, 5),)
 
 class SingleLayerFunctionalConvModel(torch.nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.conv1 = FunctionalConv2d()
 
@@ -2080,7 +2080,7 @@ class SingleLayerFunctionalConvModel(torch.nn.Module):
         return self.conv1.get_example_inputs()
 
 class TwoLayerFunctionalConvModel(torch.nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.conv1 = FunctionalConv2d()
         self.conv2 = FunctionalConv2d()
@@ -2094,7 +2094,7 @@ class TwoLayerFunctionalConvModel(torch.nn.Module):
         return self.conv1.get_example_inputs()
 
 class FunctionalConvReluModel(nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.conv = FunctionalConv2d()
 
@@ -2107,7 +2107,7 @@ class FunctionalConvReluModel(nn.Module):
         return self.conv.get_example_inputs()
 
 class FunctionalConvReluConvModel(nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.conv1 = FunctionalConv2d()
         self.relu = nn.ReLU()
@@ -2126,7 +2126,7 @@ class SkipQuantModel(torch.nn.Module):
     r"""We can skip quantization by explicitly
     setting qconfig of a submodule to None
     """
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.sub = InnerModule()
         self.fc = torch.nn.Linear(5, 5).to(dtype=torch.float)
@@ -2158,7 +2158,7 @@ class AnnotatedSkipQuantModel(torch.nn.Module):
 class QuantStubModel(torch.nn.Module):
     r"""A Module with manually inserted `QuantStub` and `DeQuantStub`
     """
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.qconfig = torch.ao.quantization.get_default_qconfig("qnnpack")
         self.quant = QuantStub()
@@ -2243,11 +2243,11 @@ class ManualConvLinearSymmQATModel(ManualConvLinearQATModel):
     r"""Same as ManualConvLinearQATModule but with Symmetric Quantization.
     Supported only with qnnpack.
     """
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__(default_symmetric_qnnpack_qat_qconfig)
 
 class ManualEmbeddingBagLinear(nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.emb = nn.EmbeddingBag(num_embeddings=10, embedding_dim=12, mode='sum')
         self.emb.qconfig = default_embedding_qat_qconfig
@@ -2287,7 +2287,7 @@ class DeFusedEmbeddingBagLinear(nn.Module):
         return self.dequant(x)
 
 class SubModelForFusion(nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.conv = nn.Conv2d(2, 2, 1, bias=None).to(dtype=torch.float)
         self.bn = nn.BatchNorm2d(2).to(dtype=torch.float)
@@ -2299,7 +2299,7 @@ class SubModelForFusion(nn.Module):
 
 
 class SubModelWithoutFusion(nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.conv = nn.Conv2d(2, 2, 1, bias=None).to(dtype=torch.float)
         self.relu = nn.ReLU(inplace=False).to(dtype=torch.float)
@@ -2354,7 +2354,7 @@ class ModelForFusion(nn.Module):
         return x
 
 class ConvBNReLU(nn.Sequential):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__(
             nn.Conv2d(3, 3, 1, 1, bias=False),
             nn.BatchNorm2d(3),
@@ -2362,7 +2362,7 @@ class ConvBNReLU(nn.Sequential):
         )
 
 class ModelWithSequentialFusion(nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.conv1 = nn.Conv2d(3, 3, 1)
         self.relu1 = nn.ReLU(inplace=False)
@@ -2388,7 +2388,7 @@ class ModelWithSequentialFusion(nn.Module):
         return x
 
 class ModelForFusionWithBias(nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.conv1 = nn.Conv2d(3, 2, 5, bias=True).to(dtype=torch.float)
         self.bn1 = nn.BatchNorm2d(2).to(dtype=torch.float)
@@ -2409,7 +2409,7 @@ class ModelForFusionWithBias(nn.Module):
         return x
 
 class ModelForLinearBNFusion(nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.fc = nn.Linear(20, 10)
         self.bn = nn.BatchNorm1d(10)
@@ -2428,7 +2428,7 @@ class DummyObserver(torch.nn.Module):
 
 
 class ModelForConvTransposeBNFusion(nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.conv1 = nn.ConvTranspose1d(3, 3, 1)
         self.bn1 = nn.BatchNorm1d(3)
@@ -2450,7 +2450,7 @@ class ModelForConvTransposeBNFusion(nn.Module):
 
 
 class ModelWithFunctionals(torch.nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.mycat = nnq.FloatFunctional()
         self.myadd = nnq.FloatFunctional()
@@ -2474,7 +2474,7 @@ class ModelWithFunctionals(torch.nn.Module):
 
 
 class ResNetBase(torch.nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         norm_layer = nn.BatchNorm2d
         inplanes = 3
@@ -2507,7 +2507,7 @@ class ResNetBase(torch.nn.Module):
             torch.ao.quantization.fuse_modules(self, [['conv1', 'bn1', 'relu1']], inplace=True)
 
 class ModelMultipleOps(torch.nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         norm_layer = nn.BatchNorm2d
         inplanes = 3
@@ -2542,7 +2542,7 @@ class ModelMultipleOps(torch.nn.Module):
 # accurately with fake-quant so this model does not
 # contain those operations
 class ModelMultipleOpsNoAvgPool(torch.nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         norm_layer = nn.BatchNorm2d
         inplanes = 3
@@ -2572,7 +2572,7 @@ class ModelMultipleOpsNoAvgPool(torch.nn.Module):
         return out
 
 class EmbeddingBagModule(torch.nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.emb = torch.nn.EmbeddingBag(num_embeddings=10, embedding_dim=12,
                                          include_last_offset=True, scale_grad_by_freq=False, mode='sum')
@@ -2581,7 +2581,7 @@ class EmbeddingBagModule(torch.nn.Module):
         return self.emb(indices, offsets, per_sample_weights)
 
 class EmbeddingModule(torch.nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.emb = torch.nn.Embedding(num_embeddings=10, embedding_dim=12)
 
@@ -2589,7 +2589,7 @@ class EmbeddingModule(torch.nn.Module):
         return self.emb(indices)
 
 class EmbeddingWithStaticLinear(torch.nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.emb = torch.nn.EmbeddingBag(num_embeddings=10, embedding_dim=12)
         self.fc = torch.nn.Linear(4, 2)
@@ -2671,7 +2671,7 @@ class SparseNNModel(nn.Module):
 
 class TestHelperModules:
     class Conv2dPropAnnotaton(torch.nn.Module):
-        def __init__(self):
+        def __init__(self) -> None:
             super().__init__()
             self.conv = torch.nn.Conv2d(3, 3, 3)
             self.linear = torch.nn.Linear(3, 3)
@@ -2684,7 +2684,7 @@ class TestHelperModules:
             return x
 
     class Conv2dWithObsSharingOps(torch.nn.Module):
-        def __init__(self):
+        def __init__(self) -> None:
             super().__init__()
             self.conv = torch.nn.Conv2d(3, 3, 3)
             self.hardtanh = torch.nn.Hardtanh()
@@ -2698,7 +2698,7 @@ class TestHelperModules:
             return x
 
     class Conv2dWithTwoLinearPermute(torch.nn.Module):
-        def __init__(self):
+        def __init__(self) -> None:
             super().__init__()
             self.conv = torch.nn.Conv2d(3, 16, 3)
             self.linear1 = torch.nn.Linear(16, 8, bias=False)
@@ -2710,7 +2710,7 @@ class TestHelperModules:
             return self.linear2(self.linear1(permute_out))
 
     class Conv2dWithTwoLinear(torch.nn.Module):
-        def __init__(self):
+        def __init__(self) -> None:
             super().__init__()
             self.conv = torch.nn.Conv2d(3, 16, 3)
             self.linear1 = torch.nn.Linear(64, 8, bias=False)
@@ -2722,7 +2722,7 @@ class TestHelperModules:
             return self.linear2(self.linear1(reshape_out))
 
     class ConvLinearWPermute(torch.nn.Module):
-        def __init__(self):
+        def __init__(self) -> None:
             super().__init__()
             self.conv = torch.nn.Conv2d(3, 8, 3)
             self.linear1 = torch.nn.Linear(8, 8)
@@ -2733,7 +2733,7 @@ class TestHelperModules:
             return self.linear1(permute_out)
 
     class TwoLinearModule(torch.nn.Module):
-        def __init__(self):
+        def __init__(self) -> None:
             super().__init__()
             self.linear1 = torch.nn.Linear(8, 16, bias=False)
             self.linear2 = torch.nn.Linear(16, 8)
@@ -2742,7 +2742,7 @@ class TestHelperModules:
             return self.linear2(self.linear1(x))
 
     class ConvMaxPool2d(torch.nn.Module):
-        def __init__(self):
+        def __init__(self) -> None:
             super().__init__()
             self.conv = torch.nn.Conv2d(2, 2, 1)
             self.pool = torch.nn.MaxPool2d(1, 1)
@@ -2753,7 +2753,7 @@ class TestHelperModules:
             return x
 
     class ConvWithAdaptiveAvgPool2d(torch.nn.Module):
-        def __init__(self):
+        def __init__(self) -> None:
             super().__init__()
             self.conv = torch.nn.Conv2d(3, 3, 3)
             self.adaptive_avg_pool2d = torch.nn.AdaptiveAvgPool2d((1, 1))
@@ -2806,7 +2806,7 @@ class TestHelperModules:
             return self.relu(x)
 
     class Conv2dThenConv1d(torch.nn.Module):
-        def __init__(self):
+        def __init__(self) -> None:
             super().__init__()
             self.conv1d = torch.nn.Conv1d(3, 3, 3)
             self.conv2d = torch.nn.Conv2d(3, 3, 3)
@@ -2821,7 +2821,7 @@ class TestHelperModules:
             return (torch.randn(1, 3, 5, 5),)
 
     class Conv2dWithCat(torch.nn.Module):
-        def __init__(self):
+        def __init__(self) -> None:
             super().__init__()
             self.conv1 = torch.nn.Conv2d(3, 3, 3)
             self.conv2 = torch.nn.Conv2d(3, 3, 3)
@@ -2833,7 +2833,7 @@ class TestHelperModules:
             return z
 
     class Conv2dWithTwoCat(torch.nn.Module):
-        def __init__(self):
+        def __init__(self) -> None:
             super().__init__()
             self.conv1 = torch.nn.Conv2d(3, 3, 3)
             self.conv2 = torch.nn.Conv2d(3, 3, 3)
@@ -2854,7 +2854,7 @@ class TestHelperModules:
             return w
 
     class EmbeddingModule(torch.nn.Module):
-        def __init__(self):
+        def __init__(self) -> None:
             super().__init__()
             self.emb = torch.nn.Embedding(num_embeddings=10, embedding_dim=12)
 
@@ -2862,7 +2862,7 @@ class TestHelperModules:
             return self.emb(indices)
 
     class EmbeddingConvLinearModule(torch.nn.Module):
-        def __init__(self):
+        def __init__(self) -> None:
             super().__init__()
             self.emb = torch.nn.Embedding(num_embeddings=10, embedding_dim=8)
             self.conv = torch.nn.Conv2d(8, 16, (1, 3))
@@ -2898,7 +2898,7 @@ class TestHelperModules:
             return x
 
     class ConvBnReLU2dAndLinearReLU(torch.nn.Module):
-        def __init__(self):
+        def __init__(self) -> None:
             super().__init__()
             self.conv_bn_relu = TestHelperModules.ConvWithBNRelu(relu=True)
             self.linear = torch.nn.Linear(3, 8, bias=False)
@@ -2911,7 +2911,7 @@ class TestHelperModules:
             return linear_out
 
     class GroupwiseConv2d(torch.nn.Module):
-        def __init__(self):
+        def __init__(self) -> None:
             super().__init__()
             self.conv = torch.nn.Conv2d(4, 4, 3, groups=2)
 
@@ -2922,7 +2922,7 @@ class TestHelperModules:
             return (torch.randn(2, 4, 10, 10),)
 
     class LinearReluModel(torch.nn.Module):
-        def __init__(self):
+        def __init__(self) -> None:
             super().__init__()
             self.fc = torch.nn.Linear(5, 5).to(dtype=torch.float)
             self.relu = torch.nn.ReLU()
diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py
index 12723039358..8ec568c665c 100644
--- a/torch/testing/_internal/common_utils.py
+++ b/torch/testing/_internal/common_utils.py
@@ -1092,7 +1092,7 @@ def sanitize_pytest_xml(xml_file: str):
 
 def get_pytest_test_cases(argv: List[str]) -> List[str]:
     class TestCollectorPlugin:
-        def __init__(self):
+        def __init__(self) -> None:
             self.tests = []
 
         def pytest_collection_finish(self, session):
diff --git a/torch/testing/_internal/data/network1.py b/torch/testing/_internal/data/network1.py
index e6180f4f2d2..8755643a78c 100644
--- a/torch/testing/_internal/data/network1.py
+++ b/torch/testing/_internal/data/network1.py
@@ -5,6 +5,6 @@ import torch.nn as nn
 
 class Net(nn.Module):
 
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.linear = nn.Linear(10, 20)
diff --git a/torch/testing/_internal/data/network2.py b/torch/testing/_internal/data/network2.py
index fdb583d0af9..19b0b8ee53d 100644
--- a/torch/testing/_internal/data/network2.py
+++ b/torch/testing/_internal/data/network2.py
@@ -5,7 +5,7 @@ import torch.nn as nn
 
 class Net(nn.Module):
 
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.linear = nn.Linear(10, 20)
         self.relu = nn.ReLU()
diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py
index 276dc4fa6e7..a8e1434ecdb 100644
--- a/torch/testing/_internal/distributed/distributed_test.py
+++ b/torch/testing/_internal/distributed/distributed_test.py
@@ -107,7 +107,7 @@ else:
 
 
 class NetWithBuffers(nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.a = nn.Linear(10, 10, bias=False)
         self.b = nn.Linear(10, 1, bias=False)
@@ -260,7 +260,7 @@ class DDPUnevenTestInput(NamedTuple):
 
 
 class _FC2(nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.fc = nn.Linear(10, 50, bias=True)
         self.fc.bias.requires_grad = False
@@ -271,7 +271,7 @@ class _FC2(nn.Module):
 
 
 class Net(nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.fc1 = nn.Linear(2, 10, bias=False)
         self.fc2 = _FC2()
@@ -289,7 +289,7 @@ class Net(nn.Module):
 
 
 class LargeNet(nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.fc1 = nn.Linear(1000, 2000, bias=False)
         self.fc2 = nn.Linear(2000, 500, bias=False)
@@ -301,7 +301,7 @@ class LargeNet(nn.Module):
 
 
 class Task(nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.p = nn.Parameter(torch.ones(2, 2))
 
@@ -325,7 +325,7 @@ class BatchNormNet(nn.Module):
 
 
 class UnusedParamTwoLinLayerNet(nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.a = nn.Linear(10, 10, bias=False)
         self.b = nn.Linear(10, 10, bias=False)
@@ -338,7 +338,7 @@ class UnusedParamTwoLinLayerNet(nn.Module):
 
 
 class DictOutputModule(nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.module = UnusedParamTwoLinLayerNet()
 
@@ -352,7 +352,7 @@ class DictOutputModule(nn.Module):
 
 
 class TwoLinLayerNet(nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.a = nn.Linear(10, 10, bias=False)
         self.b = nn.Linear(10, 1, bias=False)
@@ -383,7 +383,7 @@ class EmbeddingNetDifferentParams(nn.Module):
 
 
 class ControlFlowToyModel(nn.Module):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.lin1 = nn.Linear(10, 10, bias=False)
         self.lin2 = nn.Linear(10, 10, bias=False)
@@ -4408,7 +4408,7 @@ class DistributedTest:
         @skip_if_lt_x_gpu(int(os.environ["WORLD_SIZE"]))
         def test_ddp_zero_output_features(self):
             class ToyModel(nn.Module):
-                def __init__(self):
+                def __init__(self) -> None:
                     super().__init__()
                     self.net1 = nn.Linear(10, 10)
                     self.relu = nn.ReLU()
@@ -4422,7 +4422,7 @@ class DistributedTest:
         @skip_but_pass_in_sandcastle_if(BACKEND == "nccl", "Gloo-only test")
         def test_ddp_create_graph(self):
             class Model(nn.Module):
-                def __init__(self):
+                def __init__(self) -> None:
                     super().__init__()
                     self.p = nn.Parameter(torch.tensor(1.0))
 
@@ -4979,7 +4979,7 @@ class DistributedTest:
             mp_config = self._get_fp16_config()
 
             class MyModel(torch.nn.Module):
-                def __init__(self):
+                def __init__(self) -> None:
                     super().__init__()
                     self.m = torch.nn.Linear(1, 5)
                     self.register_buffer('buffer', torch.randn(1, 2))
@@ -7241,7 +7241,7 @@ class DistributedTest:
             # for models with SyncBN or general collective comm when
             # throw_on_early_termination=True.
             class ModelWithComm(torch.nn.Module):
-                def __init__(self):
+                def __init__(self) -> None:
                     super().__init__()
                     self.lin = nn.Linear(2, 40, bias=False)
 
@@ -7523,7 +7523,7 @@ class DistributedTest:
             error_str = "Intentional error"
 
             class ExceptionModule(nn.Module):
-                def __init__(self):
+                def __init__(self) -> None:
                     super().__init__()
                     self.param = nn.Parameter(torch.ones(1, requires_grad=True))
 
@@ -7731,7 +7731,7 @@ class DistributedTest:
         @skip_if_lt_x_gpu(2)
         def test_ddp_unused_params_rebuild_buckets_exception(self):
             class ToyModel(nn.Module):
-                def __init__(self):
+                def __init__(self) -> None:
                     super().__init__()
                     self.net1 = nn.Linear(10, 10, bias=False)
                     self.net2 = nn.Linear(10, 10, bias=False)
@@ -7785,7 +7785,7 @@ class DistributedTest:
             # When find_unused_parameters=True, ensure we mark unused parameters
             # even if they share gradient accumulators.
             class ToyModel(nn.Module):
-                def __init__(self):
+                def __init__(self) -> None:
                     super().__init__()
                     # net1, bias, and net1.bias are all unused params.
                     self.net1 = nn.Linear(10, 5, bias=False)
@@ -8984,7 +8984,7 @@ class DistributedTest:
         @skip_if_lt_x_gpu(2)
         def test_ddp_build_debug_param_to_name_mapping_requires_grad(self):
             class Net(nn.Module):
-                def __init__(self):
+                def __init__(self) -> None:
                     super().__init__()
                     self.lin = nn.Linear(10, 10)
                     # Is not tracked by DDP and should not show up in param to
@@ -9009,7 +9009,7 @@ class DistributedTest:
             debug_mode_off = dist.get_debug_level() == dist.DebugLevel.OFF
 
             class SubModule(nn.Module):
-                def __init__(self):
+                def __init__(self) -> None:
                     super().__init__()
                     self.embedding_net = EmbeddingNetDifferentParams(0)
                     self.lin = TwoLinLayerNet()
@@ -9025,7 +9025,7 @@ class DistributedTest:
                     return x
 
             class MyModel(nn.Module):
-                def __init__(self):
+                def __init__(self) -> None:
                     super().__init__()
                     self.sub_module = SubModule()
 
@@ -9261,7 +9261,7 @@ class DistributedTest:
             torch.cuda.set_device(rank)
 
             class NestedOutputModule(torch.nn.Module):
-                def __init__(self):
+                def __init__(self) -> None:
                     super().__init__()
                     self.lin = nn.Linear(100, 1, bias=False)
 
@@ -9347,7 +9347,7 @@ class DistributedTest:
             torch.cuda.set_device(self.rank)
 
             class MyModel(nn.Module):
-                def __init__(self):
+                def __init__(self) -> None:
                     super().__init__()
                     self.fc1 = nn.Linear(10, 10, bias=False)
                     self.fc2 = nn.Linear(10, 10, bias=False)
@@ -9384,7 +9384,7 @@ class DistributedTest:
         )
         def test_detect_ddp_is_actually_static(self):
             class ToyModel(nn.Module):
-                def __init__(self):
+                def __init__(self) -> None:
                     super().__init__()
                     self.net1 = nn.Linear(10, 10, bias=False)
                     self.net2 = nn.Linear(10, 10)
@@ -9430,7 +9430,7 @@ class DistributedTest:
         def _test_ddp_new_tensor_in_fwd(self, static_graph):
             # Test from https://github.com/pytorch/pytorch/issues/60733
             class MyModel(nn.Module):
-                def __init__(self):
+                def __init__(self) -> None:
                     super().__init__()
                     self.fc1 = nn.Linear(10, 10, bias=False)
                     self.fc2 = nn.Linear(10, 10, bias=False)
@@ -9965,7 +9965,7 @@ class DistributedTest:
             torch.cuda.manual_seed(rank)
 
             class NetWithBuffers(nn.Module):
-                def __init__(self):
+                def __init__(self) -> None:
                     super().__init__()
                     self.a = nn.Linear(10, 10, bias=False)
                     self.b = nn.Linear(10, 1, bias=False)
@@ -10002,7 +10002,7 @@ class DistributedTest:
         )
         def test_static_graph_multi_forward(self):
             class Net(nn.Module):
-                def __init__(self):
+                def __init__(self) -> None:
                     super().__init__()
                     self.lin = nn.Linear(10, 10)
                     self.relu = nn.ReLU()
@@ -10084,7 +10084,7 @@ class DistributedTest:
         )
         def test_stateless_api_with_ddp(self):
             class MockModule(torch.nn.Module):
-                def __init__(self):
+                def __init__(self) -> None:
                     super().__init__()
                     self.l1 = torch.nn.Linear(1, 1)
                     buffer = torch.ones(1)
@@ -10131,7 +10131,7 @@ class DistributedTest:
         @skip_if_lt_x_gpu(2)
         def test_ddp_forward_backward_hook(self):
             class DummyTestModel(nn.Module):
-                def __init__(self):
+                def __init__(self) -> None:
                     super().__init__()
                     torch.manual_seed(0)
                     self.fc = nn.Linear(2, 2)
@@ -10391,7 +10391,7 @@ class DistributedTest:
                     return func(*args, **kwargs)
 
             class MyModel(torch.nn.Module):
-                def __init__(self):
+                def __init__(self) -> None:
                     super().__init__()
                     self.fc = torch.nn.Linear(10, 10)
 
diff --git a/torch/testing/_internal/distributed/rpc/examples/reinforcement_learning_rpc_test.py b/torch/testing/_internal/distributed/rpc/examples/reinforcement_learning_rpc_test.py
index 1ea7eace829..5d7e7b1244b 100644
--- a/torch/testing/_internal/distributed/rpc/examples/reinforcement_learning_rpc_test.py
+++ b/torch/testing/_internal/distributed/rpc/examples/reinforcement_learning_rpc_test.py
@@ -44,7 +44,7 @@ class Policy(nn.Module):
     Copying the code to make these two examples independent.
     See https://github.com/pytorch/examples/tree/master/reinforcement_learning
     """
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.affine1 = nn.Linear(4, 128)
         self.dropout = nn.Dropout(p=0.6)
@@ -97,7 +97,7 @@ class Observer:
     select an action. Then, the observer applies the action to its environment
     and reports the reward to the agent.
     """
-    def __init__(self):
+    def __init__(self) -> None:
         self.id = rpc.get_worker_info().id
         self.env = DummyEnv()
         self.env.seed(SEED)
diff --git a/torch/testing/_internal/distributed/rpc/rpc_test.py b/torch/testing/_internal/distributed/rpc/rpc_test.py
index 3a3977d7b89..413f97d94eb 100644
--- a/torch/testing/_internal/distributed/rpc/rpc_test.py
+++ b/torch/testing/_internal/distributed/rpc/rpc_test.py
@@ -144,7 +144,7 @@ def set_and_check_done(value):
 TensorClass = namedtuple("TensorClass", ["tensors"])
 
 class MyPickleClass:
-    def __init__(self):
+    def __init__(self) -> None:
         self.t = None
 
     def __getstate__(self):
@@ -1446,7 +1446,7 @@ class RpcTest(RpcAgentTestFixture, RpcTestCommon):
             world_size=self.world_size)
 
         class MyModel(torch.nn.Module):
-            def __init__(self):
+            def __init__(self) -> None:
                 super().__init__()
                 self.lin = torch.nn.Linear(3, 4)
 
diff --git a/torch/testing/_internal/jit_metaprogramming_utils.py b/torch/testing/_internal/jit_metaprogramming_utils.py
index 8171a959189..02a9fcc5405 100644
--- a/torch/testing/_internal/jit_metaprogramming_utils.py
+++ b/torch/testing/_internal/jit_metaprogramming_utils.py
@@ -604,7 +604,7 @@ def create_script_module(self, nn_module, constructor_args, *args, **kwargs):
         class TheModule(torch.jit.ScriptModule):
             __constants__ = submodule_constants
 
-            def __init__(self):
+            def __init__(self) -> None:
                 super().__init__()
                 self.submodule = nn_module(*constructor_args)
 
diff --git a/torch/testing/_internal/jit_utils.py b/torch/testing/_internal/jit_utils.py
index c0109ecacf7..a8c7fa261f9 100644
--- a/torch/testing/_internal/jit_utils.py
+++ b/torch/testing/_internal/jit_utils.py
@@ -770,7 +770,7 @@ def _get_py3_code(code, fn_name):
         return fn
 
 class TensorExprTestOptions:
-    def __init__(self):
+    def __init__(self) -> None:
         self.old_profiling_executor = torch._C._jit_set_profiling_executor(True)
         self.old_profiling_mode = torch._C._get_graph_executor_optimize(True)
 
diff --git a/torch/utils/_sympy/value_ranges.py b/torch/utils/_sympy/value_ranges.py
index 4a01d8e53b9..29ee1886261 100644
--- a/torch/utils/_sympy/value_ranges.py
+++ b/torch/utils/_sympy/value_ranges.py
@@ -936,7 +936,7 @@ class SymPyValueRangeAnalysis:
 
 
 class ValueRangeAnalysis(SymPyValueRangeAnalysis):
-    def __init__(self):
+    def __init__(self) -> None:
         self.name = "ValueRangeAnalysis"
         boolean_operators = (
             "xor",
diff --git a/torch/utils/data/_utils/worker.py b/torch/utils/data/_utils/worker.py
index b07439526bf..c61b78d42d8 100644
--- a/torch/utils/data/_utils/worker.py
+++ b/torch/utils/data/_utils/worker.py
@@ -28,7 +28,7 @@ if IS_WINDOWS:
     # is gone, and the only way to check it through OS is to let the worker have a process handle
     # of the manager and ask if the process status has changed.
     class ManagerWatchdog:
-        def __init__(self):
+        def __init__(self) -> None:
             self.manager_pid = os.getppid()
 
             # mypy cannot detect this code is windows only
@@ -60,7 +60,7 @@ if IS_WINDOWS:
 else:
 
     class ManagerWatchdog:  # type: ignore[no-redef]
-        def __init__(self):
+        def __init__(self) -> None:
             self.manager_pid = os.getppid()
             self.manager_dead = False
 
diff --git a/torch/utils/module_tracker.py b/torch/utils/module_tracker.py
index 9feef40ca4d..01e966c712b 100644
--- a/torch/utils/module_tracker.py
+++ b/torch/utils/module_tracker.py
@@ -52,7 +52,7 @@ class ModuleTracker:
     A Set containing the fqn for each module currently running their forward
     """
 
-    def __init__(self):
+    def __init__(self) -> None:
         self.parents = {"Global"}
         self._known_modules: weakref.WeakKeyDictionary = weakref.WeakKeyDictionary()
         self._seen_modules: weakref.WeakSet = weakref.WeakSet()