diff --git a/.circleci/scripts/binary_ios_test.sh b/.circleci/scripts/binary_ios_test.sh index 863b21724a5..b9028b4a001 100644 --- a/.circleci/scripts/binary_ios_test.sh +++ b/.circleci/scripts/binary_ios_test.sh @@ -24,6 +24,6 @@ rm cert.txt if ! [ -x "$(command -v xcodebuild)" ]; then echo 'Error: xcodebuild is not installed.' exit 1 -fi +fi PROFILE=PyTorch_CI_2021 ruby ${PROJ_ROOT}/scripts/xcode_build.rb -i ${PROJ_ROOT}/build_ios/install -x ${PROJ_ROOT}/ios/TestApp/TestApp.xcodeproj -p ${IOS_PLATFORM} -c ${PROFILE} -t ${IOS_DEV_TEAM_ID} diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 8a22351d9e2..08e367af8b3 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -40,6 +40,9 @@ jobs: rm -r "shellcheck-${scversion}" shellcheck --version .jenkins/run-shellcheck.sh + - name: Ensure no trailing spaces + run: | + (! git grep -I -l ' $' -- . ':(exclude)**/contrib/**' ':(exclude)third_party' || (echo "The above files have trailing spaces; please remove them"; false)) - name: Ensure no tabs run: | (! git grep -I -l $'\t' -- . ':(exclude)*.svg' ':(exclude)**Makefile' ':(exclude)**/contrib/**' ':(exclude)third_party' ':(exclude).gitattributes' ':(exclude).gitmodules' || (echo "The above files have tabs; please convert them to spaces"; false)) diff --git a/.jenkins/caffe2/bench.sh b/.jenkins/caffe2/bench.sh index be54aaed05d..a72d37688f1 100755 --- a/.jenkins/caffe2/bench.sh +++ b/.jenkins/caffe2/bench.sh @@ -21,7 +21,7 @@ if (( $num_gpus == 0 )); then fi if (( $num_gpus >= 1 )); then "$PYTHON" "$caffe2_pypath/python/examples/imagenet_trainer.py" --train_data null --batch_size 128 --epoch_size 12800 --num_epochs 2 --num_gpus 1 - # Let's skip the fp16 bench runs for now, as it recompiles the miopen kernels and can take 10+min to run. + # Let's skip the fp16 bench runs for now, as it recompiles the miopen kernels and can take 10+min to run. # We can resume when we (1) bindmount the miopen cache folder in jenkins; (2) install the pre-compiled miopen kernel library in the docker # "$PYTHON" "$caffe2_pypath/python/examples/imagenet_trainer.py" --train_data null --batch_size 256 --epoch_size 25600 --num_epochs 2 --num_gpus 1 --float16_compute --dtype float16 fi diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index cb31a7f7ae4..7c95483b5be 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -159,7 +159,7 @@ with `brew install cmake` if you are developing on MacOS or Linux system. check whether your Git local or global config file contains any `submodule.*` settings. If yes, remove them and try again. (please reference [this doc](https://git-scm.com/docs/git-config#Documentation/git-config.txt-submoduleltnamegturl) for more info). - - If you encountered error such as + - If you encountered error such as ``` fatal: unable to access 'https://github.com/pybind11/pybind11.git': could not load PEM client certificate ... ``` @@ -169,11 +169,11 @@ with `brew install cmake` if you are developing on MacOS or Linux system. openssl x509 -noout -in -dates ``` - - If you encountered error that some third_party modules are not checkout correctly, such as + - If you encountered error that some third_party modules are not checkout correctly, such as ``` Could not find .../pytorch/third_party/pybind11/CMakeLists.txt ``` - remove any `submodule.*` settings in your local git config (`.git/config` of your pytorch repo) and try again. + remove any `submodule.*` settings in your local git config (`.git/config` of your pytorch repo) and try again. 
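Note on reproducing the new lint check locally: the `Ensure no trailing spaces` step added to `.github/workflows/lint.yml` above is a plain `git grep`, so the same check (and an optional cleanup pass) can be run before pushing. The grep command below is copied from the workflow; the `sed` cleanup is only a suggested sketch, is not part of this PR, and assumes GNU sed (on macOS, use `sed -i ''` instead of `sed -i`):

    # list tracked, non-binary files that still contain trailing spaces (same exclusions as the CI step)
    git grep -I -l ' $' -- . ':(exclude)**/contrib/**' ':(exclude)third_party'

    # optional cleanup sketch: strip runs of trailing spaces from the files found above (GNU sed assumed)
    git grep -I -l ' $' -- . ':(exclude)**/contrib/**' ':(exclude)third_party' \
      | xargs sed -i 's/ *$//'
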
## Nightly Checkout & Pull diff --git a/GLOSSARY.md b/GLOSSARY.md index bd1df8bdc97..24300e7f722 100644 --- a/GLOSSARY.md +++ b/GLOSSARY.md @@ -1,4 +1,4 @@ -# PyTorch Glossary +# PyTorch Glossary - [PyTorch Glossary](#pytorch-glossary) - [Operation and Kernel](#operation-and-kernel) @@ -39,7 +39,7 @@ For example, this to create Custom Operations. ## Kernel -Implementation of a PyTorch operation, specifying what should be done when an +Implementation of a PyTorch operation, specifying what should be done when an operation executes. ## Compound Operation @@ -57,7 +57,7 @@ Same as Compound Operation. ## Leaf Operation An operation that's considered a basic operation, as opposed to a Compound Operation. Leaf Operation always has dispatch functions defined, usually has a -derivative function defined as well. +derivative function defined as well. ## Device Kernel Device-specific kernel of a leaf operation. @@ -79,4 +79,4 @@ using just-in-time compilation. ## Scripting Using `torch.jit.script` on a function to inspect source code and compile it as -TorchScript code. \ No newline at end of file +TorchScript code. diff --git a/aten/src/ATen/BatchingRegistrations.cpp b/aten/src/ATen/BatchingRegistrations.cpp index 53eec3025d2..0c7ffd0930f 100644 --- a/aten/src/ATen/BatchingRegistrations.cpp +++ b/aten/src/ATen/BatchingRegistrations.cpp @@ -300,7 +300,7 @@ Tensor trace_backward_batching_rule(const Tensor& grad, IntArrayRef input_sizes) auto grad_input = at::zeros(grad_physical.getPhysicalShape(input_sizes), grad.options()); // Batched Diagonal View auto grad_input_diag = at::diagonal(grad_input, /*offset*/0, /*dim1*/-2, /*dim2*/-1); - // Append a dimension of size one to the grad output + // Append a dimension of size one to the grad output auto grad_physical_tensor = grad_physical.tensor().unsqueeze(-1); grad_input_diag.copy_(grad_physical_tensor); return grad_physical.getPhysicalToLogicalMap().apply(grad_input); diff --git a/aten/src/ATen/CPUGeneratorImpl.cpp b/aten/src/ATen/CPUGeneratorImpl.cpp index aab4b4d702c..5f909f0e5b5 100644 --- a/aten/src/ATen/CPUGeneratorImpl.cpp +++ b/aten/src/ATen/CPUGeneratorImpl.cpp @@ -38,7 +38,7 @@ struct CPUGeneratorImplStateLegacy { * new data introduced in at::CPUGeneratorImpl and the legacy state. It is used * as a helper for torch.get_rng_state() and torch.set_rng_state() * functions. - */ + */ struct CPUGeneratorImplState { CPUGeneratorImplStateLegacy legacy_pod; float next_float_normal_sample; @@ -119,7 +119,7 @@ uint64_t CPUGeneratorImpl::seed() { * must be a strided CPU byte tensor and of the same size as either * CPUGeneratorImplStateLegacy (for legacy CPU generator state) or * CPUGeneratorImplState (for new state). - * + * * FIXME: Remove support of the legacy state in the future? 
*/ void CPUGeneratorImpl::set_state(const c10::TensorImpl& new_state) { diff --git a/aten/src/ATen/SparseTensorUtils.h b/aten/src/ATen/SparseTensorUtils.h index 1814930358b..a0f984267a9 100644 --- a/aten/src/ATen/SparseTensorUtils.h +++ b/aten/src/ATen/SparseTensorUtils.h @@ -94,7 +94,7 @@ TORCH_API Tensor flatten_indices(const Tensor& indices, IntArrayRef full_size, b // new_indices = [ 3, 1, 3 ] # uncoalesced TORCH_API Tensor flatten_indices_by_dims(const Tensor& indices, const IntArrayRef& sizes, const IntArrayRef& dims_to_flatten); -// Find the CSR representation for a row `indices` from the COO format +// Find the CSR representation for a row `indices` from the COO format TORCH_API Tensor coo_to_csr(const int64_t* indices, int64_t dim, int64_t nnz); }} // namespace at::sparse diff --git a/aten/src/ATen/Version.cpp b/aten/src/ATen/Version.cpp index fe03506743c..edcb7bfb632 100644 --- a/aten/src/ATen/Version.cpp +++ b/aten/src/ATen/Version.cpp @@ -114,7 +114,7 @@ std::string used_cpu_capability() { case native::CPUCapability::AVX2: ss << "AVX2"; break; -#endif +#endif default: break; } diff --git a/aten/src/ATen/VmapTransforms.h b/aten/src/ATen/VmapTransforms.h index 8fa08524545..7b418c73a91 100644 --- a/aten/src/ATen/VmapTransforms.h +++ b/aten/src/ATen/VmapTransforms.h @@ -47,7 +47,7 @@ using VmapDimVector = SmallVector; // argument. // VmapTransform for operators that take tensors with multiple batch dims. -// Given one or more logical views on Tensors, `logicalToPhysical` +// Given one or more logical views on Tensors, `logicalToPhysical` // permutes all of the batch dims to the front of the tensor, aligns // and expands the batch dims to match each other (according to their `level`), // and returns a VmapPhysicalView on the tensor(s). diff --git a/aten/src/ATen/core/Generator.h b/aten/src/ATen/core/Generator.h index b5bbb2fe3c7..1e6e8d54fa7 100644 --- a/aten/src/ATen/core/Generator.h +++ b/aten/src/ATen/core/Generator.h @@ -143,7 +143,7 @@ namespace detail { /** * Helper function for checking the validity of new random generator * state. Right now following conditions are checked: - * + * * - The new state tensor must be a torch.ByteTensor * - Data of the new state tensor must be contiguous */ diff --git a/aten/src/ATen/core/PhiloxRNGEngine.h b/aten/src/ATen/core/PhiloxRNGEngine.h index 7f43c5f11d1..d075d7dd6fb 100644 --- a/aten/src/ATen/core/PhiloxRNGEngine.h +++ b/aten/src/ATen/core/PhiloxRNGEngine.h @@ -40,13 +40,13 @@ typedef at::detail::Array FLOAT2; * Note that currently this implementation of the philox engine is not used * anywhere except for tests in cpu_generator_test.cpp. However, this engine * will replace curandStatePhilox4_32_10_t in the future. - * + * * The philox engine takes a seed value, a subsequeunce * for starting the generation and an offset for the subsequence. - * Think of this engine as an algorithm producing a huge array. We are - * parallelizing this array by partitioning the huge array and assigning - * a thread index to each partition. In other words, each seed value - * (there are 2^64 possible seed values) gives a sub array of size + * Think of this engine as an algorithm producing a huge array. We are + * parallelizing this array by partitioning the huge array and assigning + * a thread index to each partition. In other words, each seed value + * (there are 2^64 possible seed values) gives a sub array of size * 2^128 (each element in that array is a 128 bit number). 
Reasoning * behind the array being of size 2^128 is, there are 2^64 possible * thread index value and there is an array of size 2^64 for each of @@ -59,9 +59,9 @@ typedef at::detail::Array FLOAT2; * seed: Seed values could be any number from 0 to 2^64-1. * subsequence: Subsequence is just the cuda thread indexing with: * - blockIdx.x * blockDim.x + threadIdx.x - * offset: The offset variable in PhiloxEngine decides how many 128-bit + * offset: The offset variable in PhiloxEngine decides how many 128-bit * random numbers to skip (i.e. how many groups of 4, 32-bit numbers to skip) - * and hence really decides the total number of randoms that can be achieved + * and hence really decides the total number of randoms that can be achieved * for the given subsequence. */ diff --git a/aten/src/ATen/core/op_registration/README.md b/aten/src/ATen/core/op_registration/README.md index b48cd56d938..ffbc5f3b3da 100644 --- a/aten/src/ATen/core/op_registration/README.md +++ b/aten/src/ATen/core/op_registration/README.md @@ -254,5 +254,3 @@ Also, there's some requirements on the operator schema for it to be callable fro * Except for `Tensor` or `Tensor[]`, only arguments of type `int`, `double` and `bool` are supported. These can be in any position in the argument list and will be read from the caffe2 operator arguments, based on the argument name in the operator schema. * We do not support lists (`int[]`, `double[]` or `bool[]`) or optionals (`int?`, `double?`, `bool?`) yet. * The operator must return a single `Tensor` or multiple tensors as in `(Tensor, Tensor, Tensor)`. It cannot return a list `Tensor[]`, optional `Tensor?` or any primitive types. - - diff --git a/aten/src/ATen/core/type.cpp b/aten/src/ATen/core/type.cpp index 87b3ca31921..4b488973235 100644 --- a/aten/src/ATen/core/type.cpp +++ b/aten/src/ATen/core/type.cpp @@ -1124,12 +1124,12 @@ std::string ClassType::getForwardPreHookErrorMessage(int pre_hook_idx) const { const FunctionSchema& forward_schema = getMethod("forward").getSchema(); std::string input_types = getSchemaInputTypesString(forward_schema); const std::vector& forward_args = forward_schema.arguments(); - + std::string single_output = ""; if (forward_args.size() == 2 && forward_args[1].type()->cast() == nullptr) { // if the output type is a single tuple, it needs to be wrapped in an outer tuple - // to match eager's behavior + // to match eager's behavior single_output = ", '" + forward_args[1].type()->annotation_str() + "',"; } std::string pre_hook_schema = @@ -1138,9 +1138,9 @@ std::string ClassType::getForwardPreHookErrorMessage(int pre_hook_idx) const { "This error occured while scripting the forward pre-hook '" + pre_hook_name + "' on module '" + name()->name() + "'. If you did not want to script this pre-hook remove it from the " - "original NN module before scripting. Pre-hooks for module '" + - name()->name() + "' are expected to have the following signature: " - + pre_hook_schema + " with a return type of either 'None'" + + "original NN module before scripting. 
Pre-hooks for module '" + + name()->name() + "' are expected to have the following signature: " + + pre_hook_schema + " with a return type of either 'None'" + single_output + " or 'Tuple[" + input_types + "]'."; return return_string; } @@ -1148,7 +1148,7 @@ std::string ClassType::getForwardPreHookErrorMessage(int pre_hook_idx) const { std::string ClassType::getForwardHookErrorMessage(int hook_idx) const { const std::string& hook_name = forward_hooks_[hook_idx]->name(); const FunctionSchema& forward_schema = getMethod("forward").getSchema(); - std::string input_types = getSchemaInputTypesString(forward_schema); + std::string input_types = getSchemaInputTypesString(forward_schema); // create expected output types string const Argument& pre_output = @@ -1160,33 +1160,33 @@ std::string ClassType::getForwardHookErrorMessage(int hook_idx) const { std::string hook_schema = hook_name + "(self, input: Tuple[" + input_types + "], output: " + output_types + ")"; std::string return_string = - "This error occured while scripting the forward hook '" + "This error occured while scripting the forward hook '" + hook_name + "' on module " + name()->name() + ". If you did not want to script this hook remove it from" + " the original NN module before scripting. This hook was" + " expected to have the following signature: " + hook_schema + - ". The type of the output arg is the returned type from" + - " either the forward method or the previous hook if it exists. " + - "Note that hooks can return anything, but if the hook is " + + ". The type of the output arg is the returned type from" + + " either the forward method or the previous hook if it exists. " + + "Note that hooks can return anything, but if the hook is " + "on a submodule the outer module is expecting" + " the same return type as the submodule's forward."; return return_string; } void checkForwardHookInputArguments( - const FunctionSchema& forward_schema, - const FunctionSchema& hook_schema, - const std::string& hook_id, + const FunctionSchema& forward_schema, + const FunctionSchema& hook_schema, + const std::string& hook_id, const std::string& hook_err_msg) { // check for proper tuple input types const std::vector& forward_args = forward_schema.arguments(); const Argument input_arg = hook_schema.arguments()[1]; TORCH_CHECK( - input_arg.type()->cast() != nullptr, + input_arg.type()->cast() != nullptr, hook_id, "expected the input argument to be typed as a Tuple but found type: '", - input_arg.type()->annotation_str(), - "' instead.\n", + input_arg.type()->annotation_str(), + "' instead.\n", hook_err_msg ); @@ -1229,7 +1229,7 @@ void checkForwardHookInputArguments( } void ClassType::checkForwardPreHookSchema( - int pre_hook_idx, + int pre_hook_idx, const FunctionSchema& pre_hook_schema) const { const torch::jit::Function* pre_hook = forward_pre_hooks_[pre_hook_idx]; std::string hook_id = @@ -1261,7 +1261,7 @@ void ClassType::checkForwardPreHookSchema( pre_hook_err_msg ); const Argument return_arg = pre_hook_schema.returns()[0]; - std::string wrong_type_returned_err_msg = hook_id + + std::string wrong_type_returned_err_msg = hook_id + "returned the wrong type of: '" + return_arg.type()->annotation_str() + "'."; @@ -1269,9 +1269,9 @@ void ClassType::checkForwardPreHookSchema( return; } if (forward_args.size() == 2 && *forward_args[1].type() == *return_arg.type()) { - // TORCH_CHECK below is for the edge case where forward's input is a tuple and the + // TORCH_CHECK below is for the edge case where forward's input is a tuple and the // pre-hook returns 
a matching tuple. Eager doesn't support this- the working eager return - // for a tuple type is the forward's input tuple wrapped inside of another tuple. + // for a tuple type is the forward's input tuple wrapped inside of another tuple. TORCH_CHECK( return_arg.type()->cast() == nullptr, wrong_type_returned_err_msg, @@ -1316,7 +1316,7 @@ void ClassType::checkForwardPreHookSchema( for (int i = 1; i < forward_args.size(); ++i) { if (*forward_args[i].type() != *return_tuple_types[i - 1]) { TORCH_CHECK( - false, + false, wrong_type_returned_err_msg, " The returned tuple contains the wrong inner types.\n", pre_hook_err_msg); @@ -1325,7 +1325,7 @@ void ClassType::checkForwardPreHookSchema( } void ClassType::checkForwardHookSchema( - int hook_idx, + int hook_idx, const FunctionSchema& hook_schema) const { const torch::jit::Function* hook = forward_hooks_[hook_idx]; std::string hook_id = @@ -1388,8 +1388,8 @@ torch::jit::Function& ClassType::getMethod(const std::string& name) const { torch::jit::Function* ClassType::findHook(const std::string& name) const { auto hook = findForwardHook(name); if (hook == nullptr) { - hook = findForwardPreHook(name); - } + hook = findForwardPreHook(name); + } return hook; } diff --git a/aten/src/ATen/cpu/vec256/vec256_double.h b/aten/src/ATen/cpu/vec256/vec256_double.h index 75a423d62fe..f7f6d594d35 100644 --- a/aten/src/ATen/cpu/vec256/vec256_double.h +++ b/aten/src/ATen/cpu/vec256/vec256_double.h @@ -113,7 +113,7 @@ public: const auto not_nan_mask = _mm256_cmp_pd(values, values, _CMP_EQ_OQ); const auto nan_mask = _mm256_cmp_pd(not_nan_mask, zero_vec, _CMP_EQ_OQ); const auto pi = _mm256_set1_pd(c10::pi); - + const auto neg_mask = _mm256_cmp_pd(values, zero_vec, _CMP_LT_OQ); auto angle = _mm256_blendv_pd(zero_vec, pi, neg_mask); angle = _mm256_blendv_pd(angle, nan_vec, nan_mask); diff --git a/aten/src/ATen/cpu/vec256/vec256_float.h b/aten/src/ATen/cpu/vec256/vec256_float.h index 62786beef57..39dac1a5ecd 100644 --- a/aten/src/ATen/cpu/vec256/vec256_float.h +++ b/aten/src/ATen/cpu/vec256/vec256_float.h @@ -120,7 +120,7 @@ public: const auto not_nan_mask = _mm256_cmp_ps(values, values, _CMP_EQ_OQ); const auto nan_mask = _mm256_cmp_ps(not_nan_mask, zero_vec, _CMP_EQ_OQ); const auto pi = _mm256_set1_ps(c10::pi); - + const auto neg_mask = _mm256_cmp_ps(values, zero_vec, _CMP_LT_OQ); auto angle = _mm256_blendv_ps(zero_vec, pi, neg_mask); angle = _mm256_blendv_ps(angle, nan_vec, nan_mask); diff --git a/aten/src/ATen/cpu/vec256/vsx/vec256_complex_double_vsx.h b/aten/src/ATen/cpu/vec256/vsx/vec256_complex_double_vsx.h index f62ac36850b..86133586112 100644 --- a/aten/src/ATen/cpu/vec256/vsx/vec256_complex_double_vsx.h +++ b/aten/src/ATen/cpu/vec256/vsx/vec256_complex_double_vsx.h @@ -364,7 +364,7 @@ class Vec256 { } Vec256 sqrt() const { - return map(std::sqrt); + return map(std::sqrt); } Vec256 reciprocal() const { diff --git a/aten/src/ATen/cpu/vec256/vsx/vec256_complex_float_vsx.h b/aten/src/ATen/cpu/vec256/vsx/vec256_complex_float_vsx.h index 751cba0ae51..c14acbb24e9 100644 --- a/aten/src/ATen/cpu/vec256/vsx/vec256_complex_float_vsx.h +++ b/aten/src/ATen/cpu/vec256/vsx/vec256_complex_float_vsx.h @@ -417,7 +417,7 @@ class Vec256 { } Vec256 sqrt() const { - return map(std::sqrt); + return map(std::sqrt); } Vec256 reciprocal() const { diff --git a/aten/src/ATen/cpu/vec256/vsx/vec256_double_vsx.h b/aten/src/ATen/cpu/vec256/vsx/vec256_double_vsx.h index f4fa4faa801..7f7b067cb6d 100644 --- a/aten/src/ATen/cpu/vec256/vsx/vec256_double_vsx.h +++ 
b/aten/src/ATen/cpu/vec256/vsx/vec256_double_vsx.h @@ -82,7 +82,7 @@ class Vec256 { blend(const Vec256& a, const Vec256& b) { return { a._vec0, b._vec1 }; } - + template static std::enable_if_t> C10_ALWAYS_INLINE @@ -206,7 +206,7 @@ class Vec256 { for (int i = 0; i < size()/2; i++) { ret._vec0[i] = f(_vec0[i], other._vec0[i]); } - for (int i = 0; i < size()/2; i++) { + for (int i = 0; i < size()/2; i++) { ret._vec1[i] = f(_vec1[i], other._vec1[i]); } return ret; @@ -314,7 +314,7 @@ class Vec256 { Vec256 C10_ALWAYS_INLINE sqrt() const { return {vec_sqrt(_vec0), vec_sqrt(_vec1)}; } - Vec256 C10_ALWAYS_INLINE reciprocal() const { + Vec256 C10_ALWAYS_INLINE reciprocal() const { return { vec_div(vd_one, _vec0), // vec_re(_vec0) is estimated one. vec_div(vd_one, _vec1)}; diff --git a/aten/src/ATen/cpu/vec256/vsx/vec256_qint32_vsx.h b/aten/src/ATen/cpu/vec256/vsx/vec256_qint32_vsx.h index a47e295ce03..b47988e2eaa 100644 --- a/aten/src/ATen/cpu/vec256/vsx/vec256_qint32_vsx.h +++ b/aten/src/ATen/cpu/vec256/vsx/vec256_qint32_vsx.h @@ -134,11 +134,11 @@ struct Vec256 { Vec256 vf0 = rhs[0]; vfloat32 vecf0 = vf0.vec0(); - vfloat32 vecf1 = vf0.vec1(); + vfloat32 vecf1 = vf0.vec1(); vecf0 = vec_mul(vecf0, inverse_scale_v); vecf1 = vec_mul(vecf1, inverse_scale_v); vecf0 = vec_add(vec_rint(vecf0), vec_zero_point); - vecf1 = vec_add(vec_rint(vecf1), vec_zero_point); + vecf1 = vec_add(vec_rint(vecf1), vec_zero_point); vint32 veci0 = vec_signed(vecf0); vint32 veci1 = vec_signed(vecf1); @@ -171,7 +171,7 @@ struct Vec256 { float multiplier, int32_t zero_point) { const vint32 vmin = vec_splats(std::numeric_limits::min()); - const vint32 vmax = vec_splats(std::numeric_limits::max()); + const vint32 vmax = vec_splats(std::numeric_limits::max()); vfloat32 vec_mult = vec_splats(multiplier); vint32 vec_zero_point = vec_splats(zero_point); Vec256 vi = inp[0]; diff --git a/aten/src/ATen/cpu/vec256/vsx/vec256_qint8_vsx.h b/aten/src/ATen/cpu/vec256/vsx/vec256_qint8_vsx.h index f8b6eced60e..620351df9ad 100644 --- a/aten/src/ATen/cpu/vec256/vsx/vec256_qint8_vsx.h +++ b/aten/src/ATen/cpu/vec256/vsx/vec256_qint8_vsx.h @@ -337,7 +337,7 @@ struct Vec256 { vint32 veci4 = vec_signed(vecf4); vint32 veci5 = vec_signed(vecf5); vint32 veci6 = vec_signed(vecf6); - vint32 veci7 = vec_signed(vecf7); + vint32 veci7 = vec_signed(vecf7); veci0 = vec_add(veci0, vec_zero_point); veci1 = vec_add(veci1, vec_zero_point); @@ -348,7 +348,7 @@ struct Vec256 { veci5 = vec_add(veci5, vec_zero_point); veci6 = vec_add(veci6, vec_zero_point); veci7 = vec_add(veci7, vec_zero_point); - + vint16 vecshi0 = vec_packs(veci0, veci1); vint16 vecshi1 = vec_packs(veci2, veci3); vint16 vecshi2 = vec_packs(veci4, veci5); diff --git a/aten/src/ATen/cpu/vec256/vsx/vec256_quint8_vsx.h b/aten/src/ATen/cpu/vec256/vsx/vec256_quint8_vsx.h index 96809ce3259..3015a42386d 100644 --- a/aten/src/ATen/cpu/vec256/vsx/vec256_quint8_vsx.h +++ b/aten/src/ATen/cpu/vec256/vsx/vec256_quint8_vsx.h @@ -345,8 +345,8 @@ struct Vec256 { vint32 veci4 = vec_signed(vecf4); vint32 veci5 = vec_signed(vecf5); vint32 veci6 = vec_signed(vecf6); - vint32 veci7 = vec_signed(vecf7); - + vint32 veci7 = vec_signed(vecf7); + veci0 = vec_add(veci0, vec_zero_point); veci1 = vec_add(veci1, vec_zero_point); veci2 = vec_add(veci2, vec_zero_point); @@ -356,11 +356,11 @@ struct Vec256 { veci5 = vec_add(veci5, vec_zero_point); veci6 = vec_add(veci6, vec_zero_point); veci7 = vec_add(veci7, vec_zero_point); - + vint16 vecshi0 = vec_packs(veci0, veci1); vint16 vecshi1 = vec_packs(veci2, veci3); vint16 vecshi2 = 
vec_packs(veci4, veci5); - vint16 vecshi3 = vec_packs(veci6, veci7); + vint16 vecshi3 = vec_packs(veci6, veci7); vuint8 vec0 = vec_packsu(vecshi0, vecshi1); vuint8 vec1 = vec_packsu(vecshi2, vecshi3); diff --git a/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp b/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp index 42e212c1814..145efe01502 100644 --- a/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp +++ b/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp @@ -193,7 +193,7 @@ void CUDAGeneratorImpl::set_state(const c10::TensorImpl& new_state) { } else { TORCH_CHECK(new_state_size == total_size, "RNG state is wrong size"); } - + uint64_t input_seed; auto new_rng_state = new_state.data(); memcpy(&input_seed, new_rng_state + states_size, seed_size); diff --git a/aten/src/ATen/native/AffineGridGenerator.cpp b/aten/src/ATen/native/AffineGridGenerator.cpp index cbb7137dd2a..fc5b22324ea 100644 --- a/aten/src/ATen/native/AffineGridGenerator.cpp +++ b/aten/src/ATen/native/AffineGridGenerator.cpp @@ -3,7 +3,7 @@ namespace at { namespace native { -at::Tensor linspace_from_neg_one(const Tensor& grid, int64_t num_steps, +at::Tensor linspace_from_neg_one(const Tensor& grid, int64_t num_steps, bool align_corners) { if (num_steps <= 1) { return at::tensor(0, grid.options()); diff --git a/aten/src/ATen/native/AveragePool2d.cpp b/aten/src/ATen/native/AveragePool2d.cpp index e7834fcf339..f09f8bfa701 100644 --- a/aten/src/ATen/native/AveragePool2d.cpp +++ b/aten/src/ATen/native/AveragePool2d.cpp @@ -96,7 +96,7 @@ void avg_pool2d_out_cpu_template( Tensor &output, const Tensor &input_, IntArrayRef kernel_size, - IntArrayRef stride, + IntArrayRef stride, IntArrayRef padding, bool ceil_mode, bool count_include_pad, diff --git a/aten/src/ATen/native/CPUBlas.cpp b/aten/src/ATen/native/CPUBlas.cpp index 293d6789872..1a1f6737f23 100644 --- a/aten/src/ATen/native/CPUBlas.cpp +++ b/aten/src/ATen/native/CPUBlas.cpp @@ -11,11 +11,11 @@ extern "C" void zgemm_(char *transa, char *transb, int *m, int *n, int *k, void #endif // AT_BUILD_WITH_BLAS() #if AT_BUILD_WITH_BLAS() -extern "C" void cswap_(int *n, const void *x, int *incx, void *y, int *incy); +extern "C" void cswap_(int *n, const void *x, int *incx, void *y, int *incy); extern "C" void dcopy_(int *n, const double *x, int *incx, double *y, int *incy); extern "C" void scopy_(int *n, const float *x, int *incx, float *y, int *incy); -extern "C" void zcopy_(int *n, const void *x, int *incx, void *y, int *incy); -extern "C" void ccopy_(int *n, const void *x, int *incx, void *y, int *incy); +extern "C" void zcopy_(int *n, const void *x, int *incx, void *y, int *incy); +extern "C" void ccopy_(int *n, const void *x, int *incx, void *y, int *incy); extern "C" void daxpy_(int *n, double *a, const double *x, int *incx, double *y, int *incy); extern "C" void saxpy_(int *n, float *a, const float *x, int *incx, float *y, int *incy); extern "C" void caxpy_(int *n, void *a, const void *x, int *incx, void *y, int *incy); @@ -279,7 +279,7 @@ void axpy(int64_t n, double a, const double *x, int64_t incx, double *y, int64_t daxpy_(&i_n, &a, x, &i_incx, y, &i_incy); return; } - #endif + #endif axpy_stub( kCPU, at::kDouble, n, a, x, incx, y, incy); @@ -300,7 +300,7 @@ void axpy(int64_t n, float a, const float *x, int64_t incx, float *y, int64_t in saxpy_(&i_n, &a, x, &i_incx, y, &i_incy); return; } - #endif + #endif axpy_stub( kCPU, at::kFloat, n, a, x, incx, y, incy); @@ -321,7 +321,7 @@ void axpy(int64_t n, c10::complex a, const c10::complex *x, int6 zaxpy_(&i_n, &a, x, &i_incx, y, &i_incy); return; } - #endif + #endif 
axpy_stub( kCPU, at::kComplexDouble, n, a, x, incx, y, incy); @@ -342,7 +342,7 @@ void axpy(int64_t n, c10::complex a, const c10::complex *x, int64_ caxpy_(&i_n, &a, x, &i_incx, y, &i_incy); return; } - #endif + #endif axpy_stub( kCPU, at::kComplexFloat, n, a, x, incx, y, incy); @@ -364,7 +364,7 @@ void copy(int64_t n, const double *x, int64_t incx, double *y, int64_t incy) { dcopy_(&i_n, x, &i_incx, y, &i_incy); return; } - #endif + #endif copy_stub( kCPU, at::kDouble, n, x, incx, y, incy); @@ -384,7 +384,7 @@ void copy(int64_t n, const float *x, int64_t incx, float *y, int64_t incy) { scopy_(&i_n, x, &i_incx, y, &i_incy); return; } - #endif + #endif copy_stub( kCPU, at::kFloat, n, x, incx, y, incy); @@ -404,7 +404,7 @@ void copy(int64_t n, const c10::complex *x, int64_t incx, c10::complex *x, int64_t incx, c10::complex ix = 0 for zero padding, but in bicubic ix = floor(x) = -1 // There would be more problem in reflection padding, since the -1 and +1 direction is not fixed in boundary condition diff --git a/aten/src/ATen/native/Im2Col.cpp b/aten/src/ATen/native/Im2Col.cpp index d59e3a3bf16..743f18c00c9 100644 --- a/aten/src/ATen/native/Im2Col.cpp +++ b/aten/src/ATen/native/Im2Col.cpp @@ -10,7 +10,7 @@ namespace at { namespace native { namespace { - + static void im2col_out_cpu_template( Tensor& output, const Tensor& input_, diff --git a/aten/src/ATen/native/LossMulti.h b/aten/src/ATen/native/LossMulti.h index 4282c346702..54226e888f4 100644 --- a/aten/src/ATen/native/LossMulti.h +++ b/aten/src/ATen/native/LossMulti.h @@ -56,7 +56,7 @@ namespace { nframe = input.size(0); dim = input.size(1); } - + TORCH_CHECK( valid_inputs, "Expected non-empty vector or matrix with optional 0-dim batch size, but got: ", diff --git a/aten/src/ATen/native/LossMultiLabelMargin.cpp b/aten/src/ATen/native/LossMultiLabelMargin.cpp index e30839afca9..3cd0f46e0a9 100644 --- a/aten/src/ATen/native/LossMultiLabelMargin.cpp +++ b/aten/src/ATen/native/LossMultiLabelMargin.cpp @@ -40,7 +40,7 @@ inline scalar_t multilabel_margin_loss_forward_inner_sum_cpu( } } } - + return sum; } @@ -103,7 +103,7 @@ static void multilabel_margin_loss_forward_out_cpu_template( int64_t reduction) { auto target_arg = TensorArg(target, "target", 2); int64_t nframe, dim; - const int64_t ndims = input.dim(); + const int64_t ndims = input.dim(); if (ndims <= 1) { nframe = 1; dim = ndims == 0 ? 
1 : input.size(0); @@ -113,7 +113,7 @@ static void multilabel_margin_loss_forward_out_cpu_template( dim = input.size(1); } multilabel_margin_loss_shape_check(nframe, dim, ndims, target_arg, input, target); - + // special case target.dim() <= 1: produce scalar output for scalar inputs // even if reduction == Reduction::None if (reduction != Reduction::None || target.dim() <= 1) { @@ -228,12 +228,12 @@ static void multilabel_margin_loss_backward_out_cpu_template( multilabel_margin_loss_shape_check(nframe, dim, ndims, target_arg, input, target); checkSameSize(c, target_arg, is_target_arg); - + grad_input.resize_as_(input); if (grad_input.numel() == 0) { return; } - + TORCH_CHECK(grad_input.is_contiguous(), "grad_input must be contiguous"); grad_input.zero_(); diff --git a/aten/src/ATen/native/LossMultiMargin.cpp b/aten/src/ATen/native/LossMultiMargin.cpp index 7bc7f1fcf72..ef7ee517b74 100644 --- a/aten/src/ATen/native/LossMultiMargin.cpp +++ b/aten/src/ATen/native/LossMultiMargin.cpp @@ -205,7 +205,7 @@ void multi_margin_loss_backward_out_cpu_template( int64_t nframe, dim; auto target_arg = TensorArg(target, "target", 2); const auto ndims = input.dim(); - + TORCH_CHECK(p == 1 || p == 2, "only p == 1 and p == 2 supported"); multi_margin_loss_shape_check(nframe, dim, ndims, target_arg, input, target); diff --git a/aten/src/ATen/native/LossNLL2d.cpp b/aten/src/ATen/native/LossNLL2d.cpp index 7a33d12a072..5ba9358edea 100644 --- a/aten/src/ATen/native/LossNLL2d.cpp +++ b/aten/src/ATen/native/LossNLL2d.cpp @@ -153,7 +153,7 @@ static void nll_loss2d_forward_out_frame( for (int64_t b = 0; b < batch_size; b++) { for (int64_t elem = 0; elem < map_size; elem++) { const int64_t cur_target = target_data[b * map_size + elem]; - + if (cur_target == ignore_index) { continue; } @@ -284,7 +284,7 @@ static void nll_loss2d_backward_out_frame( for (int64_t b = start; b < end; b++) { for (int64_t elem = 0; elem < map_size; elem++) { const int64_t cur_target = target_data[b * map_size + elem]; - + if (cur_target == ignore_index) { continue; } diff --git a/aten/src/ATen/native/Pool.h b/aten/src/ATen/native/Pool.h index 60d7f8a419d..3e06bb8fa4d 100644 --- a/aten/src/ATen/native/Pool.h +++ b/aten/src/ATen/native/Pool.h @@ -68,7 +68,7 @@ pool2d_shape_check( TORCH_CHECK(dilationH > 0 && dilationW > 0, "dilation should be greater than zero, but got ", "dilationH: ", dilationH, " dilationW: ", dilationW); - + bool valid_dims = input.size(1) != 0 && input.size(2) != 0; if (memory_format == at::MemoryFormat::ChannelsLast){ // Expect tensor in NHWC format and allow 0-dim only for N. @@ -81,7 +81,7 @@ pool2d_shape_check( "Expected 3D or 4D (batch mode) tensor with optional 0 dim batch size for input, but got:", input.sizes()); } - + TORCH_CHECK(kW/2 >= padW && kH/2 >= padH, "pad should be smaller than or equal to half of kernel size, but got ", "padW = ", padW, ", padH = ", padH, ", kW = ", kW, ", kH = ", kH); diff --git a/aten/src/ATen/native/Pow.h b/aten/src/ATen/native/Pow.h index f195bddb6f3..50d105394da 100644 --- a/aten/src/ATen/native/Pow.h +++ b/aten/src/ATen/native/Pow.h @@ -16,7 +16,7 @@ namespace native { #endif // integral power in pytorch allows for negative exponents, giving truncated integral results. -// e.g. since 2**-1==0.5, the truncated integral result is zero. 1**negative_exponent is the +// e.g. since 2**-1==0.5, the truncated integral result is zero. 1**negative_exponent is the // only non-zero result. 
template ::value, T>::type* = nullptr> diff --git a/aten/src/ATen/native/ReplicationPadding.cpp b/aten/src/ATen/native/ReplicationPadding.cpp index ec88b170f5f..0fb7dfbe5e8 100644 --- a/aten/src/ATen/native/ReplicationPadding.cpp +++ b/aten/src/ATen/native/ReplicationPadding.cpp @@ -219,7 +219,7 @@ Tensor& replication_pad1d_backward_out_cpu_template( gradInput.resize_as_(input); if (gradInput.numel() == 0) { return gradInput; - } + } gradInput.zero_(); /* backprop */ @@ -522,7 +522,7 @@ Tensor& replication_pad2d_backward_out_cpu_template( if (gradInput.numel() == 0) { return gradInput; } - + gradInput.zero_(); /* backprop */ diff --git a/aten/src/ATen/native/StridedRandomAccessor.h b/aten/src/ATen/native/StridedRandomAccessor.h index 32a6c011744..bb7b2155cd3 100644 --- a/aten/src/ATen/native/StridedRandomAccessor.h +++ b/aten/src/ATen/native/StridedRandomAccessor.h @@ -143,7 +143,7 @@ public: return (ptr - other.ptr) / stride; } // } - + // Comparison operators { C10_HOST_DEVICE bool operator==(const ConstStridedRandomAccessor& other) const { @@ -175,7 +175,7 @@ public: return !(*this < other); } // } - + protected: PtrType ptr; index_t stride; @@ -186,7 +186,7 @@ template < typename index_t = int64_t, template class PtrTraits = DefaultPtrTraits > -class StridedRandomAccessor +class StridedRandomAccessor : public ConstStridedRandomAccessor { public: using difference_type = index_t; diff --git a/aten/src/ATen/native/TensorTransformations.h b/aten/src/ATen/native/TensorTransformations.h index aaacc0941a1..356134666d5 100644 --- a/aten/src/ATen/native/TensorTransformations.h +++ b/aten/src/ATen/native/TensorTransformations.h @@ -12,7 +12,7 @@ namespace native { static inline void flip_check_errors(int64_t total_dims, int64_t flip_dims_size, IntArrayRef dims) { if (flip_dims_size==0) { return; - } + } // check if number of axis in dim is valid if (flip_dims_size < 0 || flip_dims_size > total_dims) { TORCH_CHECK_INDEX(false, "flip dims size out of range, got flip dims size=", flip_dims_size); diff --git a/aten/src/ATen/native/cpu/GridSamplerKernel.cpp b/aten/src/ATen/native/cpu/GridSamplerKernel.cpp index ece2d527e89..ef3418ff677 100644 --- a/aten/src/ATen/native/cpu/GridSamplerKernel.cpp +++ b/aten/src/ATen/native/cpu/GridSamplerKernel.cpp @@ -844,14 +844,14 @@ struct ApplyGridSample iVec(-1)) & (ix < iVec(inp_W)); auto mask_y = must_in_bound ? iVec(-1) : (iy > iVec(-1)) & (iy < iVec(inp_H)); auto mask = cast(mask_x & mask_y); - + auto offset = iy * iVec(inp_sH) + ix * iVec(inp_sW); auto val = mask_gather(Vec(0), data, offset, mask); return val; } - inline void add_value_bounded(scalar_t* data, int64_t len, const Vec& x, const Vec&y, + inline void add_value_bounded(scalar_t* data, int64_t len, const Vec& x, const Vec&y, const Vec& delta) const { auto ix = convert_to_int_of_same_size(compute_W.compute_coordinates(x)); @@ -860,7 +860,7 @@ struct ApplyGridSample iVec(-1)) & (ix < iVec(inp_W)); auto mask_y = must_in_bound ? 
iVec(-1) : (iy > iVec(-1)) & (iy < iVec(inp_H)); auto mask = cast(mask_x & mask_y); - + auto i_gInp_offset = iy * iVec(inp_W) + ix; integer_t i_gInp_offset_arr[iVec::size()]; i_gInp_offset.store(i_gInp_offset_arr); @@ -899,7 +899,7 @@ struct ApplyGridSample std::vector compute_indices_weights_linear( - int64_t input_size, int64_t output_size, int64_t stride, int64_t ndims, int64_t reshape_dim, + int64_t input_size, int64_t output_size, int64_t stride, int64_t ndims, int64_t reshape_dim, bool align_corners, const c10::optional opt_scale ) { @@ -516,7 +516,7 @@ std::vector compute_indices_weights_linear( new_shape[reshape_dim] = output_size; output.emplace_back(empty(new_shape, CPU(at::kLong))); - output.emplace_back(empty(new_shape, CPU(c10::CppTypeToScalarType()))); + output.emplace_back(empty(new_shape, CPU(c10::CppTypeToScalarType()))); output.emplace_back(empty(new_shape, CPU(at::kLong))); output.emplace_back(empty(new_shape, CPU(c10::CppTypeToScalarType()))); @@ -524,7 +524,7 @@ std::vector compute_indices_weights_linear( auto lambda0_ptr = output[1].data_ptr(); auto input_index1_ptr = output[2].data_ptr(); auto lambda1_ptr = output[3].data_ptr(); - + for (int64_t i=0; i( @@ -543,7 +543,7 @@ std::vector compute_indices_weights_linear( } // Upsampling linear interpolation kernel for N-d case. -// Input is assumed to be like NCHW, NCL, NCKHW - interpolated spatial dimension +// Input is assumed to be like NCHW, NCL, NCKHW - interpolated spatial dimension // are those from the end up to batch size N and number of channels C. // // Internally, it uses TensorIterator to optimize the computations. @@ -588,8 +588,8 @@ void upsample_linearNd_kernel_impl( .declare_static_dtype_and_device(input.scalar_type(), input.device()) .add_output(output) .add_input(restrided_input); - - for (auto iter=indices_weights.begin(); iter!=indices_weights.end(); iter++) { + + for (auto iter=indices_weights.begin(); iter!=indices_weights.end(); iter++) { for (auto& tensor : *iter) { config.add_input(tensor); } diff --git a/aten/src/ATen/native/cpu/batch_norm_kernel.cpp b/aten/src/ATen/native/cpu/batch_norm_kernel.cpp index ad720db1895..fa7df5d27cf 100644 --- a/aten/src/ATen/native/cpu/batch_norm_kernel.cpp +++ b/aten/src/ATen/native/cpu/batch_norm_kernel.cpp @@ -71,7 +71,7 @@ void batch_norm_cpu_inference_contiguous_impl(Tensor& output, if (image_size != 1) { const int64_t n_offset = n_channel * image_size; const int64_t loop_size = image_size - (image_size % Vec::size()); - for (int64_t n = 0; n < n_batch; n++) { + for (int64_t n = 0; n < n_batch; n++) { for (int64_t c = 0; c < n_channel; c++) { const Vec alpha_vec(alpha_data[c]); const Vec beta_vec(beta_data[c]); diff --git a/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu b/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu index 7fe8177707b..6e58596ca30 100644 --- a/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu +++ b/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu @@ -1894,7 +1894,7 @@ AT_ERROR("triangular_solve: MAGMA library not found in " magma_int_t n = magma_int_cast(A.size(-2), "A.size(-2)"); magma_int_t nrhs = magma_int_cast(b.size(-1), "b.size(-1)"); // magma returns early if m <= 0 || n <= 0 for magmaTriangularSolveBatched - // magmaTriangularSolve is calling cuBLAS and it prints + // magmaTriangularSolve is calling cuBLAS and it prints // ** On entry to DTRSM parameter number 9 had an illegal value // so let's use proper lda parameter here magma_int_t lda = std::max(1, n); @@ -2282,7 +2282,7 @@ std::tuple _syevd_helper_cuda(const Tensor& self, bool 
compute_e bool upper = uplo == 'U' ? true : false; return _symeig_helper_cuda(self, compute_eigenvectors, upper); } - + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ svd ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ template diff --git a/aten/src/ATen/native/cuda/BatchLinearAlgebraLib.cu b/aten/src/ATen/native/cuda/BatchLinearAlgebraLib.cu index 46df7c366e7..d60decacb79 100644 --- a/aten/src/ATen/native/cuda/BatchLinearAlgebraLib.cu +++ b/aten/src/ATen/native/cuda/BatchLinearAlgebraLib.cu @@ -50,7 +50,7 @@ static void apply_batched_inverse_lib(Tensor& self, Tensor& self_inv, Tensor& in auto& allocator = *::c10::cuda::CUDACachingAllocator::get(); - // Heuristic: For small batch size or large matrix size, we use for-loop to iterate over the batches instead of + // Heuristic: For small batch size or large matrix size, we use for-loop to iterate over the batches instead of // calling the batched cublas routine. if (batch_size <= 8 || /* batch_size > 8 && */ n >= 512) { for (int64_t i = 0; i < batch_size; i++) { diff --git a/aten/src/ATen/native/cuda/CuFFTPlanCache.h b/aten/src/ATen/native/cuda/CuFFTPlanCache.h index 28fcd0bafb1..d512c5c908a 100644 --- a/aten/src/ATen/native/cuda/CuFFTPlanCache.h +++ b/aten/src/ATen/native/cuda/CuFFTPlanCache.h @@ -260,7 +260,7 @@ public: #ifdef __HIP_PLATFORM_HCC__ // clone input to avoid issues with hipfft clobering the input and failing tests - clone_input = true; + clone_input = true; #else clone_input = false; #endif diff --git a/aten/src/ATen/native/cuda/ForeachBinaryOpList.cu b/aten/src/ATen/native/cuda/ForeachBinaryOpList.cu index cdfc0d0abec..0fcd4f549a8 100644 --- a/aten/src/ATen/native/cuda/ForeachBinaryOpList.cu +++ b/aten/src/ATen/native/cuda/ForeachBinaryOpList.cu @@ -20,9 +20,9 @@ std::vector foreach_tensor_list_op(TensorList tensors1, TensorList tenso AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kBFloat16, kHalf, tensors1[0].scalar_type(), "foreach_binary_op_list_cuda", [&]() { using opmath_t = get_opmath_t::opmath_t; multi_tensor_apply<3>(tensor_lists, - BinaryOpListAlphaFunctor(), Op(), alpha.to()); @@ -40,9 +40,9 @@ void foreach_tensor_list_op_(TensorList tensors1, TensorList tensors2, Scalar al AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kBFloat16, kHalf, tensors1[0].scalar_type(), "foreach_binary_op_list_cuda_", [&]() { using opmath_t = get_opmath_t::opmath_t; multi_tensor_apply<2>(tensor_lists, - BinaryOpListAlphaFunctor(), Op(), alpha.to()); diff --git a/aten/src/ATen/native/cuda/ForeachBinaryOpScalar.cu b/aten/src/ATen/native/cuda/ForeachBinaryOpScalar.cu index 43372055dbb..a617dde3b78 100644 --- a/aten/src/ATen/native/cuda/ForeachBinaryOpScalar.cu +++ b/aten/src/ATen/native/cuda/ForeachBinaryOpScalar.cu @@ -19,9 +19,9 @@ std::vector foreach_binary_op(TensorList tensors, Scalar scalar) { AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kBFloat16, kHalf, tensors[0].scalar_type(), "foreach_binary_op_scalar_cuda", [&]() { using opmath_t = get_opmath_t::opmath_t; multi_tensor_apply<2>(tensor_lists, - BinaryOpScalarFunctor(), Op(), scalar.to()); @@ -37,9 +37,9 @@ void foreach_binary_op_(TensorList tensors, Scalar scalar) { AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kBFloat16, kHalf, tensors[0].scalar_type(), "foreach_binary_op_scalar_cuda_", [&]() { using opmath_t = get_opmath_t::opmath_t; multi_tensor_apply<1>(tensor_lists, - BinaryOpScalarFunctor(), Op(), scalar.to()); diff --git a/aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu b/aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu index 66dc749c03a..97754ec374f 100644 --- 
a/aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu +++ b/aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu @@ -20,9 +20,9 @@ std::vector foreach_binary_op(TensorList tensors, at::ArrayRef s using opmath_t = get_opmath_t::opmath_t; multi_tensor_apply<2, opmath_t>(tensor_lists, scalars, - BinaryOpScalarListFunctor(), Op()); @@ -39,9 +39,9 @@ void foreach_binary_op_(TensorList tensors, at::ArrayRef scalars) { using opmath_t = get_opmath_t::opmath_t; multi_tensor_apply<1, opmath_t>(tensor_lists, scalars, - BinaryOpScalarListFunctor(), Op()); }); diff --git a/aten/src/ATen/native/cuda/ForeachPointwiseOp.cu b/aten/src/ATen/native/cuda/ForeachPointwiseOp.cu index d8e6f18f417..64a620e4503 100644 --- a/aten/src/ATen/native/cuda/ForeachPointwiseOp.cu +++ b/aten/src/ATen/native/cuda/ForeachPointwiseOp.cu @@ -22,9 +22,9 @@ std::vector foreach_pointwise_op(TensorList input, TensorList tensors1, AT_DISPATCH_ALL_TYPES_AND(kHalf, input[0].scalar_type(), "foreach_pointwise_op_cuda", [&]() { using opmath_t = get_opmath_t::opmath_t; multi_tensor_apply<4>(tensor_lists, - PointwiseOpScalarFunctor(), Op(), scalar.to()); @@ -43,9 +43,9 @@ void foreach_pointwise_op_(TensorList input, TensorList tensors1, TensorList ten AT_DISPATCH_ALL_TYPES_AND(kHalf, input[0].scalar_type(), "foreach_pointwise_op__cuda", [&]() { using opmath_t = get_opmath_t::opmath_t; multi_tensor_apply<3>(tensor_lists, - PointwiseOpScalarFunctor(), Op(), scalar.to()); @@ -64,9 +64,9 @@ void foreach_pointwise_op_(TensorList input, TensorList tensors1, TensorList ten using opmath_t = get_opmath_t::opmath_t; multi_tensor_apply<3, opmath_t>(tensor_lists, scalars, - PointwiseOpScalarListFunctor(), Op()); }); @@ -91,9 +91,9 @@ std::vector foreach_pointwise_op(TensorList input, TensorList tensors1, using opmath_t = get_opmath_t::opmath_t; multi_tensor_apply<4, opmath_t>(tensor_lists, scalars, - PointwiseOpScalarListFunctor(), Op()); }); diff --git a/aten/src/ATen/native/cuda/ForeachUnaryOp.cu b/aten/src/ATen/native/cuda/ForeachUnaryOp.cu index 03d6601c097..758dbb7e7a5 100644 --- a/aten/src/ATen/native/cuda/ForeachUnaryOp.cu +++ b/aten/src/ATen/native/cuda/ForeachUnaryOp.cu @@ -33,7 +33,7 @@ template class Op> void foreach_unary_op_(Te multi_tensor_apply<1>(tensor_lists, UnaryOpFunctor(), Op()); } @@ -230,7 +230,7 @@ void foreach_tensor_neg_cuda_(TensorList tensors) { } // Abs have to go via slow path in case of a complex type. -// This is because foreach kernels can't return a different dtype than passed, while +// This is because foreach kernels can't return a different dtype than passed, while // abs with complex inputs will produce float output. template struct Abs { @@ -283,7 +283,7 @@ void foreach_tensor_zero_cuda_(TensorList tensors) { multi_tensor_apply<1>(tensor_lists, ZeroFunctor()); }); } diff --git a/aten/src/ATen/native/cuda/GridSampler.cuh b/aten/src/ATen/native/cuda/GridSampler.cuh index 0c4acd1be41..3897f3b10a2 100644 --- a/aten/src/ATen/native/cuda/GridSampler.cuh +++ b/aten/src/ATen/native/cuda/GridSampler.cuh @@ -142,14 +142,14 @@ scalar_t reflect_coordinates_set_grad(scalar_t in, int twice_low, int twice_high } } -template -static __forceinline__ __device__ +template +static __forceinline__ __device__ scalar_t safe_downgrade_to_int_range(scalar_t x){ - // -100.0 does not have special meaning. This is just to make sure - // it's not within_bounds_2d or within_bounds_3d, and does not cause - // undefined behavior. See #35506. 
- if (x > INT_MAX-1 || x < INT_MIN || !::isfinite(static_cast(x))) - return static_cast(-100.0); + // -100.0 does not have special meaning. This is just to make sure + // it's not within_bounds_2d or within_bounds_3d, and does not cause + // undefined behavior. See #35506. + if (x > INT_MAX-1 || x < INT_MIN || !::isfinite(static_cast(x))) + return static_cast(-100.0); return x; } @@ -219,7 +219,7 @@ scalar_t grid_sampler_compute_source_index_set_grad( *grad_in = (*grad_in) * grad_refl * grad_clip; } - coord = safe_downgrade_to_int_range(coord); + coord = safe_downgrade_to_int_range(coord); return coord; } @@ -244,7 +244,7 @@ scalar_t get_value_bounded( y = compute_coordinates(y, H, padding_mode, align_corners); int ix = static_cast(x); - int iy = static_cast(y); + int iy = static_cast(y); if (within_bounds_2d(iy, ix, H, W)) { return data[iy * sH + ix * sW]; @@ -284,7 +284,7 @@ void add_value_bounded( y = compute_coordinates(y, H, padding_mode, align_corners); int ix = static_cast(x); - int iy = static_cast(y); + int iy = static_cast(y); safe_add_2d(data, iy, ix, sH, sW, H, W, delta); } diff --git a/aten/src/ATen/native/cuda/Indexing.cu b/aten/src/ATen/native/cuda/Indexing.cu index 32997a056f7..f2b0d50c3e3 100644 --- a/aten/src/ATen/native/cuda/Indexing.cu +++ b/aten/src/ATen/native/cuda/Indexing.cu @@ -978,7 +978,7 @@ Tensor & masked_fill__cuda(Tensor& self, const Tensor & mask, Scalar value) { .add_output(self) .add_input(self) .add_input(b_mask) - .build(); + .build(); if (b_mask.dtype() == at::ScalarType::Byte) { TORCH_WARN("masked_fill_ received a mask with dtype torch.uint8, this behavior is now deprecated," \ diff --git a/aten/src/ATen/native/cuda/Loss.cu b/aten/src/ATen/native/cuda/Loss.cu index 4c5718061eb..04ce9eaa8cd 100644 --- a/aten/src/ATen/native/cuda/Loss.cu +++ b/aten/src/ATen/native/cuda/Loss.cu @@ -58,7 +58,7 @@ Tensor kl_div_backward_cuda(const Tensor& grad, const Tensor& input, const Tenso }); }); } - else { + else { grad_input = -at::exp(target) * grad; if (reduction == at::Reduction::Mean) { grad_input /= input.numel(); diff --git a/aten/src/ATen/native/cuda/MiscUtils.h b/aten/src/ATen/native/cuda/MiscUtils.h index 8baa0703d5e..2f0712e5eb5 100644 --- a/aten/src/ATen/native/cuda/MiscUtils.h +++ b/aten/src/ATen/native/cuda/MiscUtils.h @@ -91,7 +91,7 @@ struct MagmaStreamSyncGuard { static inline int cuda_int_cast(int64_t value, const char* varname) { auto result = static_cast(value); - TORCH_CHECK(static_cast(result) == value, + TORCH_CHECK(static_cast(result) == value, "cuda_int_cast: The value of ", varname, "(", (long long)value, ") is too large to fit into a int (", sizeof(int), " bytes)"); return result; diff --git a/aten/src/ATen/native/cuda/Normalization.cuh b/aten/src/ATen/native/cuda/Normalization.cuh index 76b353f5f7f..567b74ca43c 100644 --- a/aten/src/ATen/native/cuda/Normalization.cuh +++ b/aten/src/ATen/native/cuda/Normalization.cuh @@ -555,7 +555,7 @@ __global__ void batch_norm_backward_elemt_kernel( const GenericPackedTensorAccessor sum_dy_xmu, GenericPackedTensorAccessor grad_input, const int* __restrict__ numel, const int world_size) { - + int64_t div = 0; for (int i = 0; i < world_size; i ++) { div += numel[i]; @@ -955,7 +955,7 @@ std::tuple batch_norm_update_stats_cuda_template( } // welford kernel for c last tensor calculating mean/biased_variance/unbiased_variance -// original apex name: welford_kernel_c_last +// original apex name: welford_kernel_c_last template class VarTransform, typename scalar_t, @@ -1632,7 +1632,7 @@ at::Tensor 
batch_norm_backward_elemt_channels_last_cuda_template( }); } C10_CUDA_KERNEL_LAUNCH_CHECK(); - + return grad_input; } diff --git a/aten/src/ATen/native/cuda/PowKernel.cu b/aten/src/ATen/native/cuda/PowKernel.cu index 3926b350db2..ebb45ef09df 100644 --- a/aten/src/ATen/native/cuda/PowKernel.cu +++ b/aten/src/ATen/native/cuda/PowKernel.cu @@ -24,7 +24,7 @@ namespace { // applied to the result of the inline function, and thus the result is incorrect. // e.g. if we use 1.0 / sqrt(2) for 2 ^ (-0.5) in MSVC, we get // int(2 ^ (-0.5)) = int(1.0 / sqrt(2)) = int(1.0 / int(1.414)) = int(1.0 / 1) = 1 -// However, the correct result is +// However, the correct result is // int(2 ^ (-0.5)) = int(1.0 / 1.414) = 0 #ifdef _MSC_VER // Functions for pow diff --git a/aten/src/ATen/native/cuda/ReduceMinMaxKernel.cu b/aten/src/ATen/native/cuda/ReduceMinMaxKernel.cu index 6839cbbe4cb..c374797a733 100644 --- a/aten/src/ATen/native/cuda/ReduceMinMaxKernel.cu +++ b/aten/src/ATen/native/cuda/ReduceMinMaxKernel.cu @@ -119,14 +119,14 @@ static void _aminmax_kernel_impl( const Tensor& self, int64_t dim, bool keepdim) { - at::TensorIterator iter = make_reduction("_aminmax", min_result, + at::TensorIterator iter = make_reduction("_aminmax", min_result, max_result, self, dim, keepdim, self.scalar_type()); AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBool, self.scalar_type(), "_aminmax_cuda", [&]() { gpu_reduce_kernel( iter, MinMaxOps{}, thrust::pair( - at::numeric_limits::upper_bound(), + at::numeric_limits::upper_bound(), at::numeric_limits::lower_bound() ) ); diff --git a/aten/src/ATen/native/cudnn/Conv_v7.cpp b/aten/src/ATen/native/cudnn/Conv_v7.cpp index 117f86504cc..6fd648e6699 100644 --- a/aten/src/ATen/native/cudnn/Conv_v7.cpp +++ b/aten/src/ATen/native/cudnn/Conv_v7.cpp @@ -553,7 +553,7 @@ static inline void split_batch_dim_to_32bit_out( const at::Tensor& input, const at::Tensor& weight, IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32, + bool benchmark, bool deterministic, bool allow_tf32, int64_t max_worksize, func_t func_32bit) { constexpr int64_t int_max = std::numeric_limits::max(); const int64_t ni = input.numel(); diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8conv/4x8-aarch32-neon.S b/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8conv/4x8-aarch32-neon.S index c6bb5072014..75eab4a1c30 100644 --- a/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8conv/4x8-aarch32-neon.S +++ b/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8conv/4x8-aarch32-neon.S @@ -28,7 +28,7 @@ # |out ch indx| 16 # |params | 20 # |-----------| -# +# # After loading w pointer in ip reg. 
# And after pushing r4-r8 and d8-d15 on stack @@ -42,7 +42,7 @@ # |out ch indx| 112 # |params | 116 # |-----------| -# +# # void pytorch_q8conv_ukernel_4x8__aarch32_neon( # size_t mr, diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8conv/8x8-aarch64-neon.S b/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8conv/8x8-aarch64-neon.S index d035a7c782e..95d0a2ca8eb 100644 --- a/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8conv/8x8-aarch64-neon.S +++ b/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8conv/8x8-aarch64-neon.S @@ -16,8 +16,8 @@ # x2: kc # x3: ks # x4: a -# x5: w -# x6: c +# x5: w +# x6: c # x7: c_stride # diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm/4x8-aarch32-neon.S b/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm/4x8-aarch32-neon.S index 52af6092a91..8fbea6498dc 100644 --- a/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm/4x8-aarch32-neon.S +++ b/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm/4x8-aarch32-neon.S @@ -28,7 +28,7 @@ # |out ch indx| 16 # |params | 20 # |-----------| -# +# # After loading w pointer in ip reg. # And after pushing r4-r9 and d8-d15 on stack @@ -42,7 +42,7 @@ # |out ch indx| 104 # |params | 108 # |-----------| -# +# # # New Struct for pytorch_qnnp_conv_quantization_params diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm/4x8-dq-aarch32-neon.S b/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm/4x8-dq-aarch32-neon.S index e17b38f5833..de564d9d3d5 100644 --- a/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm/4x8-dq-aarch32-neon.S +++ b/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm/4x8-dq-aarch32-neon.S @@ -41,7 +41,7 @@ # |out ch indx| 16 # |params | 20 # |-----------| -# +# # After loading w pointer in ip reg. # And after pushing r4-r8 and d8-d15 on stack @@ -56,7 +56,7 @@ # |out ch indx| 100 # |params | 104 # |-----------| -# +# # void pytorch_q8gemm_ukernel_4x8__aarch32_neon( # size_t mr, diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm/8x8-aarch64-neon.S b/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm/8x8-aarch64-neon.S index d1e9f316bc2..52913d75286 100644 --- a/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm/8x8-aarch64-neon.S +++ b/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm/8x8-aarch64-neon.S @@ -15,8 +15,8 @@ # x2: k # x3: a # x4: a_stride -# x5: w -# x6: c +# x5: w +# x6: c # x7: c_stride # diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm_sparse/4x4-packA-aarch32-neon.S b/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm_sparse/4x4-packA-aarch32-neon.S index 15a0d3b63f3..f1dd0a2cc05 100644 --- a/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm_sparse/4x4-packA-aarch32-neon.S +++ b/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm_sparse/4x4-packA-aarch32-neon.S @@ -21,15 +21,15 @@ # |----------------| # |packed_a | 0 # |----------------| -# +# # After loading w pointer in ip reg. # And after pushing r4-r9 and d8-d15 on stack # |----------------| -# |r4 - r11 | 0 +# |r4 - r11 | 0 # |packed_a | 32 # |----------------| -# +# # Packed A format. # 4kx4m blocks for alls blocks given 4 rows (4m) are placed in contiguous memory. @@ -42,7 +42,7 @@ # | | Thus Packed A has (K + 4 - 1)/4 * (M + 4 -1)/4 blocks # | | # |---------------------| -# +# # Each 4 x 4 blocks is transposed and stored. 
# Each of the (K + 4 - 1)/4 blocks for a given group of 4 m blocks # are stored adjacent in memory diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm_sparse/4x8c1x4-dq-packedA-aarch32-neon.S b/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm_sparse/4x8c1x4-dq-packedA-aarch32-neon.S index fd1ed124f5f..1d545734f6d 100644 --- a/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm_sparse/4x8c1x4-dq-packedA-aarch32-neon.S +++ b/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm_sparse/4x8c1x4-dq-packedA-aarch32-neon.S @@ -20,7 +20,7 @@ ## Stack # 4 a_stride # 4 packed_w -# 4 w_row_ptr +# 4 w_row_ptr # 4 w_block_ids_ptr # 4 b # 4 c @@ -43,7 +43,7 @@ # |out ch indx | 24 # |params | 28 # |----------------| -# +# # After loading w pointer in ip reg. # And after pushing r4-r9 and d8-d15 on stack @@ -58,7 +58,7 @@ # |out ch indx | 120 # |params | 124 # |----------------| -# +# # void pytorch_q8gemm_dq_sparse_1x4_ukernel_4x8_packedA__aarch32_neon( # size_t mr, @@ -223,7 +223,7 @@ k_loop: # Each iteration produce 4 values each of 4 bytes # Thus 4 x 4 = 16 bytes 2^4 # In this implementation, first value will be stored at - # 1st value: sp - 12 - r1 * 16 + # 1st value: sp - 12 - r1 * 16 # 2nd value: sp - 12 - (r1 - 1) * 16 # and so on. SUB r9, r9, r1, LSL #4 diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm_sparse/4x8c8x1-dq-packedA-aarch32-neon.S b/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm_sparse/4x8c8x1-dq-packedA-aarch32-neon.S index 6952de38c23..109307d082d 100644 --- a/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm_sparse/4x8c8x1-dq-packedA-aarch32-neon.S +++ b/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm_sparse/4x8c8x1-dq-packedA-aarch32-neon.S @@ -20,7 +20,7 @@ ## Stack # 4 a_stride # 4 packed_w -# 4 w_row_ptr +# 4 w_row_ptr # 4 w_block_ids_ptr # 4 b # 4 c @@ -43,7 +43,7 @@ # |out ch indx | 24 # |params | 28 # |----------------| -# +# # After loading w pointer in ip reg. # And after pushing r4-r9 and d8-d15 on stack @@ -58,7 +58,7 @@ # |out ch indx | 120 # |params | 124 # |----------------| -# +# # void pytorch_q8gemm_dq_sparse_8x1_ukernel_4x8_packedA__aarch32_neon( # size_t mr, diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm_sparse/8x4-packA-aarch32-neon.S b/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm_sparse/8x4-packA-aarch32-neon.S index 4eb6fd5f069..bff19de739b 100644 --- a/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm_sparse/8x4-packA-aarch32-neon.S +++ b/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm_sparse/8x4-packA-aarch32-neon.S @@ -21,15 +21,15 @@ # |----------------| # |packed_a | 0 # |----------------| -# +# # After loading w pointer in ip reg. # And after pushing r4-r9 and d8-d15 on stack # |----------------| -# |r4 - r11 | 0 +# |r4 - r11 | 0 # |packed_a | 32 # |----------------| -# +# # Packed A format. # 8kx4m blocks for alls blocks given 4 rows (4m) are placed in contiguous memory. @@ -42,7 +42,7 @@ # | | Thus Packed A has (K + 4 - 1)/4 * (M + 8 -1)/8 blocks # | | # |---------------------| -# +# # Each 8 x 4 blocks is transposed and stored. 
# Each of the (K + 4 - 1)/4 blocks for a given group of 8 m blocks # are stored adjacent in memory diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm_sparse/8x4-packA-aarch64-neon.S b/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm_sparse/8x4-packA-aarch64-neon.S index aa76d47dfe4..4cd788cf583 100644 --- a/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm_sparse/8x4-packA-aarch64-neon.S +++ b/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm_sparse/8x4-packA-aarch64-neon.S @@ -19,7 +19,7 @@ # | | Thus Packed A has (K + 4 - 1)/4 * (M + 8 -1)/8 blocks # | | # |---------------------| -# +# # Each 8 x 4 blocks is transposed and stored. # Each of the (K + 4 - 1)/4 blocks for a given group of 8 m blocks # are stored adjacent in memory diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm_sparse/8x8c1x4-dq-packedA-aarch64-neon.S b/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm_sparse/8x8c1x4-dq-packedA-aarch64-neon.S index db6013c6a43..375581ec3fe 100644 --- a/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm_sparse/8x8c1x4-dq-packedA-aarch64-neon.S +++ b/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm_sparse/8x8c1x4-dq-packedA-aarch64-neon.S @@ -278,7 +278,7 @@ k_loop: # v10 : x10, x11, x12, x13 # v12 : x20, x21, x22, x23 # v14 : x30, x31, x32, x33 - # Then using + # Then using # TRANSPOSE_4X4_S32 v16, v18, v20, v22, v4, v5, v6, v7 # We get # v16 : x04, x05, x06, x07 diff --git a/aten/src/ATen/native/sparse/SoftMax.cpp b/aten/src/ATen/native/sparse/SoftMax.cpp index 6070faf635c..7f26acd8714 100644 --- a/aten/src/ATen/native/sparse/SoftMax.cpp +++ b/aten/src/ATen/native/sparse/SoftMax.cpp @@ -635,7 +635,7 @@ Tensor _sparse_log_softmax(const Tensor& input_, const int64_t dim_, c10::option namedinference::propagate_names(result, input_); return result; } - + Tensor _sparse_log_softmax(const Tensor& self, Dimname dim, optional dtype) { return at::_sparse_log_softmax(self, dimname_to_position(self, dim), dtype); } diff --git a/aten/src/ATen/native/sparse/SparseTensor.cpp b/aten/src/ATen/native/sparse/SparseTensor.cpp index bf4f6da7930..da99e2a65c7 100644 --- a/aten/src/ATen/native/sparse/SparseTensor.cpp +++ b/aten/src/ATen/native/sparse/SparseTensor.cpp @@ -541,8 +541,8 @@ Tensor sparse_mask_helper_cpu( `t` - coalesced sparse tensor input `mask_indices` - mask indices tensor - Note: The nnz in the output tensor will be same as the `mask_indices`. So it will - works independently if the mask is coalesced or not. + Note: The nnz in the output tensor will be same as the `mask_indices`. So it will + works independently if the mask is coalesced or not. 
*/ TORCH_CHECK(t.is_sparse(), "t: input is not a sparse tensor"); TORCH_CHECK(t.is_coalesced(), "t: input is uncoalesced"); @@ -554,7 +554,7 @@ Tensor sparse_mask_helper_cpu( auto t_v = t._values(); auto vsize = t_v.sizes().vec(); vsize[0] = r_nnz; - + Tensor r_values = at::zeros(vsize, t_v.options()); auto t_i = t._indices(); auto t_nnz = t._nnz(); @@ -583,7 +583,7 @@ Tensor sparse_mask_helper_cpu( } } }); - return r_values; + return r_values; } }} // namespace at::native diff --git a/aten/src/ATen/native/sparse/SparseTensorMath.cpp b/aten/src/ATen/native/sparse/SparseTensorMath.cpp index d76bb822421..c5328ef4acb 100644 --- a/aten/src/ATen/native/sparse/SparseTensorMath.cpp +++ b/aten/src/ATen/native/sparse/SparseTensorMath.cpp @@ -1116,7 +1116,7 @@ SparseTensor& _sspaddmm_out_cpu( "sspaddmm: Argument #1: Expected dim 1 size ", dim_k, ", got ", t.size(1)); int64_t nnz = sparse._nnz(); - // We have to make indices contiguous as we use indices.data_ptr in _to_csr which assumes row-contiguous storage + // We have to make indices contiguous as we use indices.data_ptr in _to_csr which assumes row-contiguous storage Tensor indices = sparse._indices().contiguous(); Tensor values = sparse._values(); diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cu b/aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cu index bd627091a16..cbe5dda4c23 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cu @@ -144,8 +144,8 @@ void csrmm2( TORCH_CUDASPARSE_CHECK(cusparseCreateDnMat( &descC, /* output */ m, n, ldc, /* rows, cols, leading dimension */ - c, /* values */ - cusparse_value_type, /* data type of values */ + c, /* values */ + cusparse_value_type, /* data type of values */ CUSPARSE_ORDER_COL /* memory layout, ONLY column-major is supported now */ )); diff --git a/aten/src/ATen/nnapi/codegen.py b/aten/src/ATen/nnapi/codegen.py index a24823da6f7..a572d0be5d9 100755 --- a/aten/src/ATen/nnapi/codegen.py +++ b/aten/src/ATen/nnapi/codegen.py @@ -105,7 +105,7 @@ def main(argv): out_dir = pathlib.Path(__file__).parent (out_dir / "nnapi_wrapper.h").write_text( - PREFIX + + PREFIX + textwrap.dedent("""\ #ifndef NNAPI_WRAPPER_H_ #define NNAPI_WRAPPER_H_ @@ -124,7 +124,7 @@ def main(argv): ) (out_dir / "nnapi_wrapper.cpp").write_text( - PREFIX + + PREFIX + textwrap.dedent("""\ #ifndef _WIN32 #include diff --git a/aten/src/ATen/nnapi/nnapi_bind.cpp b/aten/src/ATen/nnapi/nnapi_bind.cpp index 9e652290ab4..81d8ecb6e05 100644 --- a/aten/src/ATen/nnapi/nnapi_bind.cpp +++ b/aten/src/ATen/nnapi/nnapi_bind.cpp @@ -140,7 +140,7 @@ struct NnapiCompilation : torch::jit::CustomClassHolder { } check_nnapi->Execution_compute(execution); - + // TODO: Maybe skip this for fixed-size outputs? 
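(Illustrative aside, not part of the patch.) The `_sspaddmm_out_cpu` comment above notes that `indices` must be made contiguous because `_to_csr` assumes row-contiguous storage. The conversion it refers to is the usual COO-to-CSR step: per-entry row ids become cumulative row pointers. A minimal sketch of that step under the assumption of sorted (coalesced) row indices, not the actual `_to_csr` implementation:

```python
import torch

def coo_rows_to_csr(row_indices: torch.Tensor, n_rows: int) -> torch.Tensor:
    """Turn sorted COO row indices into CSR row pointers (crow_indices)."""
    counts = torch.bincount(row_indices, minlength=n_rows)  # entries per row
    crow = torch.zeros(n_rows + 1, dtype=torch.int64)
    crow[1:] = torch.cumsum(counts, dim=0)                  # prefix sums -> row pointers
    return crow

# row ids of a 4-row sparse matrix with nnz = 5
rows = torch.tensor([0, 0, 1, 3, 3])
print(coo_rows_to_csr(rows, n_rows=4))   # tensor([0, 2, 3, 3, 5])
```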
for (size_t i = 0; i < outputs.size(); i++) { auto& t = outputs[i]; diff --git a/aten/src/ATen/test/cpu_generator_test.cpp b/aten/src/ATen/test/cpu_generator_test.cpp index e6b1aa1c671..921211cbf7b 100644 --- a/aten/src/ATen/test/cpu_generator_test.cpp +++ b/aten/src/ATen/test/cpu_generator_test.cpp @@ -18,7 +18,7 @@ TEST(CPUGeneratorImpl, TestGeneratorDynamicCast) { } TEST(CPUGeneratorImpl, TestDefaultGenerator) { - // Test Description: + // Test Description: // Check if default generator is created only once // address of generator should be same in all calls auto foo = at::detail::getDefaultCPUGenerator(); @@ -27,7 +27,7 @@ TEST(CPUGeneratorImpl, TestDefaultGenerator) { } TEST(CPUGeneratorImpl, TestCloning) { - // Test Description: + // Test Description: // Check cloning of new generators. // Note that we don't allow cloning of other // generator states into default generators. @@ -47,9 +47,9 @@ void thread_func_get_engine_op(CPUGeneratorImpl* generator) { } TEST(CPUGeneratorImpl, TestMultithreadingGetEngineOperator) { - // Test Description: + // Test Description: // Check CPUGeneratorImpl is reentrant and the engine state - // is not corrupted when multiple threads request for + // is not corrupted when multiple threads request for // random samples. // See Note [Acquire lock when using random generators] auto gen1 = at::detail::createCPUGenerator(); @@ -74,7 +74,7 @@ TEST(CPUGeneratorImpl, TestMultithreadingGetEngineOperator) { } TEST(CPUGeneratorImpl, TestGetSetCurrentSeed) { - // Test Description: + // Test Description: // Test current seed getter and setter // See Note [Acquire lock when using random generators] auto foo = at::detail::getDefaultCPUGenerator(); @@ -92,7 +92,7 @@ void thread_func_get_set_current_seed(Generator generator) { } TEST(CPUGeneratorImpl, TestMultithreadingGetSetCurrentSeed) { - // Test Description: + // Test Description: // Test current seed getter and setter are thread safe // See Note [Acquire lock when using random generators] auto gen1 = at::detail::getDefaultCPUGenerator(); @@ -107,7 +107,7 @@ TEST(CPUGeneratorImpl, TestMultithreadingGetSetCurrentSeed) { } TEST(CPUGeneratorImpl, TestRNGForking) { - // Test Description: + // Test Description: // Test that state of a generator can be frozen and // restored // See Note [Acquire lock when using random generators] @@ -124,7 +124,7 @@ TEST(CPUGeneratorImpl, TestRNGForking) { ASSERT_EQ(target_value.sum().item(), forked_value.sum().item()); } -/** +/** * Philox CPU Engine Tests */ @@ -208,7 +208,7 @@ TEST(CPUGeneratorImpl, TestMT19937EngineReproducibility) { // Test Description: // Tests if same inputs give same results when compared // to std. 
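(Illustrative aside, not part of the patch; the MT19937 test body continues below.) The `TestRNGForking` cases touched above check that a generator's state can be frozen and restored so a forked generator replays the same draws. The same behaviour is visible from Python through the public RNG-state API; this is only an analogue of the C++ test, not the test itself.

```python
import torch

torch.manual_seed(2021)
frozen = torch.get_rng_state()   # freeze the default CPU generator's state

current = torch.randn(1000)      # draw once from the live generator

torch.set_rng_state(frozen)      # "fork": restore the frozen state
forked = torch.randn(1000)       # the forked draw replays the same stream

assert torch.equal(current, forked)
```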
- + // test with zero seed at::mt19937 engine1(0); std::mt19937 engine2(0); @@ -231,5 +231,5 @@ TEST(CPUGeneratorImpl, TestMT19937EngineReproducibility) { for(int i = 0; i < 10000; i++) { ASSERT_EQ(engine1(), engine2()); } - + } diff --git a/aten/src/ATen/test/cuda_generator_test.cu b/aten/src/ATen/test/cuda_generator_test.cu index f78f1fcde8e..e79a5404be7 100644 --- a/aten/src/ATen/test/cuda_generator_test.cu +++ b/aten/src/ATen/test/cuda_generator_test.cu @@ -80,7 +80,7 @@ __global__ void testEngineOffset2(){ unsigned long long increment_val = ::ldexp(1.0, 64); at::Philox4_32_10 engine1(123, 0, increment_val); at::Philox4_32_10 engine2(123, increment_val, increment_val); - + engine2.incr_n(increment_val); engine2.incr(); assert(engine1() == engine2()); @@ -166,7 +166,7 @@ TEST(CUDAGeneratorImpl, TestGeneratorDynamicCast) { } TEST(CUDAGeneratorImpl, TestDefaultGenerator) { - // Test Description: + // Test Description: // Check if default generator state is created only once // address of generator should be same in all calls if (!at::cuda::is_available()) return; @@ -186,7 +186,7 @@ TEST(CUDAGeneratorImpl, TestDefaultGenerator) { } TEST(CUDAGeneratorImpl, TestCloning) { - // Test Description: + // Test Description: // Check cloning of new generators. // Note that we don't allow cloning of other // generator states into default generators. @@ -211,9 +211,9 @@ void thread_func_get_set_current_seed(Generator generator) { current_seed++; generator.set_current_seed(current_seed); } - + TEST(CUDAGeneratorImpl, TestMultithreadingGetSetCurrentSeed) { - // Test Description: + // Test Description: // Test current seed getter and setter are thread safe // See Note [Acquire lock when using random generators] if (!at::cuda::is_available()) return; @@ -229,7 +229,7 @@ TEST(CUDAGeneratorImpl, TestMultithreadingGetSetCurrentSeed) { } TEST(CUDAGeneratorImpl, TestRNGForking) { - // Test Description: + // Test Description: // Test that state of a generator can be frozen and // restored // See Note [Acquire lock when using random generators] diff --git a/aten/src/TH/vector/VSX.cpp b/aten/src/TH/vector/VSX.cpp index e4646ad2976..1925627d090 100644 --- a/aten/src/TH/vector/VSX.cpp +++ b/aten/src/TH/vector/VSX.cpp @@ -765,7 +765,7 @@ int main() test_THDoubleVector_fill_VSX(); test_THFloatVector_fill_VSX(); - + test_THDoubleVector_muls_VSX(); test_THFloatVector_muls_VSX(); diff --git a/aten/src/TH/vector/simd.h b/aten/src/TH/vector/simd.h index a2fa3d466ed..c5862b08911 100644 --- a/aten/src/TH/vector/simd.h +++ b/aten/src/TH/vector/simd.h @@ -93,7 +93,7 @@ static inline uint32_t detectHostSIMDExtensions() } #endif - + #elif defined(__EMSCRIPTEN__) static inline uint32_t detectHostSIMDExtensions() diff --git a/aten/src/THC/THCAtomics.cuh b/aten/src/THC/THCAtomics.cuh index eb1a4f34d36..314451136f3 100644 --- a/aten/src/THC/THCAtomics.cuh +++ b/aten/src/THC/THCAtomics.cuh @@ -294,9 +294,9 @@ inline __device__ at::BFloat16 gpuAtomicMul(at::BFloat16 * address, at::BFloat16 return AtomicFPOp()(address, val, [](at::BFloat16 bsum, at::BFloat16 val) { return THCNumerics::mul(bsum, val); - }); + }); } - + inline __device__ double gpuAtomicMul(double * address, double val) { return AtomicFPOp()(address, val, [](double val, unsigned long long int assumed) { diff --git a/benchmarks/cpp/tensorexpr/bench_reduce.cpp b/benchmarks/cpp/tensorexpr/bench_reduce.cpp index 06bc9b05517..36ae6dc0fad 100644 --- a/benchmarks/cpp/tensorexpr/bench_reduce.cpp +++ b/benchmarks/cpp/tensorexpr/bench_reduce.cpp @@ -270,7 +270,7 @@ 
BENCHMARK_DEFINE_F(Reduce1D, TeSplitTail)(benchmark::State& state) { te::For* tail; loop.splitWithTail(m, kChunkSize, &mo, &mi, &tail); } - + loop.prepareForCodegen(); te::Stmt* s = loop.root_stmt(); s = te::IRSimplifier::simplify(s); @@ -313,7 +313,7 @@ BENCHMARK_DEFINE_F(Reduce1D, TeSplitMask)(benchmark::State& state) { te::For* mi; loop.splitWithMask(m, kChunkSize, &mo, &mi); } - + loop.prepareForCodegen(); te::Stmt* s = loop.root_stmt(); s = te::IRSimplifier::simplify(s); @@ -369,7 +369,7 @@ BENCHMARK_DEFINE_F(Reduce1D, TeRfactorV1)(benchmark::State& state) { auto bt_body = te::NodeFinder::find(loop.root_stmt())[0]; loop.rfactor(bt_body, mi->var()); } - + loop.prepareForCodegen(); te::Stmt* s = loop.root_stmt(); s = te::IRSimplifier::simplify(s); @@ -419,7 +419,7 @@ BENCHMARK_DEFINE_F(Reduce1D, TeRfactorV2)(benchmark::State& state) { { // Look for the new For and vectorize, but rfactor didn't return the newly added "For *". - // Resort to a hack to find the lost "For *". + // Resort to a hack to find the lost "For *". // TODO: make it easier to find the transformed loop after rfactor. auto loops = te::NodeFinder::find(loop.root_stmt()); TORCH_CHECK(loops.size() == 4); diff --git a/benchmarks/distributed/rpc/rl/agent.py b/benchmarks/distributed/rpc/rl/agent.py index 4f55bdef849..9fdacbf348a 100644 --- a/benchmarks/distributed/rpc/rl/agent.py +++ b/benchmarks/distributed/rpc/rl/agent.py @@ -163,7 +163,7 @@ class AgentBase: r""" Finishes the episode Args: - rets (list): List containing rewards generated by selct action calls during + rets (list): List containing rewards generated by selct action calls during episode run """ return self.agent_latency, self.agent_throughput diff --git a/benchmarks/distributed/rpc/rl/coordinator.py b/benchmarks/distributed/rpc/rl/coordinator.py index 1b53fe4ac00..b488378d5ae 100644 --- a/benchmarks/distributed/rpc/rl/coordinator.py +++ b/benchmarks/distributed/rpc/rl/coordinator.py @@ -48,13 +48,13 @@ class CoordinatorBase: def run_coordinator(self, episodes, episode_steps, queue): r""" - Runs n benchmark episodes. Each episode is started by coordinator telling each - observer to contact the agent. Each episode is concluded by coordinator telling agent + Runs n benchmark episodes. Each episode is started by coordinator telling each + observer to contact the agent. 
Each episode is concluded by coordinator telling agent to finish the episode, and then the coordinator records benchmark data Args: episodes (int): Number of episodes to run episode_steps (int): Number steps to be run in each episdoe by each observer - queue (SimpleQueue): SimpleQueue from torch.multiprocessing.get_context() for + queue (SimpleQueue): SimpleQueue from torch.multiprocessing.get_context() for saving benchmark run results to """ @@ -96,9 +96,9 @@ class CoordinatorBase: observer_throughput_final = [ t for s in observer_throughput_final for t in s] - benchmark_metrics = {'agent latency (seconds)': {}, - 'agent throughput': {}, - 'observer latency (seconds)': {}, + benchmark_metrics = {'agent latency (seconds)': {}, + 'agent throughput': {}, + 'observer latency (seconds)': {}, 'observer throughput': {}} diff --git a/benchmarks/distributed/rpc/rl/launcher.py b/benchmarks/distributed/rpc/rl/launcher.py index 5a612aab0e9..8905378eb9b 100644 --- a/benchmarks/distributed/rpc/rl/launcher.py +++ b/benchmarks/distributed/rpc/rl/launcher.py @@ -44,19 +44,19 @@ args = vars(args) def run_worker(rank, world_size, master_addr, master_port, batch, state_size, nlayers, out_features, queue): r""" - inits an rpc worker + inits an rpc worker Args: rank (int): Rpc rank of worker machine world_size (int): Number of workers in rpc network (number of observers + 1 agent + 1 coordinator) master_addr (str): Master address of cooridator master_port (str): Master port of coordinator - batch (bool): Whether agent will use batching or process one observer + batch (bool): Whether agent will use batching or process one observer request a at a time state_size (str): Numerical str representing state dimensions (ie: 5-15-10) nlayers (int): Number of layers in model out_features (int): Number of out features in model - queue (SimpleQueue): SimpleQueue from torch.multiprocessing.get_context() for + queue (SimpleQueue): SimpleQueue from torch.multiprocessing.get_context() for saving benchmark run results to """ state_size = list(map(int, state_size.split('-'))) @@ -82,9 +82,9 @@ def find_graph_variable(args): r""" Determines if user specified multiple entries for a single argument, in which case benchmark is run for each of these entries. Comma separated values in a given argument indicate multiple entries. - Output is presented so that user can use plot repo to plot the results with each of the - variable argument's entries on the x-axis. Args is modified in accordance with this. - More than 1 argument with multiple entries is not permitted. + Output is presented so that user can use plot repo to plot the results with each of the + variable argument's entries on the x-axis. Args is modified in accordance with this. + More than 1 argument with multiple entries is not permitted. 
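(Illustrative aside, not part of the patch; the docstring's Args section continues after it.) The `find_graph_variable` contract described above, at most one comma-separated argument, whose entries become the x-axis of the benchmark sweep, can be summarised with a short sketch. The function below illustrates that contract; it is not the code in `launcher.py`.

```python
def find_graph_variable(args: dict) -> None:
    """Split the single comma-separated argument into a list and record it
    as the x-axis variable; more than one such argument is an error."""
    multi = {k: v.split(',') for k, v in args.items()
             if isinstance(v, str) and ',' in v}
    if len(multi) > 1:
        raise ValueError("only one argument may have multiple entries: %s" % list(multi))
    for name, entries in multi.items():
        args[name] = entries
        args['x_axis_name'] = name

args = {'world_size': '5,10,15', 'batch': 'True'}
find_graph_variable(args)
# args == {'world_size': ['5', '10', '15'], 'batch': 'True', 'x_axis_name': 'world_size'}
```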
Args: args (dict): Dictionary containing arguments passed by the user (and default arguments) """ @@ -138,12 +138,12 @@ def print_benchmark_results(report): if x_axis_name: x_axis_output_label = f'{x_axis_name} |' heading += append_spaces(x_axis_output_label, col_width) - metric_headers = ['agent latency (seconds)', 'agent throughput', + metric_headers = ['agent latency (seconds)', 'agent throughput', 'observer latency (seconds)', 'observer throughput'] percentile_subheaders = ['p50', 'p75', 'p90', 'p95'] subheading = "" if x_axis_name: - subheading += append_spaces(' ' * (len(x_axis_output_label) - 1), col_width) + subheading += append_spaces(' ' * (len(x_axis_output_label) - 1), col_width) for header in metric_headers: heading += append_spaces(header, col_width * len(percentile_subheaders)) for percentile in percentile_subheaders: @@ -163,7 +163,7 @@ def print_benchmark_results(report): def main(): r""" - Runs rpc benchmark once if no argument has multiple entries, and otherwise once for each of the multiple entries. + Runs rpc benchmark once if no argument has multiple entries, and otherwise once for each of the multiple entries. Multiple entries is indicated by comma separated values, and may only be done for a single argument. Results are printed as well as saved to output file. In case of multiple entries for a single argument, the plot repo can be used to benchmark results on the y axis with each entry on the x axis. @@ -171,7 +171,7 @@ def main(): find_graph_variable(args) # run once if no x axis variables - x_axis_variables = args[args['x_axis_name']] if args.get('x_axis_name') else [None] + x_axis_variables = args[args['x_axis_name']] if args.get('x_axis_name') else [None] ctx = mp.get_context('spawn') queue = ctx.SimpleQueue() benchmark_runs = [] @@ -197,7 +197,7 @@ def main(): print(f"Time taken benchmark run {i} -, {time.time() - start_time}") if args.get('x_axis_name'): # save x axis value was for this iteration in the results - benchmark_run_results[args['x_axis_name']] = x_axis_variable + benchmark_run_results[args['x_axis_name']] = x_axis_variable benchmark_runs.append(benchmark_run_results) report = args diff --git a/benchmarks/fastrnns/README.md b/benchmarks/fastrnns/README.md index 8443f95d5f8..830190e2833 100644 --- a/benchmarks/fastrnns/README.md +++ b/benchmarks/fastrnns/README.md @@ -1,6 +1,6 @@ # Fast RNN benchmarks -Benchmarks for TorchScript models +Benchmarks for TorchScript models For most stable results, do the following: - Set CPU Governor to performance mode (as opposed to energy save) @@ -24,7 +24,7 @@ or run the test independently: should give a good comparison, or you can specify the type of model to run -`python -m fastrnns.bench --rnns cudnn aten jit --group rnns` +`python -m fastrnns.bench --rnns cudnn aten jit --group rnns` ## Run model profiling, calls nvprof @@ -33,7 +33,7 @@ should give a good comparison, or you can specify the type of model to run should generate nvprof file for all models somewhere. 
you can also specify the models to generate nvprof files separately: -`python -m fastrnns.profile --rnns aten jit` +`python -m fastrnns.profile --rnns aten jit` ### Caveats diff --git a/benchmarks/operator_benchmark/c2/add_test.py b/benchmarks/operator_benchmark/c2/add_test.py index 71c90350474..401d6a88155 100644 --- a/benchmarks/operator_benchmark/c2/add_test.py +++ b/benchmarks/operator_benchmark/c2/add_test.py @@ -1,16 +1,16 @@ import operator_benchmark as op_bench import benchmark_caffe2 as op_bench_c2 from benchmark_caffe2 import Caffe2BenchmarkBase # noqa -from caffe2.python import core +from caffe2.python import core """Microbenchmarks for element-wise Add operator. Supports both Caffe2/PyTorch.""" -# Configs for C2 add operator +# Configs for C2 add operator add_long_configs = op_bench.cross_product_configs( M=[8, 64, 128], N=range(2, 10, 3), - K=[2 ** x for x in range(0, 3)], + K=[2 ** x for x in range(0, 3)], dtype=["int", "float"], tags=["long"] ) @@ -22,20 +22,20 @@ add_short_configs = op_bench.config_list( [16, 16, 64, "float"], [64, 64, 128, "int"], ], - attr_names=["M", "N", "K", "dtype"], - tags=["short"], + attr_names=["M", "N", "K", "dtype"], + tags=["short"], ) class AddBenchmark(op_bench_c2.Caffe2BenchmarkBase): - def init(self, M, N, K, dtype): - self.input_one = self.tensor([M, N, K], dtype) - self.input_two = self.tensor([M, N, K], dtype) + def init(self, M, N, K, dtype): + self.input_one = self.tensor([M, N, K], dtype) + self.input_two = self.tensor([M, N, K], dtype) self.output = self.tensor([M, N, K], dtype) self.set_module_name("add") def forward(self): op = core.CreateOperator( - "Add", [self.input_one, self.input_two], self.output, **self.args + "Add", [self.input_one, self.input_two], self.output, **self.args ) return op diff --git a/benchmarks/operator_benchmark/c2/matmul_test.py b/benchmarks/operator_benchmark/c2/matmul_test.py index 0a4a41f1ca2..ee37a23b50f 100644 --- a/benchmarks/operator_benchmark/c2/matmul_test.py +++ b/benchmarks/operator_benchmark/c2/matmul_test.py @@ -2,7 +2,7 @@ import operator_benchmark as op_bench import benchmark_caffe2 as op_bench_c2 from benchmark_caffe2 import Caffe2BenchmarkBase # noqa -from caffe2.python import core +from caffe2.python import core """Microbenchmarks for MatMul operator""" @@ -10,7 +10,7 @@ from caffe2.python import core mm_long_configs = op_bench.cross_product_configs( M=[8, 64, 128], N=range(2, 10, 3), - K=[2 ** x for x in range(0, 3)], + K=[2 ** x for x in range(0, 3)], trans_a=[True, False], trans_b=[True, False], tags=["long"] @@ -23,13 +23,13 @@ mm_short_configs = op_bench.config_list( [1024, 1024, 256, True, False], [8192, 8192, 1024, True, False], ], - attr_names=["M", "N", "K", "trans_a", "trans_b"], - tags=["short"], + attr_names=["M", "N", "K", "trans_a", "trans_b"], + tags=["short"], ) class MatMulBenchmark(op_bench_c2.Caffe2BenchmarkBase): - def init(self, M, N, K, trans_a, trans_b): + def init(self, M, N, K, trans_a, trans_b): self.input_one = self.tensor([N, M]) if trans_a else self.tensor([M, N]) self.input_two = self.tensor([K, N]) if trans_b else self.tensor([N, K]) self.args = {'trans_a': trans_a, 'trans_b': trans_b} @@ -38,7 +38,7 @@ class MatMulBenchmark(op_bench_c2.Caffe2BenchmarkBase): def forward(self): op = core.CreateOperator( - "MatMul", [self.input_one, self.input_two], self.output, **self.args + "MatMul", [self.input_one, self.input_two], self.output, **self.args ) return op diff --git a/benchmarks/operator_benchmark/common/tests/add_ops_list_test.py 
b/benchmarks/operator_benchmark/common/tests/add_ops_list_test.py index cd6ee94ecec..2d47c46281e 100644 --- a/benchmarks/operator_benchmark/common/tests/add_ops_list_test.py +++ b/benchmarks/operator_benchmark/common/tests/add_ops_list_test.py @@ -22,7 +22,7 @@ unary_ops_list = op_bench.op_list( class UnaryOpBenchmark(op_bench.TorchBenchmarkBase): - def init(self, M, N, op_func): + def init(self, M, N, op_func): self.input_one = torch.rand(M, N) self.op_func = op_func diff --git a/benchmarks/operator_benchmark/common/tests/c2_cpu_gpu_forward_backward_test.py b/benchmarks/operator_benchmark/common/tests/c2_cpu_gpu_forward_backward_test.py index bbad71f6474..7c2444961b3 100644 --- a/benchmarks/operator_benchmark/common/tests/c2_cpu_gpu_forward_backward_test.py +++ b/benchmarks/operator_benchmark/common/tests/c2_cpu_gpu_forward_backward_test.py @@ -1,5 +1,5 @@ import operator_benchmark as op_bench -from caffe2.python import core +from caffe2.python import core add_configs = op_bench.cross_product_configs( @@ -11,24 +11,24 @@ add_configs = op_bench.cross_product_configs( ) class AddBenchmark(op_bench.Caffe2BenchmarkBase): - def init(self, M, N, K, device): + def init(self, M, N, K, device): self.set_module_name("add") - self.input_one = self.tensor([M, N, K], device=device) - self.input_two = self.tensor([M, N, K], device=device) - self.input_one_grad = self.tensor([M, N, K], device=device) - self.input_two_grad = self.tensor([M, N, K], device=device) + self.input_one = self.tensor([M, N, K], device=device) + self.input_two = self.tensor([M, N, K], device=device) + self.input_one_grad = self.tensor([M, N, K], device=device) + self.input_two_grad = self.tensor([M, N, K], device=device) self.output = self.tensor([M, N, K], device=device) def forward(self): op = core.CreateOperator( - "Add", [self.input_one, self.input_two], self.output, **self.args + "Add", [self.input_one, self.input_two], self.output, **self.args ) return op def backward(self): grad_op = core.CreateOperator( - "AddGradient", [self.output, self.input_one, self.input_two], - [self.input_one_grad, self.input_two_grad], **self.args + "AddGradient", [self.output, self.input_one, self.input_two], + [self.input_one_grad, self.input_two_grad], **self.args ) return grad_op diff --git a/benchmarks/operator_benchmark/common/tests/jit_forward_test.py b/benchmarks/operator_benchmark/common/tests/jit_forward_test.py index 758ea89f460..7643d60c76b 100644 --- a/benchmarks/operator_benchmark/common/tests/jit_forward_test.py +++ b/benchmarks/operator_benchmark/common/tests/jit_forward_test.py @@ -5,8 +5,8 @@ intraop_bench_configs = op_bench.config_list( attrs=[ [8, 16], ], - attr_names=["M", "N"], - tags=["short"], + attr_names=["M", "N"], + tags=["short"], ) @torch.jit.script @@ -24,9 +24,9 @@ class TorchSumBenchmark(op_bench.TorchBenchmarkBase): self.input_one = torch.rand(M, N) self.set_module_name("sum") - # This is a very temporary method and will be removed soon, so + # This is a very temporary method and will be removed soon, so # don't use this method in your benchmark - # TODO(mingzhe): use one forward method for both JIT and Eager + # TODO(mingzhe): use one forward method for both JIT and Eager def jit_forward(self, iters): return torch_sumall(self.input_one, iters) diff --git a/benchmarks/operator_benchmark/common/tests/pt_backward_test.py b/benchmarks/operator_benchmark/common/tests/pt_backward_test.py index 630c2236a68..e90a5b0ec4e 100644 --- a/benchmarks/operator_benchmark/common/tests/pt_backward_test.py +++ 
b/benchmarks/operator_benchmark/common/tests/pt_backward_test.py @@ -10,9 +10,9 @@ add_configs = op_bench.cross_product_configs( ) # This benchmark uses the auto_set to automatically set requires_grad -# for both inputs. The test name can also be used for filtering. +# for both inputs. The test name can also be used for filtering. class AddBenchmark(op_bench.TorchBenchmarkBase): - def init(self, M, N, K): + def init(self, M, N, K): self.input_one = torch.rand(M, N, K, requires_grad=self.auto_set()) self.input_two = torch.rand(M, N, K, requires_grad=self.auto_set()) self.set_module_name("add") diff --git a/benchmarks/operator_benchmark/common/tests/pt_configs_list_test.py b/benchmarks/operator_benchmark/common/tests/pt_configs_list_test.py index afde5a96d3f..330ff4a9040 100644 --- a/benchmarks/operator_benchmark/common/tests/pt_configs_list_test.py +++ b/benchmarks/operator_benchmark/common/tests/pt_configs_list_test.py @@ -4,7 +4,7 @@ import torch """Microbenchmarks for element-wise Add operator. Supports both Caffe2/PyTorch.""" add_short_configs = op_bench.config_list( - attr_names=['M', 'N', 'K'], + attr_names=['M', 'N', 'K'], attrs=[ [8, 16, 32], [16, 16, 64], @@ -14,12 +14,12 @@ add_short_configs = op_bench.config_list( 'device': ['cpu', 'cuda'], 'dtype': [torch.float, torch.float64], }, - tags=['short'], + tags=['short'], ) class AddBenchmark(op_bench.TorchBenchmarkBase): - def init(self, M, N, K, device, dtype): + def init(self, M, N, K, device, dtype): self.input_one = torch.rand(M, N, K, device=device, dtype=dtype, requires_grad=True) self.input_two = torch.rand(M, N, K, device=device, dtype=dtype) self.set_module_name('add') diff --git a/benchmarks/operator_benchmark/common/tests/pt_cpu_gpu_forward_backward_test.py b/benchmarks/operator_benchmark/common/tests/pt_cpu_gpu_forward_backward_test.py index b338560af85..d2e1478ddf8 100644 --- a/benchmarks/operator_benchmark/common/tests/pt_cpu_gpu_forward_backward_test.py +++ b/benchmarks/operator_benchmark/common/tests/pt_cpu_gpu_forward_backward_test.py @@ -12,7 +12,7 @@ add_configs = op_bench.cross_product_configs( class AddBenchmark(op_bench.TorchBenchmarkBase): - def init(self, M, N, K, device): + def init(self, M, N, K, device): self.input_one = torch.rand(M, N, K, device=device, requires_grad=True) self.input_two = torch.rand(M, N, K, device=device, requires_grad=True) self.set_module_name("add") diff --git a/benchmarks/operator_benchmark/common/tests/random_sample_test.py b/benchmarks/operator_benchmark/common/tests/random_sample_test.py index b6c4e6a2b7c..92b0d96cf69 100644 --- a/benchmarks/operator_benchmark/common/tests/random_sample_test.py +++ b/benchmarks/operator_benchmark/common/tests/random_sample_test.py @@ -7,19 +7,19 @@ configs = op_bench.random_sample_configs( N=[7, 8, 9, 10, 11, 12], K=[13, 14, 15, 16, 17, 18], # probs saves the weights of each value - probs=op_bench.attr_probs( + probs=op_bench.attr_probs( M=[0.5, 0.2, 0.1, 0.05, 0.03, 0.1], N=[0.1, 0.3, 0.4, 0.02, 0.03, 0.04], K=[0.03, 0.6, 0.04, 0.02, 0.03, 0.01], ), - # this is the number of returned inputs - total_samples=10, + # this is the number of returned inputs + total_samples=10, tags=["short"], ) class AddBenchmark(op_bench.TorchBenchmarkBase): - def init(self, M, N, K): + def init(self, M, N, K): self.input_one = torch.rand(M, N, K) self.input_two = torch.rand(M, N, K) self.set_module_name("add") diff --git a/benchmarks/sparse/matmul_dlmc_bench.py b/benchmarks/sparse/matmul_dlmc_bench.py index 6112b6105e6..ef8ae6965cf 100644 --- 
a/benchmarks/sparse/matmul_dlmc_bench.py +++ b/benchmarks/sparse/matmul_dlmc_bench.py @@ -1,9 +1,9 @@ # Sparse benchmarks -# These benchmarks are for the sparse matrix functionality. +# These benchmarks are for the sparse matrix functionality. # They exist for comparing the performance of sparse matrix routines # torch.sparse.mm(sparse, sparse)` with different backends (CPU/CUDA) -# and with other frameworks such as scipy. +# and with other frameworks such as scipy. import sys from scipy import sparse diff --git a/benchmarks/sparse/test.sh b/benchmarks/sparse/test.sh index d7a3bc667b6..313a9055dae 100644 --- a/benchmarks/sparse/test.sh +++ b/benchmarks/sparse/test.sh @@ -3,9 +3,9 @@ DATASET_ROOT_DIR=$HOME/datasets/ # wget https://storage.googleapis.com/sgk-sc2020/dlmc.tar.gz -P $DATASET_ROOT_DIR -# tar -xvf $DATASET_ROOT_DIR/dlmc.tar.gz +# tar -xvf $DATASET_ROOT_DIR/dlmc.tar.gz -echo "!! SPARSE SPMS TIME BENCHMARK!! " +echo "!! SPARSE SPMS TIME BENCHMARK!! " python matmul_dlmc_bench.py --path $DATASET_ROOT_DIR/dlmc/rn50 --dataset random_pruning --operation matmul --output /tmp/matmul_bench.pkl python matmul_dlmc_bench.py --path $DATASET_ROOT_DIR/dlmc/rn50 --dataset random_pruning --operation backward --output /tmp/backward_bench.pkl diff --git a/benchmarks/tensorexpr/benchmark.py b/benchmarks/tensorexpr/benchmark.py index 6c9b91bc8ec..f37d0a7e5c1 100644 --- a/benchmarks/tensorexpr/benchmark.py +++ b/benchmarks/tensorexpr/benchmark.py @@ -242,7 +242,7 @@ class DynamicShape(object): r''' An Auxiliary class for dynamic shape benchmarks - Pre-computes input with random shapes and also + Pre-computes input with random shapes and also modifies the compute method so in each call the fuser sees a different input tensor shape ''' diff --git a/binaries/lite_interpreter_model_load.cc b/binaries/lite_interpreter_model_load.cc index 2fc25ac4b87..e82d85b96db 100644 --- a/binaries/lite_interpreter_model_load.cc +++ b/binaries/lite_interpreter_model_load.cc @@ -24,7 +24,7 @@ int main(int argc, char** argv) { std::cerr << FLAGS_model << ":Model file is not provided\n"; return -1; } - + // TODO: avoid having to set this guard for custom mobile build with mobile // interpreter. torch::AutoNonVariableTypeMode non_var_guard{true}; diff --git a/binaries/make_mnist_db.cc b/binaries/make_mnist_db.cc index 44a310f0681..005c9ac7d81 100644 --- a/binaries/make_mnist_db.cc +++ b/binaries/make_mnist_db.cc @@ -64,7 +64,7 @@ void convert_dataset(const char* image_filename, const char* label_filename, image_file.read(reinterpret_cast(&magic), 4); magic = swap_endian(magic); if (magic == 529205256) { - LOG(FATAL) << + LOG(FATAL) << "It seems that you forgot to unzip the mnist dataset. You should " "first unzip them using e.g. gunzip on Linux."; } diff --git a/c10/core/GeneratorImpl.cpp b/c10/core/GeneratorImpl.cpp index 86860745361..d64eb53a925 100644 --- a/c10/core/GeneratorImpl.cpp +++ b/c10/core/GeneratorImpl.cpp @@ -56,12 +56,12 @@ static uint64_t readURandomLong() * Gets a non deterministic random number number from either the * /dev/urandom or the current time. For CUDA, gets random from * std::random_device and adds a transformation on it. - * + * * FIXME: The behavior in this function is from legacy code (THRandom_seed/THCRandom_seed) * and is probably not the right thing to do, even though our tests pass. * Figure out if tests get perturbed * - when the same algorithm is used for all backends. Note that the current behavior is - * different for CPU, CUDA and Windows CPU. 
+ * different for CPU, CUDA and Windows CPU. * - when using C++11 std objects, such as std::random_device * - when constructing a 64 bit seed properly, rather than static casting * a 32 bit number to 64 bit. diff --git a/c10/test/util/complex_test_common.h b/c10/test/util/complex_test_common.h index 4b3a72c2229..7c478f6d5e6 100644 --- a/c10/test/util/complex_test_common.h +++ b/c10/test/util/complex_test_common.h @@ -53,7 +53,7 @@ TEST(TestMemory, ReinterpretCast) { ASSERT_EQ(zz.real(), double(1)); ASSERT_EQ(zz.imag(), double(2)); } - + { c10::complex z(3, 4); std::complex zz = *reinterpret_cast*>(&z); @@ -84,7 +84,7 @@ TEST(TestMemory, ThrustReinterpretCast) { ASSERT_EQ(zz.real(), double(1)); ASSERT_EQ(zz.imag(), double(2)); } - + { c10::complex z(3, 4); thrust::complex zz = *reinterpret_cast*>(&z); diff --git a/c10/util/Bitset.h b/c10/util/Bitset.h index 964146be05e..3e67169345c 100644 --- a/c10/util/Bitset.h +++ b/c10/util/Bitset.h @@ -108,7 +108,7 @@ private: friend bool operator==(bitset lhs, bitset rhs) noexcept { return lhs.bitset_ == rhs.bitset_; } - + bitset_type bitset_; }; diff --git a/caffe2/core/common_test.cc b/caffe2/core/common_test.cc index dfada6dc7d2..4e9b65828a8 100644 --- a/caffe2/core/common_test.cc +++ b/caffe2/core/common_test.cc @@ -10,7 +10,7 @@ namespace caffe2 { #ifndef __ANDROID__ -// Simple tests to make sure that our stoi and stod implementations are +// Simple tests to make sure that our stoi and stod implementations are // matching the std implementations, but not testing it very extensively // as one should be using the std version most of the time. TEST(CommonTest, TestStoi) { diff --git a/caffe2/ideep/operators/sigmoid_op.cc b/caffe2/ideep/operators/sigmoid_op.cc index da846a54105..ea498a25b2e 100644 --- a/caffe2/ideep/operators/sigmoid_op.cc +++ b/caffe2/ideep/operators/sigmoid_op.cc @@ -25,7 +25,7 @@ class IDEEPSigmoidOp final : public IDEEPOperator { } private: - + INPUT_TAGS(INPUT); OUTPUT_TAGS(OUTPUT); }; diff --git a/caffe2/operators/accuracy_op.cu b/caffe2/operators/accuracy_op.cu index 8f3b1dd6c02..838d103b4ce 100644 --- a/caffe2/operators/accuracy_op.cu +++ b/caffe2/operators/accuracy_op.cu @@ -47,7 +47,7 @@ template <> bool AccuracyOp::RunOnDevice() { auto& X = Input(PREDICTION); auto& label = Input(LABEL); - + CAFFE_ENFORCE_EQ(X.dim(), 2); int N = X.dim32(0); int D = X.dim32(1); diff --git a/caffe2/operators/affine_channel_op.cu b/caffe2/operators/affine_channel_op.cu index 7205e1fda06..adf4ac55c0f 100644 --- a/caffe2/operators/affine_channel_op.cu +++ b/caffe2/operators/affine_channel_op.cu @@ -55,7 +55,7 @@ template <> bool AffineChannelGradientOp::RunOnDeviceWithOrderNCHW() { const auto& dY = Input(0); const auto& scale = is_learnable_ ? Input(2) : Input(1); - + auto* dX = Output(0, dY.sizes(), at::dtype()); const int N = dY.dim32(0); const int C = dY.dim32(1); @@ -76,8 +76,8 @@ bool AffineChannelGradientOp::RunOnDeviceWithOrderNCHW() { if (is_learnable_) { const auto& X = Input(1); const float* X_data = X.data(); - - + + auto* dscale = Output(1, scale.sizes(), at::dtype()); auto* dbias = Output(2, scale.sizes(), at::dtype()); const int outer_size = N * HxW; @@ -102,7 +102,7 @@ template <> bool AffineChannelGradientOp::RunOnDeviceWithOrderNHWC() { const auto& dY = Input(0); const auto& scale = is_learnable_ ? 
Input(2) : Input(1); - + auto* dX = Output(0, dY.sizes(), at::dtype()); const int ndim = dY.dim(); const int C = dY.dim32(ndim - 1); @@ -122,8 +122,8 @@ bool AffineChannelGradientOp::RunOnDeviceWithOrderNHWC() { const float* X_data = X.data(); const int N = X.dim32(0); const int HxW = rows / N; - - + + auto* dscale = Output(1, scale.sizes(), at::dtype()); auto* dbias = Output(2, scale.sizes(), at::dtype()); AffineChannelScaleBiasBackwardCUDAKernel diff --git a/caffe2/operators/ceil_op.cu b/caffe2/operators/ceil_op.cu index cfb6c07ad98..5b76a916ce1 100644 --- a/caffe2/operators/ceil_op.cu +++ b/caffe2/operators/ceil_op.cu @@ -14,7 +14,7 @@ __global__ void CeilKernel(const int N, const T* X, T* Y) { template <> bool CeilOp::RunOnDevice() { auto& X = Input(0); - + CAFFE_ENFORCE_GT(X.numel(), 0); auto* Y = Output(0, X.sizes(), at::dtype()); CeilKernel<<< diff --git a/caffe2/operators/channel_shuffle_op.cu b/caffe2/operators/channel_shuffle_op.cu index 80f963f8370..e4f5dd427ac 100644 --- a/caffe2/operators/channel_shuffle_op.cu +++ b/caffe2/operators/channel_shuffle_op.cu @@ -55,7 +55,7 @@ ChannelShuffleNHWCKernel(const int G, const int K, const T* X, T* Y) { template <> bool ChannelShuffleOp::RunOnDeviceWithOrderNCHW() { const auto& X = Input(0); - + auto* Y = Output(0, X.sizes(), at::dtype()); const int N = X.dim32(0); const int C = X.dim32(1); @@ -88,7 +88,7 @@ bool ChannelShuffleOp::RunOnDeviceWithOrderNCHW() { template <> bool ChannelShuffleOp::RunOnDeviceWithOrderNHWC() { const auto& X = Input(0); - + auto* Y = Output(0, X.sizes(), at::dtype()); const int ndim = X.dim(); const int N = X.dim32(0); @@ -130,7 +130,7 @@ bool ChannelShuffleOp::RunOnDeviceWithOrderNHWC() { template <> bool ChannelShuffleGradientOp::RunOnDeviceWithOrderNCHW() { const auto& dY = Input(0); - + auto* dX = Output(0, dY.sizes(), at::dtype()); const int N = dY.dim32(0); const int C = dY.dim32(1); @@ -163,7 +163,7 @@ bool ChannelShuffleGradientOp::RunOnDeviceWithOrderNCHW() { template <> bool ChannelShuffleGradientOp::RunOnDeviceWithOrderNHWC() { const auto& dY = Input(0); - + auto* dX = Output(0, dY.sizes(), at::dtype()); const int ndim = dY.dim(); const int N = dY.dim32(0); diff --git a/caffe2/operators/clip_op.cu b/caffe2/operators/clip_op.cu index 92cecde46aa..02d58822a72 100644 --- a/caffe2/operators/clip_op.cu +++ b/caffe2/operators/clip_op.cu @@ -43,7 +43,7 @@ __global__ void ClipGradientKernel(const int N, const T minval, template <> bool ClipOp::RunOnDevice() { auto& X = Input(0); - + CAFFE_ENFORCE_GE(X.numel(), 0); auto* Y = Output(0, X.sizes(), at::dtype()); ClipKernel<<< @@ -61,7 +61,7 @@ template <> bool ClipGradientOp::RunOnDevice() { auto& Y = Input(0); auto& dY = Input(1); - + CAFFE_ENFORCE_GE(Y.numel(), 0); CAFFE_ENFORCE_EQ(dY.numel(), Y.numel()); auto* dX = Output(0, Y.sizes(), at::dtype()); diff --git a/caffe2/operators/cosine_embedding_criterion_op.cu b/caffe2/operators/cosine_embedding_criterion_op.cu index 90a1da5501f..fe85133ac8c 100644 --- a/caffe2/operators/cosine_embedding_criterion_op.cu +++ b/caffe2/operators/cosine_embedding_criterion_op.cu @@ -26,7 +26,7 @@ template <> bool CosineEmbeddingCriterionOp::RunOnDevice() { auto& S = Input(0); auto& Y = Input(1); - + CAFFE_ENFORCE(S.numel() == Y.numel(), "The embedding and label should have the same size."); auto* output = Output(0, S.sizes(), at::dtype()); @@ -48,7 +48,7 @@ bool CosineEmbeddingCriterionGradientOp::RunOnDevice() { auto& S = Input(0); auto& Y = Input(1); auto& dOutput = Input(2); - + auto* dS = Output(0, S.sizes(), at::dtype()); 
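(Illustrative aside, not part of the patch.) The `CosineEmbeddingCriterion` kernels above consume a precomputed cosine similarity `S` and a +/-1 label `Y`. As a reference, the classic form of this criterion is sketched below; treating it as the exact formula of the CUDA kernel is an assumption, not a quote of the code.

```python
def cosine_embedding_criterion(s: float, y: int, margin: float = 0.0) -> float:
    """Assumed reference formula: pull similar pairs (y=+1) toward similarity 1,
    penalise dissimilar pairs (y=-1) whose similarity exceeds the margin."""
    return 1.0 - s if y == 1 else max(0.0, s - margin)

print(cosine_embedding_criterion(0.9, 1))    # 0.1  (similar pair, nearly aligned)
print(cosine_embedding_criterion(0.9, -1))   # 0.9  (dissimilar pair, heavily penalised)
```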
diff --git a/caffe2/operators/elementwise_linear_op.cu b/caffe2/operators/elementwise_linear_op.cu index 585e1262835..cc49115bffc 100644 --- a/caffe2/operators/elementwise_linear_op.cu +++ b/caffe2/operators/elementwise_linear_op.cu @@ -54,7 +54,7 @@ bool ElementwiseLinearOp::RunOnDevice(){ const auto& X = Input(0); const auto& a = Input(1); const auto& b = Input(2); - + const auto canonical_axis = X.canonical_axis_index(axis_); const int N = X.size_to_dim(canonical_axis); @@ -97,9 +97,9 @@ bool ElementwiseLinearGradientOp::RunOnDevice(){ CAFFE_ENFORCE_EQ(a.dim(), 1, a.dim()); CAFFE_ENFORCE_EQ(a.dim(0), D, a.dim()); - - - + + + auto* g_X = Output(0, X.sizes(), at::dtype()); auto * g_a = Output(1, a.sizes(), at::dtype()); auto * g_b = Output(2, a.sizes(), at::dtype()); diff --git a/caffe2/operators/hip/conv_op_miopen.hip b/caffe2/operators/hip/conv_op_miopen.hip index a856390082b..c4618cfa3f4 100644 --- a/caffe2/operators/hip/conv_op_miopen.hip +++ b/caffe2/operators/hip/conv_op_miopen.hip @@ -278,7 +278,7 @@ bool MIOPENConvOp::DoRunWithType() { weight_dims.data(), nullptr)); } - + mio_output_dims_ = Y->sizes().vec(); std::vector output_dims(std::begin(mio_output_dims_), std::end(mio_output_dims_)); int output_dim_size = output_dims.size(); @@ -288,7 +288,7 @@ bool MIOPENConvOp::DoRunWithType() { weight_desc_, &output_dim_size, output_dims.data())); - + MIOPEN_ENFORCE(miopenSetTensorDescriptor( top_desc_, miopenTypeWrapper::type, output_dims.size(), output_dims.data(), nullptr)); diff --git a/caffe2/operators/hip/pool_op_miopen.hip b/caffe2/operators/hip/pool_op_miopen.hip index 68a4ab2406a..8eef417d8fc 100644 --- a/caffe2/operators/hip/pool_op_miopen.hip +++ b/caffe2/operators/hip/pool_op_miopen.hip @@ -107,7 +107,7 @@ class MIOPENPoolOp : public ConvPoolOpBase { pad_l(), stride_h(), stride_w())); - + MIOPEN_ENFORCE(miopenSet4dTensorDescriptor(top_desc_, miopenTypeWrapper::type, N_out, C_out, H_out, W_out)); MIOPEN_ENFORCE(miopenSet4dTensorDescriptor(bottom_desc_, miopenTypeWrapper::type, N, C, H, W)); #endif @@ -237,11 +237,11 @@ class MIOPENPoolGradientOp : public ConvPoolOpBase { kernel_.data(), pads_.data(), stride_.data())); - + miopenIndexType_t index_type = miopenIndexUint64; MIOPEN_ENFORCE(miopenSetPoolingIndexType( pooling_desc_, index_type)); - + mio_input_dims = X.sizes().vec(); std::vector input_dims (begin(mio_input_dims), end(mio_input_dims)); MIOPEN_ENFORCE(miopenSetTensorDescriptor( diff --git a/caffe2/operators/multi_class_accuracy_op.cu b/caffe2/operators/multi_class_accuracy_op.cu index 4baf01de176..3882cfcef76 100644 --- a/caffe2/operators/multi_class_accuracy_op.cu +++ b/caffe2/operators/multi_class_accuracy_op.cu @@ -37,8 +37,8 @@ template <> bool MultiClassAccuracyOp::RunOnDevice() { auto& X = Input(PREDICTION); auto& label = Input(LABEL); - - + + DCHECK_EQ(X.dim(), 2); // amount, number of instances int N = X.dim32(0); diff --git a/caffe2/operators/top_k.cc b/caffe2/operators/top_k.cc index b0326428163..4c3f9993f33 100644 --- a/caffe2/operators/top_k.cc +++ b/caffe2/operators/top_k.cc @@ -241,7 +241,7 @@ OPERATOR_SCHEMA(TopK) return out; }) .SetDoc(R"DOC( -Retrieve the top-K elements of the last dimension. +Retrieve the top-K elements of the last dimension. Given an input tensor of shape $(a_1, a_2, ..., a_n, r)$. `k` can be passed as an integer argument or a 1D tensor containing a single integer. 
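(Illustrative aside, not part of the patch; the `TopK` schema's output description continues just after it.) The schema text above says the operator keeps the `k` largest entries of the last dimension of a tensor of shape $(a_1, a_2, ..., a_n, r)$. The same last-dimension semantics can be checked quickly with PyTorch's `torch.topk`; this is an analogue for illustration, not the Caffe2 operator itself.

```python
import torch

x = torch.tensor([[1., 5., 3., 9.],
                  [8., 2., 7., 4.]])             # shape (a_1, r) with r = 4

values, indices = torch.topk(x, k=2, dim=-1)     # top-k along the last dimension
print(values)    # tensor([[9., 5.], [8., 7.]])
print(indices)   # tensor([[3, 1], [0, 2]])
```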
Returns up to three outputs: diff --git a/caffe2/opt/optimizer.cc b/caffe2/opt/optimizer.cc index 1c2f7f1ab0f..2e707a13828 100644 --- a/caffe2/opt/optimizer.cc +++ b/caffe2/opt/optimizer.cc @@ -20,7 +20,7 @@ void workspaceOptimizations(nom::repr::NNModule* nn, Workspace* ws, int level) { void graphOptimzations(nom::repr::NNModule* nn, int level) { switch (level) { case 1: -#ifdef USE_NNPACK +#ifdef USE_NNPACK opt::addNNPACK(nn, false); opt::fuseNNPACKConvRelu(nn); #endif diff --git a/caffe2/python/onnx/tests/onnx_backend_test.py b/caffe2/python/onnx/tests/onnx_backend_test.py index e8b718a5a2b..c037f776b6a 100644 --- a/caffe2/python/onnx/tests/onnx_backend_test.py +++ b/caffe2/python/onnx/tests/onnx_backend_test.py @@ -142,7 +142,7 @@ backend_test.exclude('(test_if_.*' # added support for sequence type inputs '|test_unsqueeze_.*' # axes is now an input (not attr) '|test_MaxPool1d_stride_padding_dilation_.*' '|test_MaxPool2d_stride_padding_dilation_.*' - ')') + ')') # Skip vgg to speed up CI if 'JENKINS_URL' in os.environ: diff --git a/caffe2/python/operator_test/adam_test.py b/caffe2/python/operator_test/adam_test.py index 2fb13c14992..60d39d831a9 100644 --- a/caffe2/python/operator_test/adam_test.py +++ b/caffe2/python/operator_test/adam_test.py @@ -410,7 +410,7 @@ class TestAdam(hu.HypothesisTestCase): dc, op, [param, mom1, mom2, indices, grad, LR, ITER], [0, 1, 2, 3], - input_device_options=input_device_options) + input_device_options=input_device_options) self.assertReferenceChecks( gc, op, diff --git a/caffe2/python/serialized_test/README.md b/caffe2/python/serialized_test/README.md index b90f7f850f5..aadfc553292 100644 --- a/caffe2/python/serialized_test/README.md +++ b/caffe2/python/serialized_test/README.md @@ -7,7 +7,7 @@ Major functionality lives in `serialized_test_util.py` 2. Change the `@given` decorator to `@serialized_test_util.given`. This runs a seeded hypothesis test instance which will generate outputs if desired in addition to the unseeded hypothesis tests normally run. 3. [Optional] Add (or change a call of `unittest.main()` to) `testWithArgs` in `__main__`. This allows you to generate outputs using `python caffe2/python/operator_test/my_test.py -G`. 4. Run your test `python -m pytest caffe2/python/operator_test/my_test.py -G` to generate serialized outputs. They will live in `caffe2/python/serialized_test/data/operator_test`, one zip file per test function. The zip file contains an `inout.npz` file of the inputs, outputs, and meta data (like device type), a `op.pb` file of the operator, and `grad_#.pb` files of the gradients if there are any. Use `-O` to change the output directory. This also generates a markdown document summarizing the coverage of serialized tests. We can disable generating this coverage document using the `-C` flag. -5. Thereafter, runs of the test without the flag will load serialized outputs and gradient operators for comparison against the seeded run. The comparison is done as long as you have a call to assertReferenceChecks. If for any reason the seeded run's inputs are different (this can happen with different hypothesis versions or different setups), then we'll run the serialized inputs through the serialized operator to get a runtime output for comparison. +5. Thereafter, runs of the test without the flag will load serialized outputs and gradient operators for comparison against the seeded run. The comparison is done as long as you have a call to assertReferenceChecks. 
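(Illustrative aside, not part of the patch; the README's caveat about differing seeded inputs continues below.) The serialized-test workflow described in steps 2-4 above amounts to a hypothesis test whose `@given` is swapped for the `serialized_test_util` wrapper, with `testWithArgs` wired into `__main__` so `-G` can regenerate outputs. A minimal skeleton is sketched below; the class, decorator, and `testWithArgs` names follow the README's wording, while the exact import paths, the operator, and the strategy are assumptions for illustration.

```python
import numpy as np

import caffe2.python.hypothesis_test_util as hu
import caffe2.python.serialized_test.serialized_test_util as serial
from caffe2.python import core


class TestRelu(serial.SerializedTestCase):
    # step 2: the usual @given is replaced by the serialized_test_util wrapper
    @serial.given(X=hu.tensor(dtype=np.float32), **hu.gcs)
    def test_relu(self, X, gc, dc):
        op = core.CreateOperator("Relu", ["X"], ["Y"])
        # step 5: the serialized comparison hooks into assertReferenceChecks
        self.assertReferenceChecks(gc, op, [X], lambda x: (np.maximum(x, 0),))


if __name__ == "__main__":
    # step 3: lets `python my_test.py -G` regenerate the serialized outputs
    serial.testWithArgs()
```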
If for any reason the seeded run's inputs are different (this can happen with different hypothesis versions or different setups), then we'll run the serialized inputs through the serialized operator to get a runtime output for comparison. ## Coverage report `SerializedTestCoverage.md` contains some statistics about the coverage of serialized tests. It is regenerated every time someone regenerates a serialized test (i.e. running an operator test with the `-G` option). If you run into merge conflicts for the file, please rebase and regenerate. If you'd like to disable generating this file when generating the serialized test, you can run with `-G -C`. The logic for generating this file lives in `coverage.py`. diff --git a/caffe2/release-notes.md b/caffe2/release-notes.md index 9816f722b5e..e76b760a7ed 100644 --- a/caffe2/release-notes.md +++ b/caffe2/release-notes.md @@ -19,7 +19,7 @@ sudo apt-get install -y --no-install-recommends \ libprotobuf-dev \ protobuf-compiler \ python-dev \ - python-pip + python-pip sudo pip install numpy protobuf ``` diff --git a/caffe2/serialize/crc.cc b/caffe2/serialize/crc.cc index 7ea15dd65c1..7a7173e417f 100644 --- a/caffe2/serialize/crc.cc +++ b/caffe2/serialize/crc.cc @@ -5,7 +5,7 @@ extern "C" { // See: miniz.h -#if defined(USE_EXTERNAL_MZCRC) +#if defined(USE_EXTERNAL_MZCRC) mz_ulong mz_crc32(mz_ulong crc, const mz_uint8* ptr, size_t buf_len) { auto z = crc32_fast(ptr, buf_len, crc); return z; diff --git a/caffe2/serialize/crc_alt.h b/caffe2/serialize/crc_alt.h index e7c986ff89f..89db1d564b4 100644 --- a/caffe2/serialize/crc_alt.h +++ b/caffe2/serialize/crc_alt.h @@ -1,4 +1,4 @@ -#pragma once +#pragma once // ////////////////////////////////////////////////////////// // Crc32.h diff --git a/caffe2/video/CMakeLists.txt b/caffe2/video/CMakeLists.txt index 0e68d7abcf6..195c3c04a4c 100644 --- a/caffe2/video/CMakeLists.txt +++ b/caffe2/video/CMakeLists.txt @@ -34,7 +34,7 @@ if(USE_OPENCV AND OpenCV_FOUND AND USE_FFMPEG AND FFMPEG_FOUND) # ---[ GPU test files file(GLOB tmp *_gpu_test.cc) set(Caffe2_GPU_TEST_SRCS ${Caffe2_GPU_TEST_SRCS} ${tmp}) - + # ---[ HIP test files file(GLOB tmp hip/*_test.cc) set(Caffe2_HIP_TEST_SRCS ${Caffe2_HIP_TEST_SRCS} ${tmp}) diff --git a/cmake/Caffe2ConfigVersion.cmake.in b/cmake/Caffe2ConfigVersion.cmake.in index c13d3df1c09..bba448aab2b 100644 --- a/cmake/Caffe2ConfigVersion.cmake.in +++ b/cmake/Caffe2ConfigVersion.cmake.in @@ -1,5 +1,5 @@ set(PACKAGE_VERSION "@CAFFE2_VERSION@") - + # Check whether the requested PACKAGE_FIND_VERSION is compatible if("${PACKAGE_VERSION}" VERSION_LESS "${PACKAGE_FIND_VERSION}") set(PACKAGE_VERSION_COMPATIBLE FALSE) diff --git a/cmake/External/nnpack.cmake b/cmake/External/nnpack.cmake index b1dcd728e69..a41343cbb5f 100644 --- a/cmake/External/nnpack.cmake +++ b/cmake/External/nnpack.cmake @@ -14,7 +14,7 @@ endif() ############################################################################## ############################################################################## -# (1) MSVC - unsupported +# (1) MSVC - unsupported ############################################################################## if(MSVC) diff --git a/cmake/GoogleTestPatch.cmake b/cmake/GoogleTestPatch.cmake index d03ed7c923e..36018ace1d8 100644 --- a/cmake/GoogleTestPatch.cmake +++ b/cmake/GoogleTestPatch.cmake @@ -3,13 +3,13 @@ # Patch the cmake file # cmake -DFILENAME=internal_utils.cmake # -DBACKUP=internal_utils.cmake.bak -# -DREVERT=0 -# -P GoogleTestPatch.cmake +# -DREVERT=0 +# -P GoogleTestPatch.cmake # Revert the changes # 
cmake -DFILENAME=internal_utils.cmake # -DBACKUP=internal_utils.cmake.bak -# -DREVERT=1 -# -P GoogleTestPatch.cmake +# -DREVERT=1 +# -P GoogleTestPatch.cmake if(REVERT) diff --git a/cmake/Modules/FindMAGMA.cmake b/cmake/Modules/FindMAGMA.cmake index 2273c83108c..e88d5f311da 100644 --- a/cmake/Modules/FindMAGMA.cmake +++ b/cmake/Modules/FindMAGMA.cmake @@ -20,11 +20,11 @@ include(FindPackageHandleStandardArgs) SET(MAGMA_LIBRARIES) SET(MAGMA_INCLUDE_DIR) -FIND_LIBRARY(MAGMA_LIBRARIES magma +FIND_LIBRARY(MAGMA_LIBRARIES magma HINTS $ENV{MAGMA_HOME} /usr/local/magma PATH_SUFFIXES lib) -FIND_PATH(MAGMA_INCLUDE_DIR magma.h +FIND_PATH(MAGMA_INCLUDE_DIR magma.h HINTS $ENV{MAGMA_HOME} /usr/local/magma PATH_SUFFIXES include) diff --git a/cmake/Modules/FindVSX.cmake b/cmake/Modules/FindVSX.cmake index 74691f9240f..64f67ceeb0e 100644 --- a/cmake/Modules/FindVSX.cmake +++ b/cmake/Modules/FindVSX.cmake @@ -1,7 +1,7 @@ IF(CMAKE_SYSTEM_NAME MATCHES "Linux") message("-- ") - EXEC_PROGRAM(LD_SHOW_AUXV=1 ARGS "/bin/true" OUTPUT_VARIABLE bintrue) + EXEC_PROGRAM(LD_SHOW_AUXV=1 ARGS "/bin/true" OUTPUT_VARIABLE bintrue) if(bintrue MATCHES "AT_PLATFORM:[ \\t\\n\\r]*([a-zA-Z0-9_]+)[ \\t\\n\\r]*") if(CMAKE_MATCH_COUNT GREATER 0) string(TOLOWER ${CMAKE_MATCH_1} platform) @@ -12,7 +12,7 @@ IF(CMAKE_SYSTEM_NAME MATCHES "Linux") endif() endif() endif() - SET(VSX_CODE " #include + SET(VSX_CODE " #include int main() { float __attribute__((aligned(16))) vptr_y[8] = { 1.0f,2.f,3.f,4.f,4.f,3.f,2.f,1.f }; __vector float v_result = vec_add(vec_vsx_ld(0, vptr_y), vec_vsx_ld(16, vptr_y)); @@ -22,14 +22,14 @@ IF(CMAKE_SYSTEM_NAME MATCHES "Linux") SET(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS}) SET(CMAKE_REQUIRED_FLAGS "-mvsx") CHECK_C_SOURCE_COMPILES("${VSX_CODE}" C_VSX_FOUND) - CHECK_CXX_SOURCE_COMPILES("${VSX_CODE}" CXX_VSX_FOUND) + CHECK_CXX_SOURCE_COMPILES("${VSX_CODE}" CXX_VSX_FOUND) SET(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE}) if(CXX_VSX_FOUND) - message("-- VSX flag was set.") + message("-- VSX flag was set.") SET(CXX_VSX_FLAGS "${CXX_VSX_FLAGS} -mvsx" ) elseif(POWER_COMP) - message(WARNING "-- VSX flag was not set.") + message(WARNING "-- VSX flag was not set.") endif() - message("-- ") + message("-- ") endif() diff --git a/cmake/Modules/FindZMQ.cmake b/cmake/Modules/FindZMQ.cmake index 5522190f50f..43ce50b9495 100644 --- a/cmake/Modules/FindZMQ.cmake +++ b/cmake/Modules/FindZMQ.cmake @@ -7,7 +7,7 @@ # ZMQ_FOUND # ZMQ_INCLUDE_DIR # ZMQ_LIBRARIES -# ZMQ_VERSION_MAJOR +# ZMQ_VERSION_MAJOR find_path(ZMQ_INCLUDE_DIR NAMES zmq.h PATHS ${ZMQ_ROOT_DIR} ${ZMQ_ROOT_DIR}/include) diff --git a/cmake/Modules/FindvecLib.cmake b/cmake/Modules/FindvecLib.cmake index b10a4767dbc..3f215b5efe9 100644 --- a/cmake/Modules/FindvecLib.cmake +++ b/cmake/Modules/FindvecLib.cmake @@ -17,7 +17,7 @@ find_path(vecLib_INCLUDE_DIR vecLib.h PATHS /System/Library/Frameworks/Accelerate.framework/Versions/Current/${__veclib_include_suffix} /System/Library/${__veclib_include_suffix} /Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX10.9.sdk/System/Library/Frameworks/Accelerate.framework/Versions/Current/Frameworks/vecLib.framework/Headers/ - ${CMAKE_OSX_SYSROOT}/System/Library/Frameworks/Accelerate.framework/Versions/Current/${__veclib_include_suffix} + ${CMAKE_OSX_SYSROOT}/System/Library/Frameworks/Accelerate.framework/Versions/Current/${__veclib_include_suffix} NO_DEFAULT_PATH) include(FindPackageHandleStandardArgs) diff --git a/cmake/TorchConfig.cmake.in b/cmake/TorchConfig.cmake.in index 
1d0b98d6eb9..1870b963ce2 100644 --- a/cmake/TorchConfig.cmake.in +++ b/cmake/TorchConfig.cmake.in @@ -75,7 +75,7 @@ else() if(@USE_CUDA@) append_wholearchive_lib_if_found(torch_cuda c10_cuda) endif() - + # We need manually add dependent libraries when they are not linked into the # shared library. # TODO: this list might be incomplete. @@ -83,7 +83,7 @@ else() if(@BUILD_CAFFE2@) append_torchlib_if_found(Caffe2_perfkernels_avx512 Caffe2_perfkernels_avx2 Caffe2_perfkernels_avx) endif() - + if(@USE_NNPACK@) append_torchlib_if_found(nnpack) endif() diff --git a/cmake/iOS.cmake b/cmake/iOS.cmake index a43874b6ffe..d42ea4c9232 100644 --- a/cmake/iOS.cmake +++ b/cmake/iOS.cmake @@ -100,7 +100,7 @@ if(IOS_DEPLOYMENT_TARGET) set(XCODE_IOS_PLATFORM_VERSION_FLAGS "-m${XCODE_IOS_PLATFORM}-version-min=${IOS_DEPLOYMENT_TARGET}") endif() -# Hidden visibilty is required for cxx on iOS +# Hidden visibilty is required for cxx on iOS set(CMAKE_C_FLAGS_INIT "${XCODE_IOS_PLATFORM_VERSION_FLAGS}") set(CMAKE_CXX_FLAGS_INIT "${XCODE_IOS_PLATFORM_VERSION_FLAGS} -fvisibility-inlines-hidden") @@ -142,7 +142,7 @@ set(CMAKE_IOS_DEVELOPER_ROOT ${CMAKE_IOS_DEVELOPER_ROOT} CACHE PATH "Location of # Find and use the most recent iOS sdk unless specified manually with CMAKE_IOS_SDK_ROOT if(NOT DEFINED CMAKE_IOS_SDK_ROOT) file(GLOB _CMAKE_IOS_SDKS "${CMAKE_IOS_DEVELOPER_ROOT}/SDKs/*") - if(_CMAKE_IOS_SDKS) + if(_CMAKE_IOS_SDKS) list(SORT _CMAKE_IOS_SDKS) list(REVERSE _CMAKE_IOS_SDKS) list(GET _CMAKE_IOS_SDKS 0 CMAKE_IOS_SDK_ROOT) @@ -156,7 +156,7 @@ set(CMAKE_IOS_SDK_ROOT ${CMAKE_IOS_SDK_ROOT} CACHE PATH "Location of the selecte # Set the sysroot default to the most recent SDK set(CMAKE_OSX_SYSROOT ${CMAKE_IOS_SDK_ROOT} CACHE PATH "Sysroot used for iOS support") -# set the architecture for iOS +# set the architecture for iOS if(IOS_PLATFORM STREQUAL "OS") set(DEFAULT_IOS_ARCH "arm64") elseif(IOS_PLATFORM STREQUAL "SIMULATOR") diff --git a/codecov.yml b/codecov.yml index 0ed92247c77..f80f6633e0c 100644 --- a/codecov.yml +++ b/codecov.yml @@ -4,7 +4,7 @@ coverage: default: threshold: 1% patch: off - + codecov: notify: # Code coverage is collected by 6 configs: codecov_test[12], onnx[12] and windows_test[2] diff --git a/docker/caffe2/jenkins/common/install_rocm.sh b/docker/caffe2/jenkins/common/install_rocm.sh index 6294c7c3b06..dcf2ae7f5f7 100644 --- a/docker/caffe2/jenkins/common/install_rocm.sh +++ b/docker/caffe2/jenkins/common/install_rocm.sh @@ -63,7 +63,7 @@ install_centos() { rocprofiler-dev \ roctracer-dev } - + # Install Python packages depending on the base OS if [ -f /etc/lsb-release ]; then install_ubuntu diff --git a/docker/caffe2/readme.md b/docker/caffe2/readme.md index 48f605c6624..c0cf924d47e 100644 --- a/docker/caffe2/readme.md +++ b/docker/caffe2/readme.md @@ -4,7 +4,7 @@ To get the latest source, rerun the docker builds using the Dockerfiles. -Docker images at https://hub.docker.com/r/caffe2ai/caffe2/ are a few months old, but will be refreshed soon. +Docker images at https://hub.docker.com/r/caffe2ai/caffe2/ are a few months old, but will be refreshed soon. **Build like:** `docker build -t caffe2:cuda8-cudnn6-all-options .` diff --git a/docs/caffe2/README.md b/docs/caffe2/README.md index e780d6735a2..63a73082dc8 100644 --- a/docs/caffe2/README.md +++ b/docs/caffe2/README.md @@ -85,7 +85,7 @@ If you want to push these to caffe2.ai, go to your docs checkout: 1. `cd c2docs` 2. Copy the files generated in build/docs to your gh-pages branch, commit, and push. 3. 
`doxygen-c` and `doxygen-python` both go in the root folder of `gh-pages` -4. `operators-catalogue.md` goes in `_docs` +4. `operators-catalogue.md` goes in `_docs` ### Running Doxygen Manually diff --git a/docs/cpp/source/notes/tensor_cuda_stream.rst b/docs/cpp/source/notes/tensor_cuda_stream.rst index 9d9e14704ff..9ecf86f51fe 100644 --- a/docs/cpp/source/notes/tensor_cuda_stream.rst +++ b/docs/cpp/source/notes/tensor_cuda_stream.rst @@ -1,4 +1,4 @@ -Tensor CUDA Stream API +Tensor CUDA Stream API ====================== A `CUDA Stream`_ is a linear sequence of execution that belongs to a specific CUDA device. @@ -17,7 +17,7 @@ Pytorch's C++ API provides the following ways to acquire CUDA stream: 1. Acquire a new stream from the CUDA stream pool, streams are preallocated from the pool and returned in a round-robin fashion. .. code-block:: cpp - + CUDAStream getStreamFromPool(const bool isHighPriority = false, DeviceIndex device = -1); .. tip:: @@ -28,21 +28,21 @@ Pytorch's C++ API provides the following ways to acquire CUDA stream: 2. Acquire the default CUDA stream for the passed CUDA device, or for the current device if no device index is passed. .. code-block:: cpp - + CUDAStream getDefaultCUDAStream(DeviceIndex device_index = -1); .. tip:: - + The default stream is where most computation occurs when you aren't explicitly using streams. -3. Acquire the current CUDA stream, for the CUDA device with index ``device_index``, or for the current device if no device index is passed. +3. Acquire the current CUDA stream, for the CUDA device with index ``device_index``, or for the current device if no device index is passed. .. code-block:: cpp - + CUDAStream getCurrentCUDAStream(DeviceIndex device_index = -1); .. tip:: - + The current CUDA stream will usually be the default CUDA stream for the device, but it may be different if someone called ``setCurrentCUDAStream`` or used ``StreamGuard`` or ``CUDAStreamGuard``. @@ -83,14 +83,14 @@ CUDA Stream Usage Examples // create a tensor on device 0 torch::Tensor tensor0 = torch::ones({2, 2}, torch::device(torch::kCUDA)); - // get a new CUDA stream from CUDA stream pool on device 0 + // get a new CUDA stream from CUDA stream pool on device 0 at::cuda::CUDAStream myStream = at::cuda::getStreamFromPool(); // set current CUDA stream from default stream to `myStream` on device 0 at::cuda::setCurrentCUDAStream(myStream); // sum() on tensor0 uses `myStream` as current CUDA stream tensor0.sum(); - // get the default CUDA stream on device 0 + // get the default CUDA stream on device 0 at::cuda::CUDAStream defaultStream = at::cuda::getDefaultCUDAStream(); // set current CUDA stream back to default CUDA stream on devide 0 at::cuda::setCurrentCUDAStream(defaultStream); @@ -109,7 +109,7 @@ CUDA Stream Usage Examples // set the current CUDA stream to `myStream` within the scope using CUDA stream guard { at::cuda::CUDAStreamGuard guard(myStream); - // current CUDA stream is `myStream` from here till the end of bracket. + // current CUDA stream is `myStream` from here till the end of bracket. // sum() on tensor0 uses `myStream` as current CUDA stream tensor0.sum(); } @@ -118,10 +118,10 @@ CUDA Stream Usage Examples tensor0.sum(); .. attention:: - + Above code is running on the same CUDA device. `setCurrentCUDAStream` will always set current CUDA stream on current device, but note that `setCurrentCUDASteram` actually set current stream on the device of passed in CUDA stream. - + 2. Acquiring and setting CUDA streams on multiple devices. 
@@ -138,7 +138,7 @@ CUDA Stream Usage Examples // set current CUDA stream to `myStream1` on device 1 at::cuda::setCurrentCUDAStream(myStream1); - // create a tensor on device 0, no need to specify device index since + // create a tensor on device 0, no need to specify device index since // current device index is 0 torch::Tensor tensor0 = torch::ones({2, 2}, torch::device(at::kCUDA)); // sum() on tensor0 use `myStream0` as current CUDA stream on device 0 @@ -154,23 +154,23 @@ CUDA Stream Usage Examples } // current device is reset to device 0 after device_guard is destroyed - + // acquire a new CUDA stream on device 1 at::cuda::CUDAStream myStream1_1 = at::cuda::getStreamFromPool(false, 1); // create a new tensor on device 1 torch::Tensor tensor1 = torch::ones({2, 2}, torch::device({torch::kCUDA, 1})); - // change the current device index to 1 and current CUDA stream on device 1 + // change the current device index to 1 and current CUDA stream on device 1 // to `myStream1_1` using CUDA stream guard within a scope { at::cuda::CUDAStreamGuard stream_guard(myStream1_1); // sum() on tensor1 use `myStream1_1` as current CUDA stream on device 1 - tensor1.sum(); + tensor1.sum(); } - // current device is reset to device 0 and current CUDA stream on device 1 is + // current device is reset to device 0 and current CUDA stream on device 1 is // reset to `myStream1` - + // sum() on tensor1 uses `myStream1` as current CUDA stream on device 1 tensor1.sum(); @@ -181,7 +181,7 @@ CUDA Stream Usage Examples // This example shows how to use CUDA multistream guard to set // two streams on two devices at the same time. - + // create two tensor, one on device 0, one on device 1 torch::Tensor tensor0 = torch::ones({2, 2}, torch::device({torch::kCUDA, 0})); torch::Tensor tensor1 = torch::ones({2, 2}, torch::device({torch::kCUDA, 1})); @@ -190,7 +190,7 @@ CUDA Stream Usage Examples at::cuda::CUDAStream myStream0 = at::cuda::getStreamFromPool(false, 0); at::cuda::CUDAStream myStream1 = at::cuda::getStreamFromPool(false, 1); - // set current CUDA stream on device 0 to `myStream0` and + // set current CUDA stream on device 0 to `myStream0` and // set current CUDA stream on device 1 to `myStream1` CUDA using multistream guard { at::cuda::CUDAMultiStreamGuard multi_guard({myStream0, myStream1}); @@ -202,7 +202,7 @@ CUDA Stream Usage Examples } // current CUDA stream on device 0 is reset to default CUDA stream on device 0 - // current CUDA stream on device 1 is reset to default CUDA stream on device 1 + // current CUDA stream on device 1 is reset to default CUDA stream on device 1 // sum() on tensor0 uses default CUDA stream as current CUDA stream on device 0 tensor0.sum(); @@ -211,7 +211,7 @@ CUDA Stream Usage Examples .. attention:: ``CUDAMultiStreamGuard`` does not change current device index, it only changes the stream on - each passed in stream's device. Other than scope controlling, this guard is equivalent to + each passed in stream's device. Other than scope controlling, this guard is equivalent to calling ``setCurrentCUDAStream`` on each passed in stream. 4. 
A skeleton example for handling CUDA streams on multiple devices @@ -225,9 +225,9 @@ CUDA Stream Usage Examples // Usage 0: acquire CUDA stream and set current CUDA stream with `setCurrentCUDAStream` // Create a CUDA stream vector `streams0` on device 0 - std::vector streams0 = + std::vector streams0 = {at::cuda::getDefaultCUDAStream(), at::cuda::getStreamFromPool()}; - // set current stream as `streams0[0]` on device 0 + // set current stream as `streams0[0]` on device 0 at::cuda::setCurrentCUDAStream(streams0[0]); // create a CUDA stream vector `streams1` on device using CUDA device guard @@ -238,31 +238,31 @@ CUDA Stream Usage Examples streams1.push_back(at::cuda::getDefaultCUDAStream()); streams1.push_back(at::cuda::getStreamFromPool()); } - // device index is reset to 0 after device_guard is destroyed + // device index is reset to 0 after device_guard is destroyed // set current stream as `streams1[0]` on device 1 at::cuda::setCurrentCUDAStream(streams1[0]); - + // Usage 1: use CUDA device guard to change the current device index only - { + { at::cuda::CUDAGuard device_guard(1); // current device index is changed to 1 within scope // current CUDA stream is still `streams1[0]` on device 1, no change } // current device index is reset to 0 after `device_guard` is destroyed - - + + // Usage 2: use CUDA stream guard to change both current device index and current CUDA stream. { at::cuda::CUDAStreamGuard stream_guard(streams1[1]); - + // current device index and current CUDA stream are set to 1 and `streams1[1]` within scope } - // current device index and current CUDA stream are reset to 0 and `streams0[0]` after + // current device index and current CUDA stream are reset to 0 and `streams0[0]` after // stream_guard is destroyed - + // Usage 3: use CUDA multi-stream guard to change multiple streams on multiple devices { diff --git a/docs/source/backends.rst b/docs/source/backends.rst index a0752dfd771..3136c4ee782 100644 --- a/docs/source/backends.rst +++ b/docs/source/backends.rst @@ -40,7 +40,7 @@ torch.backends.cuda .. method:: clear() Clears the cuFFT plan cache. - + torch.backends.cudnn ^^^^^^^^^^^^^^^^^^^^ diff --git a/docs/source/cudnn_persistent_rnn.rst b/docs/source/cudnn_persistent_rnn.rst index 986c3303601..96c5d1fe001 100644 --- a/docs/source/cudnn_persistent_rnn.rst +++ b/docs/source/cudnn_persistent_rnn.rst @@ -4,9 +4,9 @@ .. note:: If the following conditions are satisfied: - 1) cudnn is enabled, - 2) input data is on the GPU - 3) input data has dtype ``torch.float16`` + 1) cudnn is enabled, + 2) input data is on the GPU + 3) input data has dtype ``torch.float16`` 4) V100 GPU is used, 5) input data is not in ``PackedSequence`` format - persistent algorithm can be selected to improve performance. + persistent algorithm can be selected to improve performance. diff --git a/docs/source/cudnn_rnn_determinism.rst b/docs/source/cudnn_rnn_determinism.rst index 6568b73ef09..c002925a8c3 100644 --- a/docs/source/cudnn_rnn_determinism.rst +++ b/docs/source/cudnn_rnn_determinism.rst @@ -1,7 +1,7 @@ .. warning:: There are known non-determinism issues for RNN functions on some versions of cuDNN and CUDA. You can enforce deterministic behavior by setting the following environment variables: - + On CUDA 10.1, set environment variable ``CUDA_LAUNCH_BLOCKING=1``. This may affect performance. 
diff --git a/docs/source/data.rst b/docs/source/data.rst index b0e2c260399..8559719d93c 100644 --- a/docs/source/data.rst +++ b/docs/source/data.rst @@ -337,9 +337,9 @@ Randomness in multi-process data loading """""""""""""""""""""""""""""""""""""""""" By default, each worker will have its PyTorch seed set to ``base_seed + worker_id``, -where ``base_seed`` is a long generated by main process using its RNG (thereby, -consuming a RNG state mandatorily). However, seeds for other libraries may be -duplicated upon initializing workers (e.g., NumPy), causing each worker to return +where ``base_seed`` is a long generated by main process using its RNG (thereby, +consuming a RNG state mandatorily). However, seeds for other libraries may be +duplicated upon initializing workers (e.g., NumPy), causing each worker to return identical random numbers. (See :ref:`this section ` in FAQ.). In :attr:`worker_init_fn`, you may access the PyTorch seed set for each worker diff --git a/docs/source/multiprocessing.rst b/docs/source/multiprocessing.rst index 3c699c5dd75..2d324f3eead 100644 --- a/docs/source/multiprocessing.rst +++ b/docs/source/multiprocessing.rst @@ -30,7 +30,7 @@ Sharing CUDA tensors -------------------- Sharing CUDA tensors between processes is supported only in Python 3, using -a ``spawn`` or ``forkserver`` start methods. +a ``spawn`` or ``forkserver`` start methods. Unlike CPU tensors, the sending process is required to keep the original tensor diff --git a/docs/source/named_tensor.rst b/docs/source/named_tensor.rst index 107c9816fa9..02113b9c987 100644 --- a/docs/source/named_tensor.rst +++ b/docs/source/named_tensor.rst @@ -5,9 +5,9 @@ Named Tensors ============= -Named Tensors allow users to give explicit names to tensor dimensions. -In most cases, operations that take dimension parameters will accept -dimension names, avoiding the need to track dimensions by position. +Named Tensors allow users to give explicit names to tensor dimensions. +In most cases, operations that take dimension parameters will accept +dimension names, avoiding the need to track dimensions by position. In addition, named tensors use names to automatically check that APIs are being used correctly at runtime, providing extra safety. Names can also be used to rearrange dimensions, for example, to support diff --git a/docs/source/notes/ddp.rst b/docs/source/notes/ddp.rst index b7b2676bb20..1cfd351a616 100644 --- a/docs/source/notes/ddp.rst +++ b/docs/source/notes/ddp.rst @@ -146,9 +146,9 @@ ProcessGroup - `ProcessGroup.hpp `__: contains the abstract API of all process group implementations. The ``c10d`` library provides 3 implementations out of the box, namely, - `ProcessGroupGloo`, `ProcessGroupNCCL`, and `ProcessGroupMPI`. - ``DistributedDataParallel`` uses ``ProcessGroup::broadcast()`` to send - model states from the process with rank 0 to others during initialization + `ProcessGroupGloo`, `ProcessGroupNCCL`, and `ProcessGroupMPI`. + ``DistributedDataParallel`` uses ``ProcessGroup::broadcast()`` to send + model states from the process with rank 0 to others during initialization and ``ProcessGroup::allreduce()`` to sum gradients. 
diff --git a/docs/source/notes/randomness.rst b/docs/source/notes/randomness.rst index 558ce6d3c7e..3c92a1e9180 100644 --- a/docs/source/notes/randomness.rst +++ b/docs/source/notes/randomness.rst @@ -111,7 +111,7 @@ deterministic implementation will be used:: tensor([[[ 1.1900, -2.3409], [ 0.4796, 0.8003]], [[ 0.1509, 1.8027], - [ 0.0333, -1.1444]]], device='cuda:0') + [ 0.0333, -1.1444]]], device='cuda:0') Furthermore, if you are using CUDA tensors, and your CUDA version is 10.2 or greater, you should set the environment variable `CUBLAS_WORKSPACE_CONFIG` according to CUDA documentation: diff --git a/docs/source/notes/serialization.rst b/docs/source/notes/serialization.rst index d4ac032383e..ca2f423be70 100644 --- a/docs/source/notes/serialization.rst +++ b/docs/source/notes/serialization.rst @@ -97,8 +97,8 @@ in the tensor: Since the cloned tensors are independent of each other, however, they have none of the view relationships the original tensors did. If both file size and view relationships are important when saving tensors smaller than their -storage objects, then care must be taken to construct new tensors that minimize -the size of their storage objects but still have the desired view relationships +storage objects, then care must be taken to construct new tensors that minimize +the size of their storage objects but still have the desired view relationships before saving. Saving and loading torch.nn.Modules @@ -270,7 +270,7 @@ explicitly described in PyTorch’s `release notes `_, and modules relying on functionality that has changed may need to be updated to continue working properly. In limited cases, detailed below, PyTorch will -preserve the historic behavior of serialized ScriptModules so they do not require +preserve the historic behavior of serialized ScriptModules so they do not require an update. torch.div performing integer division diff --git a/docs/source/notes/windows.rst b/docs/source/notes/windows.rst index 15aaf1b65f8..49689282339 100644 --- a/docs/source/notes/windows.rst +++ b/docs/source/notes/windows.rst @@ -30,7 +30,7 @@ MKL and MAGMA. Here are the steps to build with them. set CONFIG=release curl -k https://s3.amazonaws.com/ossci-windows/magma_2.5.4_%CUDA_PREFIX%_%CONFIG%.7z -o magma.7z 7z x -aoa magma.7z -omagma - + REM Setting essential environment variables set "CMAKE_INCLUDE_PATH=%cd%\mkl\include" set "LIB=%cd%\mkl\lib;%LIB%" @@ -44,7 +44,7 @@ As an alternative, we can use ``Ninja`` to parallelize CUDA build tasks. It can be used by typing only a few lines of code. .. code-block:: bat - + REM Let's install ninja first. pip install ninja @@ -65,7 +65,7 @@ Extension CFFI Extension ^^^^^^^^^^^^^^ -The support for CFFI Extension is very experimental. There're +The support for CFFI Extension is very experimental. There're generally two steps to enable it under Windows. First, specify additional ``libraries`` in ``Extension`` @@ -84,7 +84,7 @@ object to make it build on Windows. libraries=['ATen', '_C'] # Append cuda libraries when necessary, like cudart ) -Second, here is a workground for "unresolved external symbol +Second, here is a workground for "unresolved external symbol state caused by ``extern THCState *state;``" Change the source code from C to C++. An example is listed below. @@ -166,7 +166,7 @@ Import error The problem is caused by the missing of the essential files. Actually, we include almost all the essential files that PyTorch need for the conda -package except VC2017 redistributable and some mkl libraries. 
+package except VC2017 redistributable and some mkl libraries. You can resolve this by typing the following command. .. code-block:: bat @@ -174,7 +174,7 @@ You can resolve this by typing the following command. conda install -c peterjc123 vc vs2017_runtime conda install mkl_fft intel_openmp numpy mkl -As for the wheels package, since we didn't pack some libraries and VS2017 +As for the wheels package, since we didn't pack some libraries and VS2017 redistributable files in, please make sure you install them manually. The `VS 2017 redistributable installer `_ can be downloaded. @@ -255,7 +255,7 @@ Multiprocessing error "Broken pipe" This issue happens when the child process ends before the parent process finishes sending data. There may be something wrong with your code. You -can debug your code by reducing the ``num_worker`` of +can debug your code by reducing the ``num_worker`` of :class:`~torch.utils.data.DataLoader` to zero and see if the issue persists. Multiprocessing error "driver shut down" @@ -282,7 +282,7 @@ CUDA IPC operations They are not supported on Windows. Something like doing multiprocessing on CUDA tensors cannot succeed, there are two alternatives for this. -1. Don't use ``multiprocessing``. Set the ``num_worker`` of +1. Don't use ``multiprocessing``. Set the ``num_worker`` of :class:`~torch.utils.data.DataLoader` to zero. 2. Share CPU tensors instead. Make sure your custom diff --git a/docs/source/onnx.rst b/docs/source/onnx.rst index a1f88a64c83..6709c56e7fe 100644 --- a/docs/source/onnx.rst +++ b/docs/source/onnx.rst @@ -278,7 +278,7 @@ Using dictionaries to handle Named Arguments as model inputs ------------------------------------------------------------ There are two ways to handle models which consist of named parameters or keyword arguments as inputs: - + * The first method is to pass all the inputs in the same order as required by the model and pass None values for the keyword arguments that do not require a value to be passed @@ -288,23 +288,23 @@ There are two ways to handle models which consist of named parameters or keyword For example, in the model: :: - class Model(torch.nn.Module): - def forward(self, x, y=None, z=None): - if y is not None: - return x + y - if z is not None: - return x + z - return x - m = Model() + class Model(torch.nn.Module): + def forward(self, x, y=None, z=None): + if y is not None: + return x + y + if z is not None: + return x + z + return x + m = Model() x = torch.randn(2, 3) - z = torch.randn(2, 3) + z = torch.randn(2, 3) There are two ways of exporting the model: -* Not using a dictionary for the keyword arguments and passing all the inputs in the same order +* Not using a dictionary for the keyword arguments and passing all the inputs in the same order as required by the model :: - torch.onnx.export(model, (x, None, z), ‘test.onnx’) + torch.onnx.export(model, (x, None, z), ‘test.onnx’) * Using a dictionary to represent the keyword arguments. This dictionary is always passed in addition to the non-keyword arguments and is always the last argument in the args tuple. :: @@ -318,25 +318,25 @@ empty or no dictionary. For example, :: or torch.onnx.export(model, (x, ), ‘test.onnx’) -An exception to this rule are cases in which the last input is also of a dictionary type. -In these cases it is mandatory to have an empty dictionary as the last argument in the +An exception to this rule are cases in which the last input is also of a dictionary type. 
+In these cases it is mandatory to have an empty dictionary as the last argument in the args tuple. For example, :: - class Model(torch.nn.Module): - def forward(self, k, x): - ... - return x - m = Model() - k = torch.randn(2, 3)   + class Model(torch.nn.Module): + def forward(self, k, x): + ... + return x + m = Model() + k = torch.randn(2, 3)   x = {torch.tensor(1.): torch.randn(2, 3)} -Without the presence of the empty dictionary, the export call assumes that the -‘x’ input is intended to represent the optional dictionary consisting of named arguments. -In order to prevent this from being an issue a constraint is placed to provide an empty -dictionary as the last input in the tuple args in such cases. -The new call would look like this. :: +Without the presence of the empty dictionary, the export call assumes that the +‘x’ input is intended to represent the optional dictionary consisting of named arguments. +In order to prevent this from being an issue a constraint is placed to provide an empty +dictionary as the last input in the tuple args in such cases. +The new call would look like this. :: - torch.onnx.export(model, (k, x, {}), ‘test.onnx’) + torch.onnx.export(model, (k, x, {}), ‘test.onnx’) Indexing diff --git a/docs/source/optim.rst b/docs/source/optim.rst index 936206a5e2d..e4384cc5d7f 100644 --- a/docs/source/optim.rst +++ b/docs/source/optim.rst @@ -179,10 +179,10 @@ should write your code this way: Stochastic Weight Averaging --------------------------- -:mod:`torch.optim.swa_utils` implements Stochastic Weight Averaging (SWA). In particular, -:class:`torch.optim.swa_utils.AveragedModel` class implements SWA models, -:class:`torch.optim.swa_utils.SWALR` implements the SWA learning rate scheduler and -:func:`torch.optim.swa_utils.update_bn` is a utility function used to update SWA batch +:mod:`torch.optim.swa_utils` implements Stochastic Weight Averaging (SWA). In particular, +:class:`torch.optim.swa_utils.AveragedModel` class implements SWA models, +:class:`torch.optim.swa_utils.SWALR` implements the SWA learning rate scheduler and +:func:`torch.optim.swa_utils.update_bn` is a utility function used to update SWA batch normalization statistics at the end of training. SWA has been proposed in `Averaging Weights Leads to Wider Optima and Better Generalization`_. @@ -192,12 +192,12 @@ SWA has been proposed in `Averaging Weights Leads to Wider Optima and Better Gen Constructing averaged models ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -`AveragedModel` class serves to compute the weights of the SWA model. You can create an +`AveragedModel` class serves to compute the weights of the SWA model. You can create an averaged model by running: >>> swa_model = AveragedModel(model) -Here the model ``model`` can be an arbitrary :class:`torch.nn.Module` object. ``swa_model`` +Here the model ``model`` can be an arbitrary :class:`torch.nn.Module` object. ``swa_model`` will keep track of the running averages of the parameters of the ``model``. To update these averages, you can use the :func:`update_parameters` function: @@ -207,15 +207,15 @@ averages, you can use the :func:`update_parameters` function: SWA learning rate schedules ^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Typically, in SWA the learning rate is set to a high constant value. :class:`SWALR` is a -learning rate scheduler that anneals the learning rate to a fixed value, and then keeps it -constant. For example, the following code creates a scheduler that linearly anneals the +Typically, in SWA the learning rate is set to a high constant value. 
:class:`SWALR` is a +learning rate scheduler that anneals the learning rate to a fixed value, and then keeps it +constant. For example, the following code creates a scheduler that linearly anneals the learning rate from its initial value to 0.05 in 5 epochs within each parameter group: >>> swa_scheduler = torch.optim.swa_utils.SWALR(optimizer, \ >>> anneal_strategy="linear", anneal_epochs=5, swa_lr=0.05) -You can also use cosine annealing to a fixed value instead of linear annealing by setting +You can also use cosine annealing to a fixed value instead of linear annealing by setting ``anneal_strategy="cos"``. @@ -225,13 +225,13 @@ Taking care of batch normalization :func:`update_bn` is a utility function that allows to compute the batchnorm statistics for the SWA model on a given dataloader ``loader`` at the end of training: ->>> torch.optim.swa_utils.update_bn(loader, swa_model) +>>> torch.optim.swa_utils.update_bn(loader, swa_model) :func:`update_bn` applies the ``swa_model`` to every element in the dataloader and computes the activation statistics for each batch normalization layer in the model. .. warning:: - :func:`update_bn` assumes that each batch in the dataloader ``loader`` is either a tensors or a list of + :func:`update_bn` assumes that each batch in the dataloader ``loader`` is either a tensors or a list of tensors where the first element is the tensor that the network ``swa_model`` should be applied to. If your dataloader has a different structure, you can update the batch normalization statistics of the ``swa_model`` by doing a forward pass with the ``swa_model`` on each element of the dataset. @@ -240,8 +240,8 @@ statistics for each batch normalization layer in the model. Custom averaging strategies ^^^^^^^^^^^^^^^^^^^^^^^^^^^ -By default, :class:`torch.optim.swa_utils.AveragedModel` computes a running equal average of -the parameters that you provide, but you can also use custom averaging functions with the +By default, :class:`torch.optim.swa_utils.AveragedModel` computes a running equal average of +the parameters that you provide, but you can also use custom averaging functions with the ``avg_fn`` parameter. In the following example ``ema_model`` computes an exponential moving average. Example: @@ -255,8 +255,8 @@ Putting it all together ^^^^^^^^^^^^^^^^^^^^^^^ In the example below, ``swa_model`` is the SWA model that accumulates the averages of the weights. -We train the model for a total of 300 epochs and we switch to the SWA learning rate schedule -and start to collect SWA averages of the parameters at epoch 160: +We train the model for a total of 300 epochs and we switch to the SWA learning rate schedule +and start to collect SWA averages of the parameters at epoch 160: >>> loader, optimizer, model, loss_fn = ... 
>>> swa_model = torch.optim.swa_utils.AveragedModel(model) @@ -274,8 +274,8 @@ and start to collect SWA averages of the parameters at epoch 160: >>> swa_scheduler.step() >>> else: >>> scheduler.step() ->>> +>>> >>> # Update bn statistics for the swa_model at the end >>> torch.optim.swa_utils.update_bn(loader, swa_model) ->>> # Use swa_model to make predictions on test data +>>> # Use swa_model to make predictions on test data >>> preds = swa_model(test_input) diff --git a/docs/source/pipeline.rst b/docs/source/pipeline.rst index 6f52e622914..08274eee9fe 100644 --- a/docs/source/pipeline.rst +++ b/docs/source/pipeline.rst @@ -3,9 +3,9 @@ Pipeline Parallelism ==================== -Pipeline parallelism was original introduced in the -`Gpipe `__ paper and is an efficient -technique to train large models on multiple GPUs. +Pipeline parallelism was original introduced in the +`Gpipe `__ paper and is an efficient +technique to train large models on multiple GPUs. .. warning :: Pipeline Parallelism is experimental and subject to change. @@ -13,32 +13,32 @@ technique to train large models on multiple GPUs. Model Parallelism using multiple GPUs ------------------------------------- -Typically for large models which don't fit on a single GPU, model parallelism -is employed where certain parts of the model are placed on different GPUs. -Although, if this is done naively for sequential models, the training process -suffers from GPU under utilization since only one GPU is active at one time as +Typically for large models which don't fit on a single GPU, model parallelism +is employed where certain parts of the model are placed on different GPUs. +Although, if this is done naively for sequential models, the training process +suffers from GPU under utilization since only one GPU is active at one time as shown in the figure below: .. figure:: _static/img/pipeline_parallelism/no_pipe.png - The figure represents a model with 4 layers placed on 4 different GPUs - (vertical axis). The horizontal axis represents training this model through - time demonstrating that only 1 GPU is utilized at a time + The figure represents a model with 4 layers placed on 4 different GPUs + (vertical axis). The horizontal axis represents training this model through + time demonstrating that only 1 GPU is utilized at a time (`image source `__). Pipelined Execution ------------------- -To alleviate this problem, pipeline parallelism splits the input minibatch into -multiple microbatches and pipelines the execution of these microbatches across +To alleviate this problem, pipeline parallelism splits the input minibatch into +multiple microbatches and pipelines the execution of these microbatches across multiple GPUs. This is outlined in the figure below: .. figure:: _static/img/pipeline_parallelism/pipe.png - The figure represents a model with 4 layers placed on 4 different GPUs - (vertical axis). The horizontal axis represents training this model through - time demonstrating that the GPUs are utilized much more efficiently. - However, there still exists a bubble (as demonstrated in the figure) where + The figure represents a model with 4 layers placed on 4 different GPUs + (vertical axis). The horizontal axis represents training this model through + time demonstrating that the GPUs are utilized much more efficiently. + However, there still exists a bubble (as demonstrated in the figure) where certain GPUs are not utilized. (`image source `__). 
@@ -50,11 +50,11 @@ Pipe APIs in PyTorch Skip connections ^^^^^^^^^^^^^^^^ -Certain models like ResNeXt are not completely sequential and have skip -connections between layers. Naively implementing as part of pipeling -parallelism would imply that we need to copy outputs for certain layers through -multiple GPUs till we eventually reach the GPU where the layer for the skip -connection resides. To avoid this copy overhead, we provide APIs below to stash +Certain models like ResNeXt are not completely sequential and have skip +connections between layers. Naively implementing as part of pipeling +parallelism would imply that we need to copy outputs for certain layers through +multiple GPUs till we eventually reach the GPU where the layer for the skip +connection resides. To avoid this copy overhead, we provide APIs below to stash and pop Tensors in different layers of the model. .. autofunction:: torch.distributed.pipeline.sync.skip.skippable.skippable @@ -65,7 +65,7 @@ and pop Tensors in different layers of the model. Acknowledgements ---------------- -The implementation for pipeline parallelism is based on `fairscale's pipe implementation `__ and -`torchgpipe `__. We would like to -thank both teams for their contributions and guidance towards bringing pipeline +The implementation for pipeline parallelism is based on `fairscale's pipe implementation `__ and +`torchgpipe `__. We would like to +thank both teams for their contributions and guidance towards bringing pipeline parallelism into PyTorch. diff --git a/docs/source/rpc.rst b/docs/source/rpc.rst index 2b068f730b6..0ec4ff7cf8b 100644 --- a/docs/source/rpc.rst +++ b/docs/source/rpc.rst @@ -259,10 +259,10 @@ details. RemoteModule ------------ -``RemoteModule`` is an easy way to create an nn.Module remotely on a different -process. The actual module resides on a remote host, but the local host has a -handle to this module and invoke this module similar to a regular nn.Module. -The invocation however incurs RPC calls to the remote end and can be performed +``RemoteModule`` is an easy way to create an nn.Module remotely on a different +process. The actual module resides on a remote host, but the local host has a +handle to this module and invoke this module similar to a regular nn.Module. +The invocation however incurs RPC calls to the remote end and can be performed asynchronously if needed via additional APIs supported by RemoteModule. .. autoclass:: torch.distributed.nn.api.remote_module.RemoteModule diff --git a/docs/source/rpc/rref.rst b/docs/source/rpc/rref.rst index 3d519711103..212669f42fd 100644 --- a/docs/source/rpc/rref.rst +++ b/docs/source/rpc/rref.rst @@ -42,7 +42,7 @@ Assumptions RRef protocol is designed with the following assumptions. - **Transient Network Failures**: The RRef design handles transient - network failures by retrying messages. It cannot handle node crashes or + network failures by retrying messages. It cannot handle node crashes or permanent network partitions. When those incidents occur, the application should take down all workers, revert to the previous checkpoint, and resume training. @@ -136,7 +136,7 @@ owner before Y's messages. the owner will learn of Z's deletion befores knowing Y exists. Nevertheless, this does not cause any problem. Because, at least one of Y's ancestors will be alive (A) and it will prevent the owner from deleting the ``OwnerRRef``. 
More specifically, if the -owner does not know Y, A cannot be deleted due to **G2**, and the owner knows A +owner does not know Y, A cannot be deleted due to **G2**, and the owner knows A since it is A's parent. Things get a little trickier if the RRef is created on a user: diff --git a/docs/source/torch.nn.quantized.dynamic.rst b/docs/source/torch.nn.quantized.dynamic.rst index 1302d73818a..7081d74abba 100644 --- a/docs/source/torch.nn.quantized.dynamic.rst +++ b/docs/source/torch.nn.quantized.dynamic.rst @@ -22,7 +22,7 @@ GRUCell ~~~~~~~~~~~~~~~ .. autoclass:: GRUCell :members: - + RNNCell ~~~~~~~~~~~~~~~ .. autoclass:: RNNCell diff --git a/docs/source/torch.quantization.rst b/docs/source/torch.quantization.rst index 6397c9c4f4b..1ae60a7c9de 100644 --- a/docs/source/torch.quantization.rst +++ b/docs/source/torch.quantization.rst @@ -65,4 +65,4 @@ Debugging utilities :nosignatures: nn.intrinsic - + diff --git a/ios/README.md b/ios/README.md index cd2d31afa12..4610aa46f2b 100644 --- a/ios/README.md +++ b/ios/README.md @@ -17,9 +17,9 @@ For Objective-C developers, simply import the umbrella header #import ``` -For Swift developers, you need to create an Objective-C class as a bridge to call the C++ APIs. We highly recommend you to follow the [Image Classification](https://github.com/pytorch/ios-demo-app/tree/master/PyTorchDemo) demo where you can find out how C++, Objective-C and Swift work together. +For Swift developers, you need to create an Objective-C class as a bridge to call the C++ APIs. We highly recommend you to follow the [Image Classification](https://github.com/pytorch/ios-demo-app/tree/master/PyTorchDemo) demo where you can find out how C++, Objective-C and Swift work together. -### Disable Bitcode +### Disable Bitcode Since PyTorch is not yet built with bitcode support, you need to disable bitcode for your target by selecting the **Build Settings**, searching for **Enable Bitcode** and set the value to **No**. diff --git a/ios/TestApp/bootstrap.sh b/ios/TestApp/bootstrap.sh index 4708ca643ca..3836e9acc2a 100755 --- a/ios/TestApp/bootstrap.sh +++ b/ios/TestApp/bootstrap.sh @@ -21,7 +21,7 @@ bootstrap() { if ! [ -x "$(command -v ios-deploy)" ]; then echo 'Error: ios-deploy is not installed.' exit 1 - fi + fi ios-deploy -c -t 1 if [ "$?" -ne "0" ]; then echo 'Error: No device connected. Please connect your device via USB then re-run the script' @@ -33,18 +33,18 @@ bootstrap() { XCODE_PROJ_PATH="./TestApp.xcodeproj" XCODE_TARGET="TestApp" XCODE_BUILD="./build" - if [ ! -f "./.config" ]; then + if [ ! -f "./.config" ]; then touch .config echo "" >> .config else source .config fi - if [ -z "${TEAM_ID}" ]; then + if [ -z "${TEAM_ID}" ]; then reply=$(bash -c 'read -r -p "Team Id:" tmp; echo $tmp') TEAM_ID="${reply}" echo "TEAM_ID=${TEAM_ID}" >> .config fi - if [ -z "${PROFILE}" ]; then + if [ -z "${PROFILE}" ]; then reply=$(bash -c 'read -r -p "Provisioning Profile:" tmp; echo $tmp') PROFILE="${reply}" echo "PROFILE=${PROFILE}" >> .config @@ -52,7 +52,7 @@ bootstrap() { if [ -d "${XCODE_BUILD}" ]; then echo "found the old XCode build, remove it" rm -rf "${XCODE_BUILD}" - fi + fi cd "${BENCHMARK_DIR}" echo "Generating model" python trace_model.py @@ -62,7 +62,7 @@ bootstrap() { if ! [ -x "$(command -v xcodebuild)" ]; then echo 'Error: xcodebuild is not installed.' 
exit 1 - fi + fi echo "Running xcodebuild" xcodebuild clean build -project ${XCODE_PROJ_PATH} \ -target ${XCODE_TARGET} \ @@ -78,7 +78,7 @@ while [[ $# -gt 1 ]] do option="$1" value="$2" -case $option in +case $option in "" | "-h" | "--help") help exit 0 @@ -92,12 +92,12 @@ case $option in shift ;; *) - echo "unknown options" >& 2 + echo "unknown options" >& 2 help exit 1 ;; esac -shift +shift done bootstrap diff --git a/scripts/README.md b/scripts/README.md index 06b5eb20c8c..a1c5ae5f93e 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -15,7 +15,7 @@ export ANDROID_NDK=YOUR_NDK_PATH #in your PyTorch root directory bash scripts/build_android.sh ``` -If succeeded, the libraries and headers would be generated to build_android/install directory. You can then copy these files from build_android/install to your Android project for further usage. +If succeeded, the libraries and headers would be generated to build_android/install directory. You can then copy these files from build_android/install to your Android project for further usage. You can also override the cmake flags via command line, e.g., following command will also compile the executable binary files: ```bash @@ -37,4 +37,4 @@ brew install cmake automake libtool #in your PyTorch root directory bash scripts/build_ios.sh ``` -If succeeded, the libraries and headers would be generated to build_ios/install directory. You can then copy these files to your Xcode project for further usage. \ No newline at end of file +If succeeded, the libraries and headers would be generated to build_ios/install directory. You can then copy these files to your Xcode project for further usage. diff --git a/scripts/build_raspbian.sh b/scripts/build_raspbian.sh index ae61a6dcb67..b1fe8592621 100755 --- a/scripts/build_raspbian.sh +++ b/scripts/build_raspbian.sh @@ -2,7 +2,7 @@ ############################################################################## # Example command to build the Raspbian target. ############################################################################## -# +# # This script shows how one can build a Caffe2 binary for raspbian. The build # is essentially much similar to a host build, with one additional change # which is to specify -mfpu=neon for optimized speed. diff --git a/scripts/build_tegra_x1.sh b/scripts/build_tegra_x1.sh index 0a8d7a02287..49c559ae389 100755 --- a/scripts/build_tegra_x1.sh +++ b/scripts/build_tegra_x1.sh @@ -2,7 +2,7 @@ ############################################################################## # Example command to build Caffe2 on Tegra X1. ############################################################################## -# +# # This script shows how one can build a Caffe2 binary for NVidia's TX1. 
# The build script assumes that you have the most recent libraries installed # via the JetPack toolkit available at diff --git a/scripts/diagnose_protobuf.py b/scripts/diagnose_protobuf.py index 00ffac9d123..f8627b60bd1 100644 --- a/scripts/diagnose_protobuf.py +++ b/scripts/diagnose_protobuf.py @@ -22,7 +22,7 @@ try: import google.protobuf python_version = google.protobuf.__version__ python_protobuf_installed = True -except ImportError: +except ImportError: print("DEBUG: cannot find python protobuf install.") python_protobuf_installed = False diff --git a/scripts/xcode_build.rb b/scripts/xcode_build.rb index 810c23352fd..1175a8fab88 100644 --- a/scripts/xcode_build.rb +++ b/scripts/xcode_build.rb @@ -79,5 +79,5 @@ if not profile and options[:platform] == 'OS' end # run xcodebuild -exec "xcodebuild clean build -project #{xcodeproj_path} -target #{target.name} -sdk #{sdk} -configuration Release PROVISIONING_PROFILE_SPECIFIER=#{profile} -arch #{arch}" +exec "xcodebuild clean build -project #{xcodeproj_path} -target #{target.name} -sdk #{sdk} -configuration Release PROVISIONING_PROFILE_SPECIFIER=#{profile} -arch #{arch}" diff --git a/test/cpp/api/dataloader.cpp b/test/cpp/api/dataloader.cpp index eddb80929be..2d3cb0dfffa 100644 --- a/test/cpp/api/dataloader.cpp +++ b/test/cpp/api/dataloader.cpp @@ -924,7 +924,7 @@ TEST(DataTest, CanSaveAndLoadDistributedRandomSampler) { } { samplers::DistributedRandomSampler a(10); - a.set_epoch(3); + a.set_epoch(3); std::stringstream stream; torch::save(a, stream); diff --git a/test/cpp/api/functional.cpp b/test/cpp/api/functional.cpp index 739e7976159..00abfbd67de 100644 --- a/test/cpp/api/functional.cpp +++ b/test/cpp/api/functional.cpp @@ -731,7 +731,7 @@ TEST_F(FunctionalTest, TripletMarginWithDistanceLossDefaultParity) { for (auto& reduction : reductions) { for (auto& margin : margins) { for (const auto& swap : swaps) { - auto anchor = + auto anchor = torch::randn({100, 128}, torch::dtype(torch::kFloat).requires_grad(true)); auto positive = torch::randn({100, 128}, torch::dtype(torch::kFloat).requires_grad(true)); diff --git a/test/cpp/api/nn_utils.cpp b/test/cpp/api/nn_utils.cpp index 824b020295f..91c8e73b560 100644 --- a/test/cpp/api/nn_utils.cpp +++ b/test/cpp/api/nn_utils.cpp @@ -268,7 +268,7 @@ TEST_F(PackedSequenceTest, TotalLength) { /*total_length=*/total_length); }; ASSERT_THROWS_WITH(err_fn(), - "Expected total_length to be at least the length of the longest sequence in input"); + "Expected total_length to be at least the length of the longest sequence in input"); } } @@ -493,7 +493,7 @@ TEST_F(NNUtilsTest, PackPaddedSequence) { auto src = padded; if (batch_first) { src = src.transpose(0, 1); - } + } // check output rnn_utils::PackedSequence packed = rnn_utils::pack_padded_sequence( diff --git a/test/cpp/dist_autograd/test_dist_autograd.cpp b/test/cpp/dist_autograd/test_dist_autograd.cpp index af335833bb8..0376cd4efe4 100644 --- a/test/cpp/dist_autograd/test_dist_autograd.cpp +++ b/test/cpp/dist_autograd/test_dist_autograd.cpp @@ -23,7 +23,7 @@ class DistAutogradTest : public ::testing::Test { virtual void TearDown() { autogradContainer_->releaseContext(autogradContainer_->currentContext()->contextId()); } - + static DistAutogradContainer* autogradContainer_; }; diff --git a/test/distributed/test_c10d_spawn.py b/test/distributed/test_c10d_spawn.py index 9c49726bf51..be000261e8e 100644 --- a/test/distributed/test_c10d_spawn.py +++ b/test/distributed/test_c10d_spawn.py @@ -381,11 +381,11 @@ class TestDistributedNNFunctions(MultiProcessTestCase): 
tensors = torch.distributed.nn.gather(x, 1) if self.rank == 1: for i, t in enumerate(tensors): - self.assertEqual(t, torch.ones(5, 5, device=device) + i) + self.assertEqual(t, torch.ones(5, 5, device=device) + i) elif self.rank == 0: for i, t in enumerate(tensors): zeros = torch.zeros(5, 5, device=device) - self.assertEqual(t, zeros) + self.assertEqual(t, zeros) y = torch.sum(torch.stack(tensors), axis=0) z = y.sin().sum() z.backward() @@ -409,9 +409,9 @@ class TestDistributedNNFunctions(MultiProcessTestCase): y = torch.distributed.nn.scatter([x0, x1], 1) if self.rank == 1: - self.assertEqual(y, 1 + torch.ones(5, 5, device=device)) + self.assertEqual(y, 1 + torch.ones(5, 5, device=device)) elif self.rank == 0: - self.assertEqual(y, torch.ones(5, 5, device=device)) + self.assertEqual(y, torch.ones(5, 5, device=device)) z = y.sin().sum() z.backward() @@ -437,7 +437,7 @@ class TestDistributedNNFunctions(MultiProcessTestCase): y = torch.distributed.nn.reduce(x, 1, op=c10d.ReduceOp.SUM) if self.rank == 1: - self.assertEqual(y, 3 * torch.ones(5, 5, device=device)) + self.assertEqual(y, 3 * torch.ones(5, 5, device=device)) z = y.sin().sum() z.backward() @@ -457,7 +457,7 @@ class TestDistributedNNFunctions(MultiProcessTestCase): x.requires_grad = True y = torch.distributed.nn.all_reduce(x, op=c10d.ReduceOp.SUM) - self.assertEqual(y, 3 * torch.ones(5, 5, device=device)) + self.assertEqual(y, 3 * torch.ones(5, 5, device=device)) z = y.sin().sum() z.backward() @@ -476,7 +476,7 @@ class TestDistributedNNFunctions(MultiProcessTestCase): x.requires_grad = True tensors = torch.distributed.nn.all_gather(x) for i, t in enumerate(tensors): - self.assertEqual(t, torch.ones(5, 5, device=device) + i) + self.assertEqual(t, torch.ones(5, 5, device=device) + i) y = torch.sum(torch.stack(tensors), axis=0) z = y.sin().sum() z.backward() @@ -498,7 +498,7 @@ class TestDistributedNNFunctions(MultiProcessTestCase): x1.requires_grad = True tensors = torch.distributed.nn.all_to_all([x0, x1]) for i, t in enumerate(tensors): - self.assertEqual(t, torch.ones(5, 5, device=device) + 2 * i) + self.assertEqual(t, torch.ones(5, 5, device=device) + 2 * i) y = torch.sum(torch.stack(tensors), axis=0) z = y.sin().sum() z.backward() diff --git a/test/jit/test_isinstance.py b/test/jit/test_isinstance.py index 2e93c280c10..5a96427c888 100644 --- a/test/jit/test_isinstance.py +++ b/test/jit/test_isinstance.py @@ -104,8 +104,8 @@ class TestIsinstance(JitTestCase): assert torch.jit.isinstance(x, Optional[torch.Tensor]) # assert torch.jit.isinstance(x, Optional[str]) # TODO: above line in eager will evaluate to True while in - # the TS interpreter will evaluate to False as the - # first torch.jit.isinstance refines the 'None' type + # the TS interpreter will evaluate to False as the + # first torch.jit.isinstance refines the 'None' type x = None self.checkScript(optional_test_none, (x,)) diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py index 4f71b7d1e74..06215df8cce 100644 --- a/test/onnx/test_pytorch_onnx_onnxruntime.py +++ b/test/onnx/test_pytorch_onnx_onnxruntime.py @@ -2759,7 +2759,7 @@ class TestONNXRuntime(unittest.TestCase): self.run_test(ScatterModel(), input=(input, indices, values)) @torch.jit.script - def scatter_sum(src: torch.Tensor, index: torch.Tensor): + def scatter_sum(src: torch.Tensor, index: torch.Tensor): size = src.size() out = torch.zeros(size, dtype=src.dtype) return out.scatter_add_(1, index, src) diff --git a/test/onnx/test_utility_funs.py 
b/test/onnx/test_utility_funs.py index 9adca885a06..38d9036f182 100644 --- a/test/onnx/test_utility_funs.py +++ b/test/onnx/test_utility_funs.py @@ -242,7 +242,7 @@ class TestUtilityFuns(TestCase): _set_opset_version(self.opset_version) _set_operator_export_type(OperatorExportTypes.ONNX) - x = torch.randn(2, 3, 4, 5, 8, 7) + x = torch.randn(2, 3, 4, 5, 8, 7) graph, _, __ = self._model_to_graph(PReluModel(), x) for node in graph.nodes(): diff --git a/test/run_test.py b/test/run_test.py index 1087ac1c216..d50efa0a060 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -397,9 +397,9 @@ def calculate_job_times(reports: List[Dict[str, Any]]) -> Dict[str, Tuple[float, def calculate_shards(num_shards: int, tests: List[str], job_times: Dict[str, Tuple[float, int]]) -> List[Tuple[float, List[str]]]: - # if there's 'test_cpp_extensions_aot' entry in job_times, add 'test_cpp_extensions_aot_ninja' - # and 'test_cpp_extensions_aot_no_ninja' duplicate entries to ease future computation since - # test_cpp_extensions_aot_no_ninja and test_cpp_extensions_aot_ninja are Python test jobs that + # if there's 'test_cpp_extensions_aot' entry in job_times, add 'test_cpp_extensions_aot_ninja' + # and 'test_cpp_extensions_aot_no_ninja' duplicate entries to ease future computation since + # test_cpp_extensions_aot_no_ninja and test_cpp_extensions_aot_ninja are Python test jobs that # both use the test_cpp_extensions_aot.py file. if 'test_cpp_extensions_aot' in job_times: job_times['test_cpp_extensions_aot_ninja'] = job_times['test_cpp_extensions_aot'] diff --git a/test/test_foreach.py b/test/test_foreach.py index 28d06b1544b..0d5daa017af 100644 --- a/test/test_foreach.py +++ b/test/test_foreach.py @@ -89,7 +89,7 @@ class TestForeach(TestCase): tensors2 = self._get_test_data(device, dtype, N) # Mimics cuda kernel dtype flow. With fp16/bf16 input, runs in fp32 and casts output back to fp16/bf16. 
- control_dtype = torch.float32 if (self.device_type == 'cuda' and + control_dtype = torch.float32 if (self.device_type == 'cuda' and (dtype is torch.float16 or dtype is torch.bfloat16)) else dtype if not isinstance(vals, list): @@ -490,16 +490,16 @@ class TestForeach(TestCase): # Bool case if dtype == torch.bool: if foreach_bin_op == torch._foreach_sub: - with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator, with a bool tensor"): + with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator, with a bool tensor"): expected = [torch_bin_op(t, s) for t, s in zip(tensors, scalars)] - with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator, with a bool tensor"): + with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator, with a bool tensor"): res = foreach_bin_op(tensors, scalars) - with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator, with a bool tensor"): + with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator, with a bool tensor"): [t.sub_(scalar) for t, scalar in zip(tensors, scalars)] - with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator, with a bool tensor"): + with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator, with a bool tensor"): foreach_bin_op_(tensors, scalars) continue diff --git a/test/test_ops.py b/test/test_ops.py index b6c336ee953..a5f5dc1a881 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -400,9 +400,9 @@ class TestCommon(JitCommonTestCase): original_name_inplace = original_name + "_" expected_dtype = op(*sample.input, *sample.args, **sample.kwargs).dtype - for a_op in op.aliases: + for a_op in op.aliases: inplace = a_op.inplace_variant - method_or_inplace = [a_op.inplace_variant, a_op.method_variant] + method_or_inplace = [a_op.inplace_variant, a_op.method_variant] variants = (v for v in (a_op.op, a_op.method_variant, a_op.inplace_variant) if v is not None) # Test scripting: diff --git a/test/test_sparse.py b/test/test_sparse.py index 8b416c18f51..4d1bc922a6f 100644 --- a/test/test_sparse.py +++ b/test/test_sparse.py @@ -3022,7 +3022,7 @@ class TestSparse(TestCase): def test_sparse_matmul(self): """ - This function test `torch.sparse.mm` when both the mat1 and mat2 are sparse tensors. + This function test `torch.sparse.mm` when both the mat1 and mat2 are sparse tensors. """ def _indices2csr(indices, dim): @@ -3134,10 +3134,10 @@ class TestSparse(TestCase): def fn(D1, D2): return torch.sparse.mm(D1, D2).to_dense() - # For cuda, `nondet_tol` is set with `1e-5` + # For cuda, `nondet_tol` is set with `1e-5` # This is because cuSparse sometimes returns approximate zero values like `~e-323` - # TODO: Check this cuSparse issue. - # This happens when you do chain multiplication `torch.sparse.mm` operations + # TODO: Check this cuSparse issue. 
+ # This happens when you do chain multiplication `torch.sparse.mm` operations gradcheck(fn, (a, b), check_sparse_nnz=True, nondet_tol=1e-5) grad_with_custom_sparsity_pattern_test_helper(sparse_dims, nnz, shape_a, shape_b) @@ -3150,8 +3150,8 @@ class TestSparse(TestCase): # This is not a matrix self.assertRaises(RuntimeError, lambda: fn(3, 4, [2, 2, 2], [2, 2, 2])) - # Shapes does not - self.assertRaisesRegex(RuntimeError, + # Shapes does not + self.assertRaisesRegex(RuntimeError, r"mat1 and mat2 shapes cannot be multiplied \(2x3 and 4x2\)", lambda: fn(2, 10, [2, 3], [4, 2])) diff --git a/test/test_tensor_creation_ops.py b/test/test_tensor_creation_ops.py index 9122975da58..3575095066d 100644 --- a/test/test_tensor_creation_ops.py +++ b/test/test_tensor_creation_ops.py @@ -621,7 +621,7 @@ class TestTensorCreation(TestCase): self.assertEqual(a, b) self.assertEqual(w[:6], y.view(-1)[:6]) - # Case: + # Case: # Reference: https://github.com/pytorch/pytorch/issues/49878 for dim in [0, 1]: x = torch.zeros((10, 5, 2), device=device) diff --git a/tools/docker/Dockerfile_runtime b/tools/docker/Dockerfile_runtime index c987cfa11ea..410253dd5d5 100644 --- a/tools/docker/Dockerfile_runtime +++ b/tools/docker/Dockerfile_runtime @@ -1,8 +1,8 @@ -FROM ubuntu:16.04 +FROM ubuntu:16.04 LABEL com.nvidia.volumes.needed="nvidia_driver" RUN apt-get update && apt-get install -y --no-install-recommends \ - build-essential \ + build-essential \ git \ curl \ ca-certificates \ @@ -12,11 +12,11 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ RUN curl -o ~/miniconda.sh -O https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ chmod +x ~/miniconda.sh && \ - ~/miniconda.sh -b -p /opt/conda && \ + ~/miniconda.sh -b -p /opt/conda && \ rm ~/miniconda.sh && \ /opt/conda/bin/conda install conda-build && \ /opt/conda/bin/conda install numpy pyyaml scipy ipython&& \ - /opt/conda/bin/conda clean -ya + /opt/conda/bin/conda clean -ya ENV PATH /opt/conda/bin:$PATH RUN conda install pytorch torchvision cuda90 -c pytorch && /opt/conda/bin/conda clean -ya diff --git a/torch/csrc/Generator.cpp b/torch/csrc/Generator.cpp index 2bc478f3600..493254711e4 100644 --- a/torch/csrc/Generator.cpp +++ b/torch/csrc/Generator.cpp @@ -90,7 +90,7 @@ static PyObject * THPGenerator_getState(PyObject *_self, PyObject *noargs) static PyObject * THPGenerator_setState(PyObject *_self, PyObject *_new_state) { using namespace torch::autograd; - + HANDLE_TH_ERRORS if (!THPVariable_Check(_new_state)) { throw torch::TypeError("expected a torch.ByteTensor, but got %s", Py_TYPE(_new_state)->tp_name); @@ -98,11 +98,11 @@ static PyObject * THPGenerator_setState(PyObject *_self, PyObject *_new_state) auto self = (THPGenerator*)_self; auto& gen = self->cdata; auto& new_state_tensor = ((THPVariable*)_new_state)->cdata; - + // See Note [Acquire lock when using random generators] std::lock_guard lock(gen.mutex()); gen.set_state(new_state_tensor); - + Py_INCREF(self); return (PyObject*)self; END_HANDLE_TH_ERRORS diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp index 4001ec86671..94e554e0c46 100644 --- a/torch/csrc/Module.cpp +++ b/torch/csrc/Module.cpp @@ -489,7 +489,7 @@ PyObject *THPModule_warnAlways(PyObject *_unused, PyObject *noargs) { if (c10::Warning::get_warnAlways()) { Py_RETURN_TRUE; - } + } Py_RETURN_FALSE; } diff --git a/torch/csrc/api/include/torch/nn/functional/upsampling.h b/torch/csrc/api/include/torch/nn/functional/upsampling.h index 9c7db7eb2dd..237b3acc691 100644 --- 
a/torch/csrc/api/include/torch/nn/functional/upsampling.h +++ b/torch/csrc/api/include/torch/nn/functional/upsampling.h @@ -110,7 +110,7 @@ inline Tensor interpolate( if (input.dim() == 3 && c10::get_if(&mode)) { return torch::upsample_nearest1d(input, _interp_output_size(1, closed_over_args), scale_factor_list.at(0)); } else if (input.dim() == 4 && c10::get_if(&mode)) { - return torch::upsample_nearest2d(input, _interp_output_size(2, closed_over_args), + return torch::upsample_nearest2d(input, _interp_output_size(2, closed_over_args), scale_factor_list.at(0), scale_factor_list.at(1)); } else if (input.dim() == 5 && c10::get_if(&mode)) { return torch::upsample_nearest3d(input, _interp_output_size(3, closed_over_args), @@ -132,7 +132,7 @@ inline Tensor interpolate( TORCH_CHECK(false, "Got 4D input, but linear mode needs 3D input"); } else if (input.dim() == 4 && c10::get_if(&mode)) { TORCH_INTERNAL_ASSERT(align_corners != c10::nullopt); - return torch::upsample_bilinear2d(input, _interp_output_size(2, closed_over_args), *align_corners, + return torch::upsample_bilinear2d(input, _interp_output_size(2, closed_over_args), *align_corners, scale_factor_list.at(0), scale_factor_list.at(1)); } else if (input.dim() == 4 && c10::get_if(&mode)) { TORCH_CHECK(false, "Got 4D input, but trilinear mode needs 5D input"); @@ -146,7 +146,7 @@ inline Tensor interpolate( scale_factor_list.at(0), scale_factor_list.at(1), scale_factor_list.at(2)); } else if (input.dim() == 4 && c10::get_if(&mode)) { TORCH_INTERNAL_ASSERT(align_corners != c10::nullopt); - return torch::upsample_bicubic2d(input, _interp_output_size(2, closed_over_args), *align_corners, + return torch::upsample_bicubic2d(input, _interp_output_size(2, closed_over_args), *align_corners, scale_factor_list.at(0), scale_factor_list.at(1)); } else { TORCH_CHECK( diff --git a/torch/csrc/api/include/torch/nn/modules/_functions.h b/torch/csrc/api/include/torch/nn/modules/_functions.h index 131181897f2..2c6b7ee4df1 100644 --- a/torch/csrc/api/include/torch/nn/modules/_functions.h +++ b/torch/csrc/api/include/torch/nn/modules/_functions.h @@ -15,7 +15,7 @@ class CrossMapLRN2d : public torch::autograd::Function { torch::autograd::AutogradContext *ctx, const torch::autograd::Variable& input, const CrossMapLRN2dOptions& options); - + static torch::autograd::variable_list backward( torch::autograd::AutogradContext *ctx, torch::autograd::variable_list grad_output); }; diff --git a/torch/csrc/api/include/torch/nn/modules/activation.h b/torch/csrc/api/include/torch/nn/modules/activation.h index 5a4c4345b8e..b9db0ce3833 100644 --- a/torch/csrc/api/include/torch/nn/modules/activation.h +++ b/torch/csrc/api/include/torch/nn/modules/activation.h @@ -259,11 +259,11 @@ class TORCH_API SoftminImpl : public torch::nn::Cloneable { public: explicit SoftminImpl(int64_t dim) : SoftminImpl(SoftminOptions(dim)) {} explicit SoftminImpl(const SoftminOptions& options_); - + Tensor forward(const Tensor& input); void reset() override; - + /// Pretty prints the `Softmin` module into the given `stream`. 
void pretty_print(std::ostream& stream) const override; diff --git a/torch/csrc/api/include/torch/nn/modules/adaptive.h b/torch/csrc/api/include/torch/nn/modules/adaptive.h index 9722fbec919..172f8cd4bc7 100644 --- a/torch/csrc/api/include/torch/nn/modules/adaptive.h +++ b/torch/csrc/api/include/torch/nn/modules/adaptive.h @@ -5,7 +5,7 @@ #include #include #include -#include +#include #include namespace torch { @@ -42,7 +42,7 @@ class TORCH_API AdaptiveLogSoftmaxWithLossImpl : public Cloneable cutoffs) : AdaptiveLogSoftmaxWithLossImpl(AdaptiveLogSoftmaxWithLossOptions(in_features, n_classes, cutoffs)) {} - + explicit AdaptiveLogSoftmaxWithLossImpl(AdaptiveLogSoftmaxWithLossOptions options_); ASMoutput forward(const Tensor& input, const Tensor& target); @@ -66,12 +66,12 @@ class TORCH_API AdaptiveLogSoftmaxWithLossImpl : public Cloneable cutoffs; int64_t shortlist_size; - + /// Number of clusters int64_t n_clusters; diff --git a/torch/csrc/api/include/torch/nn/modules/container/moduledict.h b/torch/csrc/api/include/torch/nn/modules/container/moduledict.h index 67fb343fb6f..b733e3f9f33 100644 --- a/torch/csrc/api/include/torch/nn/modules/container/moduledict.h +++ b/torch/csrc/api/include/torch/nn/modules/container/moduledict.h @@ -37,11 +37,11 @@ namespace nn { /// /// \endrst /// -/// Why should you use `ModuleDict` instead of a simple `map` or `OrderedDict`? -/// The value a `ModuleDict` provides over manually calling an ordered map of +/// Why should you use `ModuleDict` instead of a simple `map` or `OrderedDict`? +/// The value a `ModuleDict` provides over manually calling an ordered map of /// modules is that it allows treating the whole container *as a single module*, /// such that performing a transformation on the `ModuleDict` applies to each of the -/// modules it stores (which are each a registered submodule of the `ModuleDict`). +/// modules it stores (which are each a registered submodule of the `ModuleDict`). /// For example, calling `.to(torch::kCUDA)` on a `ModuleDict` will move each module /// in the map to CUDA memory. For example: /// @@ -62,7 +62,7 @@ namespace nn { /// /// Finally, `ModuleDict` provides a lightweight container API, such as allowing /// iteration over submodules, positional access, adding new modules from a vector -/// of key-module pairs or an `OrderedDict` or another `ModuleDict` after +/// of key-module pairs or an `OrderedDict` or another `ModuleDict` after /// construction via `update`. class ModuleDictImpl : public Cloneable { public: @@ -160,7 +160,7 @@ class ModuleDictImpl : public Cloneable { } /// Attempts to returns the `Module` associated with the given `key`. Throws - /// an exception if no such `key` is stored in the `ModuleDict`. Check + /// an exception if no such `key` is stored in the `ModuleDict`. Check /// contains(key) before for a non-throwing way of access. std::shared_ptr operator[](const std::string& key) const { return modules_[key]; diff --git a/torch/csrc/api/include/torch/nn/modules/container/parameterdict.h b/torch/csrc/api/include/torch/nn/modules/container/parameterdict.h index 3908c830d66..c002550e65e 100644 --- a/torch/csrc/api/include/torch/nn/modules/container/parameterdict.h +++ b/torch/csrc/api/include/torch/nn/modules/container/parameterdict.h @@ -112,28 +112,28 @@ class ParameterDictImpl : public Cloneable { } /// Returns the value associated with the given `key`. Throws an exception if - /// no such key is stored in the `ParameterDict`. Check contains(key) before + /// no such key is stored in the `ParameterDict`. 
Check contains(key) before /// for a non-throwing way of access const Tensor& get(const std::string& key) const { return parameters_[key]; } /// Returns the value associated with the given `key`. Throws an exception if - /// no such key is stored in the `ParameterDict`. Check contains(key) before + /// no such key is stored in the `ParameterDict`. Check contains(key) before /// for a non-throwing way of access Tensor& get(const std::string& key) { return parameters_[key]; } /// Returns the value associated with the given `key`. Throws an exception if - /// no such key is stored in the `ParameterDict`. Check contains(key) before + /// no such key is stored in the `ParameterDict`. Check contains(key) before /// for a non-throwing way of access Tensor& operator[](const std::string& key) { return parameters_[key]; } /// Returns the value associated with the given `key`. Throws an exception if - /// no such key is stored in the `ParameterDict`. Check contains(key) before + /// no such key is stored in the `ParameterDict`. Check contains(key) before /// for a non-throwing way of access const Tensor& operator[](const std::string& key) const { return parameters_[key]; diff --git a/torch/csrc/api/include/torch/nn/modules/container/parameterlist.h b/torch/csrc/api/include/torch/nn/modules/container/parameterlist.h index 472d8e82758..30b7eb89e48 100644 --- a/torch/csrc/api/include/torch/nn/modules/container/parameterlist.h +++ b/torch/csrc/api/include/torch/nn/modules/container/parameterlist.h @@ -62,7 +62,7 @@ class ParameterListImpl : public Cloneable { /// push the a given parameter at the end of the list /// And the key of the pair will be discarded, only the value - /// will be added into the `ParameterList` + /// will be added into the `ParameterList` void append(const OrderedDict::Item& pair) { register_parameter( c10::to_string(parameters_.size()), diff --git a/torch/csrc/api/include/torch/nn/modules/linear.h b/torch/csrc/api/include/torch/nn/modules/linear.h index 0a3976e4850..cf7073c38c5 100644 --- a/torch/csrc/api/include/torch/nn/modules/linear.h +++ b/torch/csrc/api/include/torch/nn/modules/linear.h @@ -184,8 +184,8 @@ class TORCH_API BilinearImpl : public Cloneable { /// Pretty prints the `Bilinear` module into the given `stream`. void pretty_print(std::ostream& stream) const override; - /// Applies a bilinear transform on the `input1` and `input2` tensor by multiplying - /// with the `weight` and optionally adding the `bias`, if `with_bias` + /// Applies a bilinear transform on the `input1` and `input2` tensor by multiplying + /// with the `weight` and optionally adding the `bias`, if `with_bias` /// is true in the options. Tensor forward(const Tensor& input1, const Tensor& input2); diff --git a/torch/csrc/api/include/torch/nn/utils/rnn.h b/torch/csrc/api/include/torch/nn/utils/rnn.h index 9eebf3602d8..99e92011a5c 100644 --- a/torch/csrc/api/include/torch/nn/utils/rnn.h +++ b/torch/csrc/api/include/torch/nn/utils/rnn.h @@ -18,19 +18,19 @@ inline Tensor invert_permutation(const Tensor& permutation) { } /// Holds the data and list of `batch_sizes` of a packed sequence. -/// +/// /// All RNN modules accept packed sequences as inputs. -/// +/// /// Note: /// Instances of this class should never be created manually. They are meant /// to be instantiated by functions like `pack_padded_sequence`. -/// +/// /// Batch sizes represent the number elements at each sequence step in /// the batch, not the varying sequence lengths passed to /// `pack_padded_sequence`. 
For instance, given data ``abc`` and ``x`` /// the :class:`PackedSequence` would contain data ``axbc`` with /// ``batch_sizes=[2,1,1]``. -/// +/// /// Attributes: /// data (Tensor): Tensor containing packed sequence /// batch_sizes (Tensor): Tensor of integers holding @@ -39,14 +39,14 @@ inline Tensor invert_permutation(const Tensor& permutation) { /// :class:`PackedSequence` is constructed from sequences. /// unsorted_indices (Tensor, optional): Tensor of integers holding how this /// to recover the original sequences with correct order. -/// +/// /// .. note:: /// `data` can be on arbitrary device and of arbitrary dtype. /// `sorted_indices` and `unsorted_indices` must be ``torch::kInt64`` /// tensors on the same device as `data`. -/// +/// /// However, `batch_sizes` should always be a CPU ``torch::kInt64`` tensor. -/// +/// /// This invariant is maintained throughout `PackedSequence` class, /// and all functions that construct a `PackedSequence` in libtorch /// (i.e., they only pass in tensors conforming to this constraint). @@ -151,23 +151,23 @@ class PackedSequence { }; /// Packs a Tensor containing padded sequences of variable length. -/// +/// /// `input` can be of size ``T x B x *`` where `T` is the length of the /// longest sequence (equal to ``lengths[0]``), ``B`` is the batch size, and /// ``*`` is any number of dimensions (including 0). If ``batch_first`` is /// ``true``, ``B x T x *`` `input` is expected. -/// +/// /// For unsorted sequences, use `enforce_sorted = false`. If `enforce_sorted` is /// ``true``, the sequences should be sorted by length in a decreasing order, i.e. /// ``input[:,0]`` should be the longest sequence, and ``input[:,B-1]`` the shortest /// one. -/// +/// /// Note: /// This function accepts any input that has at least two dimensions. You /// can apply it to pack the labels, and use the output of the RNN with /// them to compute the loss directly. A Tensor can be retrieved from /// a `PackedSequence` object by calling its ``.data()`` function. -/// +/// /// Arguments: /// input (Tensor): padded batch of variable length sequences. /// lengths (Tensor): list of sequences lengths of each batch element. @@ -176,7 +176,7 @@ class PackedSequence { /// enforce_sorted (bool, optional): if ``true``, the input is expected to /// contain sequences sorted by length in a decreasing order. If /// ``false``, this condition is not checked. Default: ``true``. -/// +/// /// Returns: /// a `PackedSequence` object inline PackedSequence pack_padded_sequence( @@ -201,15 +201,15 @@ inline PackedSequence pack_padded_sequence( } /// Pads a packed batch of variable length sequences. -/// +/// /// It is an inverse operation to `pack_padded_sequence`. -/// +/// /// The returned Tensor's data will be of size ``T x B x *``, where `T` is the length /// of the longest sequence and `B` is the batch size. If ``batch_first`` is true, /// the data will be transposed into ``B x T x *`` format. -/// +/// /// Batch elements will be ordered decreasingly by their length. -/// +/// /// Arguments: /// sequence (PackedSequence): batch to pad /// batch_first (bool, optional): if ``true``, the output will be in ``B x T x *`` @@ -219,7 +219,7 @@ inline PackedSequence pack_padded_sequence( /// have length `total_length`. This method will throw error /// if `total_length` is less than the max sequence length in /// `sequence`. -/// +/// /// Returns: /// Tuple of Tensor containing the padded sequence, and a Tensor /// containing the list of lengths of each sequence in the batch. 
@@ -314,21 +314,21 @@ inline Tensor pad_sequence( } /// Packs a list of variable length Tensors -/// +/// /// ``sequences`` should be a list of Tensors of size ``L x *``, where `L` is /// the length of a sequence and `*` is any number of trailing dimensions, /// including zero. -/// +/// /// For unsorted sequences, use `enforce_sorted = false`. If ``enforce_sorted`` /// is ``true``, the sequences should be sorted in the order of decreasing length. -/// -/// +/// +/// /// Arguments: /// sequences (torch::ArrayRef): A list of sequences of decreasing length. /// enforce_sorted (bool, optional): if ``true``, checks that the input /// contains sequences sorted by length in a decreasing order. If /// ``false``, this condition is not checked. Default: ``true``. -/// +/// /// Returns: /// a `PackedSequence` object inline PackedSequence pack_sequence(ArrayRef sequences, bool enforce_sorted = true) { diff --git a/torch/csrc/api/src/data/samplers/distributed.cpp b/torch/csrc/api/src/data/samplers/distributed.cpp index 5a7eadc45e2..edd983ec166 100644 --- a/torch/csrc/api/src/data/samplers/distributed.cpp +++ b/torch/csrc/api/src/data/samplers/distributed.cpp @@ -42,9 +42,9 @@ optional> DistributedRandomSampler::next( } void DistributedRandomSampler::reset(optional new_size) { - size_ = new_size.value_or(size_); + size_ = new_size.value_or(size_); populate_indices(); - + std::mt19937 rand(epoch_); std::shuffle(all_indices_.begin(), all_indices_.end(), rand); sample_index_ = begin_index_; @@ -78,7 +78,7 @@ void DistributedRandomSampler::save(serialize::OutputArchive& archive) const { } void DistributedRandomSampler::load(serialize::InputArchive& archive) { - auto tensor = torch::empty(1, torch::kInt64); + auto tensor = torch::empty(1, torch::kInt64); archive.read("epoch_", tensor, /*is_buffer=*/true); epoch_ = tensor.item(); // call reset() after loading epoch_ to populate indices. diff --git a/torch/csrc/api/src/nn/modules/_functions.cpp b/torch/csrc/api/src/nn/modules/_functions.cpp index c15655a347e..9f1f39530e4 100644 --- a/torch/csrc/api/src/nn/modules/_functions.cpp +++ b/torch/csrc/api/src/nn/modules/_functions.cpp @@ -7,7 +7,7 @@ namespace nn { namespace functions { Variable CrossMapLRN2d::forward( - AutogradContext *ctx, + AutogradContext *ctx, const Variable& input, const CrossMapLRN2dOptions& options){ ctx->saved_data["size"] = options.size(); @@ -19,7 +19,7 @@ Variable CrossMapLRN2d::forward( TORCH_CHECK(input.dim() == 4); ctx->saved_data["scale"] = ctx->saved_data["scale"].toTensor().defined() ? 
ctx->saved_data["scale"] : torch::empty({0}, input.options()); - + torch::Tensor output = torch::empty({0}, input.options()); int64_t batch_size = input.size(0); @@ -87,16 +87,16 @@ variable_list CrossMapLRN2d::backward(AutogradContext *ctx, variable_list grad_o int64_t input_height = input.size(2); int64_t input_width = input.size(3); - auto padded_ratio = torch::empty({channels + ctx->saved_data["size"].toInt() - 1, input_height, input_width}, + auto padded_ratio = torch::empty({channels + ctx->saved_data["size"].toInt() - 1, input_height, input_width}, input.options()); - auto accum_ratio = torch::empty({input_height, input_width}, + auto accum_ratio = torch::empty({input_height, input_width}, input.options()); double cache_ratio_value = 2 * ctx->saved_data["alpha"].toDouble() * ctx->saved_data["beta"].toDouble() / ctx->saved_data["size"].toInt(); int64_t inversePrePad = static_cast(ctx->saved_data["size"].toInt() - (ctx->saved_data["size"].toInt() - 1) / 2); grad_input.resize_as_(input); torch::pow_out(grad_input, ctx->saved_data["scale"].toTensor(), -ctx->saved_data["beta"].toDouble()).mul_(grad_output); - + padded_ratio.zero_(); auto padded_ratio_center = padded_ratio.narrow(0, inversePrePad, channels); @@ -104,7 +104,7 @@ variable_list CrossMapLRN2d::backward(AutogradContext *ctx, variable_list grad_o torch::mul_out(padded_ratio_center, grad_output[n], output[n]); padded_ratio_center.div_(ctx->saved_data["scale"].toTensor()[n]); torch::sum_out( - accum_ratio, + accum_ratio, padded_ratio.narrow(0, 0, ctx->saved_data["size"].toInt() - 1), 0, /*keepdim=*/false); for (int64_t c = 0; c < channels; ++c) { @@ -113,7 +113,7 @@ variable_list CrossMapLRN2d::backward(AutogradContext *ctx, variable_list grad_o accum_ratio.add_(padded_ratio[c], -1); } } - + return variable_list{grad_input, Variable(), Variable(), Variable(), Variable()}; } diff --git a/torch/csrc/api/src/nn/modules/adaptive.cpp b/torch/csrc/api/src/nn/modules/adaptive.cpp index 62e5d005551..db97b6994a5 100644 --- a/torch/csrc/api/src/nn/modules/adaptive.cpp +++ b/torch/csrc/api/src/nn/modules/adaptive.cpp @@ -159,7 +159,7 @@ Tensor AdaptiveLogSoftmaxWithLossImpl::predict(const Tensor& input) { } void AdaptiveLogSoftmaxWithLossImpl::pretty_print(std::ostream& stream) const { - stream << "torch::nn::AdaptiveLogSoftmaxWithLoss"; + stream << "torch::nn::AdaptiveLogSoftmaxWithLoss"; } } // namespace nn diff --git a/torch/csrc/api/src/nn/modules/batchnorm.cpp b/torch/csrc/api/src/nn/modules/batchnorm.cpp index 96d7c5354d9..da16b3a0a89 100644 --- a/torch/csrc/api/src/nn/modules/batchnorm.cpp +++ b/torch/csrc/api/src/nn/modules/batchnorm.cpp @@ -16,7 +16,7 @@ namespace F = torch::nn::functional; namespace torch { namespace nn { -template +template void BatchNormImplBase::pretty_print(std::ostream& stream) const { stream << std::boolalpha << "torch::nn::BatchNorm" << D << "d(" diff --git a/torch/csrc/api/src/nn/modules/pooling.cpp b/torch/csrc/api/src/nn/modules/pooling.cpp index ebf5c3706eb..6d15605b2d4 100644 --- a/torch/csrc/api/src/nn/modules/pooling.cpp +++ b/torch/csrc/api/src/nn/modules/pooling.cpp @@ -268,7 +268,7 @@ void FractionalMaxPool2dImpl::reset() { if (!(0 < output_ratio[0] && output_ratio[0] < 1 && 0 < output_ratio[1] && output_ratio[1] < 1)) { TORCH_CHECK(false, "output_ratio must be between 0 and 1 (got ", output_ratio, ")"); - } + } } } @@ -306,11 +306,11 @@ void FractionalMaxPool3dImpl::reset() { } if (options.output_ratio() != c10::nullopt) { at::ArrayRef output_ratio = at::ArrayRef(options.output_ratio().value()); 
- if (!(0 < output_ratio[0] && output_ratio[0] < 1 && + if (!(0 < output_ratio[0] && output_ratio[0] < 1 && 0 < output_ratio[1] && output_ratio[1] < 1 && 0 < output_ratio[2] && output_ratio[2] < 1)) { TORCH_CHECK(false, "output_ratio must be between 0 and 1 (got ", output_ratio, ")"); - } + } } } diff --git a/torch/csrc/autograd/python_function.h b/torch/csrc/autograd/python_function.h index 615625ee470..5d71a3d74aa 100644 --- a/torch/csrc/autograd/python_function.h +++ b/torch/csrc/autograd/python_function.h @@ -89,7 +89,7 @@ struct THPFunction { // modified inplace. PyObject *dirty_tensors; - // boolean indicating whether to materialize undefined output grad tensors + // boolean indicating whether to materialize undefined output grad tensors // into tensors full of zeros. Set by Python with 'set_materialize_grads'. // Default is true. bool materialize_grads; diff --git a/torch/csrc/autograd/variable.cpp b/torch/csrc/autograd/variable.cpp index f28ef0f67a3..2d2d1251228 100644 --- a/torch/csrc/autograd/variable.cpp +++ b/torch/csrc/autograd/variable.cpp @@ -462,7 +462,7 @@ const std::shared_ptr& VariableHooks::grad_fn(const Tenso return diff_view_meta->grad_fn_; } } - + if (torch::autograd::impl::get_autograd_meta(self)) { return torch::autograd::impl::get_autograd_meta(self)->grad_fn_; } else { diff --git a/torch/csrc/jit/codegen/fuser/README.md b/torch/csrc/jit/codegen/fuser/README.md index a4cd1a1c01e..e115f999188 100644 --- a/torch/csrc/jit/codegen/fuser/README.md +++ b/torch/csrc/jit/codegen/fuser/README.md @@ -1,16 +1,16 @@ # PyTorch Fuser -The fuser accepts subgraphs wrapped in "fusion nodes" and tries to execute them by just-in-time (JIT) compiling kernels that run all the graph operations. +The fuser accepts subgraphs wrapped in "fusion nodes" and tries to execute them by just-in-time (JIT) compiling kernels that run all the graph operations. ## Code Organization The fuser is designed hierarchically with device-independent logic eventually deferring to device-specific logic and implementation. The device-specific code is (mostly) found in each devices' subdirectory. The device-independent logic has six components: -* The Interface (interface.h/cpp) has functions to register and run fusions, interrogate fusion functionality, and perform debugging. -* The Compiler (compiler.h/cpp) performs "upfront" and "runtime" compilation. When fusions are registered, upfront compilation produces fallback code and and performs some shape inference. When a fusion is run, runtime compilation invokes code generation and the device-specific compilation logic. +* The Interface (interface.h/cpp) has functions to register and run fusions, interrogate fusion functionality, and perform debugging. +* The Compiler (compiler.h/cpp) performs "upfront" and "runtime" compilation. When fusions are registered, upfront compilation produces fallback code and and performs some shape inference. When a fusion is run, runtime compilation invokes code generation and the device-specific compilation logic. * The Code Generator (codegen.h/cpp) produces the string to be compiled on the device. * The Executor (executor.h/cpp) runs requested fusions. It performs shape inference, expands tensors as necessary, determines the device to run on, acquires a cached compiled kernel or requests the Compiler produce a new one, invokes device-specific code to launch the kernel and updates the stack. 
* The Fallback (fallback.h/cpp) runs subgraphs that can't be fused because shape inference didn't determine a common tensor size or the device the tensors are on doesn't support fusion. * The Kernel Specification Cache (kernel_cache.h/cpp) is a thread-safe cache holding the device-independent specifications produced during upfront compilation. These specifications each have their own thread-safe stores of compiled kernels that the Executor checks before requesting runtime compilation. -The device-specific components have logic for compiling and running code in FusedKernelCPU (cpu/fused_kernel.h/cpp) and FusedKernelCUDA (cuda/fused_kernel.h/cpp). \ No newline at end of file +The device-specific components have logic for compiling and running code in FusedKernelCPU (cpu/fused_kernel.h/cpp) and FusedKernelCUDA (cuda/fused_kernel.h/cpp). diff --git a/torch/csrc/utils/cuda_lazy_init.cpp b/torch/csrc/utils/cuda_lazy_init.cpp index 4af06e03851..b5b97e89eb6 100644 --- a/torch/csrc/utils/cuda_lazy_init.cpp +++ b/torch/csrc/utils/cuda_lazy_init.cpp @@ -7,7 +7,7 @@ #include namespace torch { namespace utils { - + static bool run_yet = false; void cuda_lazy_init() { diff --git a/torch/distributed/nn/functional.py b/torch/distributed/nn/functional.py index feb69df4984..7e03fc6e572 100644 --- a/torch/distributed/nn/functional.py +++ b/torch/distributed/nn/functional.py @@ -90,7 +90,7 @@ def all_gather(tensor, group=dist.group.WORLD): Returns: tuple[Tensor]): Output of the collective. - """ + """ return _AllGather.apply(group, tensor) diff --git a/torch/distributed/optim/optimizer.py b/torch/distributed/optim/optimizer.py index 75e7c816ad7..6200f6df8e5 100644 --- a/torch/distributed/optim/optimizer.py +++ b/torch/distributed/optim/optimizer.py @@ -28,7 +28,7 @@ logger = logging.getLogger(__name__) # in ScriptModule or pass it to a ScriptFunction # _ScriptLocalOptimizerInterface serves as a common # interface type for Optimizer ScriptModules. -# +# # TODO (wanchaol): remove this once we added TorchScript # class reference semantics @jit.interface diff --git a/torch/distributions/mixture_same_family.py b/torch/distributions/mixture_same_family.py index 716bfbd8c7a..daa0c44f634 100644 --- a/torch/distributions/mixture_same_family.py +++ b/torch/distributions/mixture_same_family.py @@ -107,7 +107,7 @@ class MixtureSameFamily(Distribution): @constraints.dependent_property def support(self): - # FIXME this may have the wrong shape when support contains batched + # FIXME this may have the wrong shape when support contains batched # parameters return self._component_distribution.support diff --git a/torch/fx/graph.py b/torch/fx/graph.py index 0f275e7b915..b6bffafcbe8 100644 --- a/torch/fx/graph.py +++ b/torch/fx/graph.py @@ -519,7 +519,7 @@ class Graph: return True - if (self.owning_module and + if (self.owning_module and not _get_attr_reference_exists(self.owning_module, qualified_name)): warnings.warn("Attempted to insert a get_attr Node with no " "underlying reference in the owning " @@ -567,7 +567,7 @@ class Graph: The same insertion point and type expression rules apply for this method as :meth:`Graph.create_node`. 
""" - if (self.owning_module and + if (self.owning_module and self.owning_module.get_submodule(module_name) is not None): warnings.warn("Attempted to insert a get_attr Node with no " "underlying reference in the owning " @@ -960,7 +960,7 @@ def forward(self, {', '.join(free_vars)}){maybe_return_annotation[0]}: particular: - Checks Nodes have correct ownership (owned by this graph) - Checks Nodes appear in topological order - - If this Graph has an owning GraphModule, checks that targets + - If this Graph has an owning GraphModule, checks that targets exist in that GraphModule """ diff --git a/torch/fx/graph_module.py b/torch/fx/graph_module.py index c8f28ff71a8..c2df17ef643 100644 --- a/torch/fx/graph_module.py +++ b/torch/fx/graph_module.py @@ -314,7 +314,7 @@ class {module_name}(torch.nn.Module): """ Adds the given submodule to ``self``. - This installs empty Modules where none exist yet if they are + This installs empty Modules where none exist yet if they are subpaths of ``target``. Args: @@ -401,7 +401,7 @@ class {module_name}(torch.nn.Module): true: 1. It has children that are used 2. Its forward is called directly via a ``call_module`` node - 3. It has a non-Module attribute that is used from a + 3. It has a non-Module attribute that is used from a ``get_attr`` node This method can be called to clean up an ``nn.Module`` without @@ -425,9 +425,9 @@ class {module_name}(torch.nn.Module): return '.'.join([x, y] if y else [x]) # Progressively collect all the names of intermediate - # modules. For example, if we have the target - # `foo.bar.baz`, we'll add `foo`, `foo.bar`, and - # `foo.bar.baz` to the list. + # modules. For example, if we have the target + # `foo.bar.baz`, we'll add `foo`, `foo.bar`, and + # `foo.bar.baz` to the list. for path in itertools.accumulate(fullpath, join_fn): used.append(path) diff --git a/torch/jit/_freeze.py b/torch/jit/_freeze.py index 9ccec7bf7f1..c9d8e6745e9 100644 --- a/torch/jit/_freeze.py +++ b/torch/jit/_freeze.py @@ -26,7 +26,7 @@ def freeze(mod, preserved_attrs: Optional[List[str]] = None, optimize_numerics: preserved_attrs (Optional[List[str]]): a list of attributes to preserve in addition to the forward method. Attributes modified in preserved methods will also be preserved. - optimize_numerics (bool): If ``True``, a set of optimization passes will be run that does not strictly + optimize_numerics (bool): If ``True``, a set of optimization passes will be run that does not strictly preserve numerics. Full details of optimization can be found at `torch.jit.optimize_frozen_module`. Returns: @@ -117,10 +117,10 @@ def optimize_frozen_module(mod, optimize_numerics: bool = True): Args: mod (:class:`ScriptModule`): a frozen module to be optimized - optimize_numerics (bool): If ``True``, a set of optimization passes will be run that does not strictly - preserve numerics. These optimizations preserve default rtol and atol of `torch.testing.assert_allclose` - when applied on a single transformation, however in a module where many transformations are applied - the rtol or atol may no longer fall within the default `assert_allclose` tolerance. Conv -> Batchnorm folding, + optimize_numerics (bool): If ``True``, a set of optimization passes will be run that does not strictly + preserve numerics. 
These optimizations preserve default rtol and atol of `torch.testing.assert_allclose` + when applied on a single transformation, however in a module where many transformations are applied + the rtol or atol may no longer fall within the default `assert_allclose` tolerance. Conv -> Batchnorm folding, Conv-Add/Sub, and Conv -> Mul/Div folding all may alter numerics. Returns: diff --git a/torch/linalg/__init__.py b/torch/linalg/__init__.py index c7a0c079590..b84d0bad477 100644 --- a/torch/linalg/__init__.py +++ b/torch/linalg/__init__.py @@ -86,7 +86,7 @@ Examples:: inv = _add_docstr(_linalg.linalg_inv, r""" linalg.inv(input, *, out=None) -> Tensor -Computes the multiplicative inverse matrix of a square matrix :attr:`input`, or of each square matrix in a +Computes the multiplicative inverse matrix of a square matrix :attr:`input`, or of each square matrix in a batched :attr:`input`. The result satisfies the relation: ``matmul(inv(input),input)`` = ``matmul(input,inv(input))`` = ``eye(input.shape[0]).expand_as(input)``. @@ -253,7 +253,7 @@ Supports input of float, double, cfloat and cdouble dtypes. .. note:: When given inputs on a CUDA device, this function synchronizes that device with the CPU. -.. note:: The eigenvalues/eigenvectors are computed using LAPACK's `syevd` and `heevd` routines for CPU inputs, +.. note:: The eigenvalues/eigenvectors are computed using LAPACK's `syevd` and `heevd` routines for CPU inputs, and MAGMA's `syevd` and `heevd` routines for CUDA inputs. .. note:: The eigenvalues of real symmetric or complex Hermitian matrices are always real. @@ -319,7 +319,7 @@ Supports input of float, double, cfloat and cdouble dtypes. .. note:: When given inputs on a CUDA device, this function synchronizes that device with the CPU. -.. note:: The eigenvalues are computed using LAPACK's `syevd` and `heevd` routines for CPU inputs, +.. note:: The eigenvalues are computed using LAPACK's `syevd` and `heevd` routines for CPU inputs, and MAGMA's `syevd` and `heevd` routines for CUDA inputs. .. note:: The eigenvalues of real symmetric or complex Hermitian matrices are always real. @@ -705,7 +705,7 @@ Keyword args: out (Tensor, optional): tensor to write the output to. Default is ``None``. Returns: - The condition number of :attr:`input`. The output dtype is always real valued + The condition number of :attr:`input`. The output dtype is always real valued even for complex inputs (e.g. float if :attr:`input` is cfloat). Examples:: diff --git a/torch/nn/modules/batchnorm.py b/torch/nn/modules/batchnorm.py index e2b6d2cc135..da005753a80 100644 --- a/torch/nn/modules/batchnorm.py +++ b/torch/nn/modules/batchnorm.py @@ -565,7 +565,7 @@ class SyncBatchNorm(_BatchNorm): >>> # creating process group (optional) >>> # ranks is a list of int identifying rank ids. >>> ranks = list(range(8)) - >>> r1, r2 = ranks[:4], ranks[4:] + >>> r1, r2 = ranks[:4], ranks[4:] >>> # Note: every rank calls into new_group for every >>> # process group created, even if that rank is not >>> # part of the group. @@ -706,7 +706,7 @@ class SyncBatchNorm(_BatchNorm): >>> # creating process group (optional) >>> # ranks is a list of int identifying rank ids. >>> ranks = list(range(8)) - >>> r1, r2 = ranks[:4], ranks[4:] + >>> r1, r2 = ranks[:4], ranks[4:] >>> # Note: every rank calls into new_group for every >>> # process group created, even if that rank is not >>> # part of the group. 
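The `SyncBatchNorm` docstring touched just above stops partway through its process-group example. A minimal sketch of how those two groups are typically consumed follows; it is an illustrative reconstruction, not part of the patch, and it assumes `torch.distributed` is already initialized with 8 ranks (the feature count of 100 and the rank-based group selection are likewise assumptions).

# Hedged sketch continuing the `ranks` / `r1, r2` example from the SyncBatchNorm docstring.
# Assumes torch.distributed has been initialized with world_size == 8.
import torch
import torch.distributed as dist

ranks = list(range(8))
r1, r2 = ranks[:4], ranks[4:]
# Every rank calls new_group for every process group created,
# even if that rank is not part of the group.
process_groups = [dist.new_group(pids) for pids in [r1, r2]]
process_group = process_groups[0 if dist.get_rank() < 4 else 1]

# Illustrative feature count; process_group restricts the cross-rank
# statistics synchronization to the group this rank belongs to.
sync_bn = torch.nn.SyncBatchNorm(100, process_group=process_group)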
diff --git a/torch/nn/modules/container.py b/torch/nn/modules/container.py index 5b7b636b23e..8a7ec5de22e 100644 --- a/torch/nn/modules/container.py +++ b/torch/nn/modules/container.py @@ -244,9 +244,9 @@ class ModuleDict(Module): * the order of insertion, and - * in :meth:`~torch.nn.ModuleDict.update`, the order of the merged + * in :meth:`~torch.nn.ModuleDict.update`, the order of the merged ``OrderedDict``, ``dict`` (started from Python 3.6) or another - :class:`~torch.nn.ModuleDict` (the argument to + :class:`~torch.nn.ModuleDict` (the argument to :meth:`~torch.nn.ModuleDict.update`). Note that :meth:`~torch.nn.ModuleDict.update` with other unordered mapping diff --git a/torch/nn/modules/conv.py b/torch/nn/modules/conv.py index 7e857fde59d..c7b04bfa9ea 100644 --- a/torch/nn/modules/conv.py +++ b/torch/nn/modules/conv.py @@ -688,7 +688,7 @@ class ConvTranspose1d(_ConvTransposeNd): if self.padding_mode != 'zeros': raise ValueError('Only `zeros` padding mode is supported for ConvTranspose1d') - # One cannot replace List by Tuple or Sequence in "_output_padding" because + # One cannot replace List by Tuple or Sequence in "_output_padding" because # TorchScript does not support `Sequence[T]` or `Tuple[T, ...]`. output_padding = self._output_padding( input, output_size, self.stride, self.padding, self.kernel_size, self.dilation) # type: ignore @@ -832,7 +832,7 @@ class ConvTranspose2d(_ConvTransposeNd): if self.padding_mode != 'zeros': raise ValueError('Only `zeros` padding mode is supported for ConvTranspose2d') - # One cannot replace List by Tuple or Sequence in "_output_padding" because + # One cannot replace List by Tuple or Sequence in "_output_padding" because # TorchScript does not support `Sequence[T]` or `Tuple[T, ...]`. output_padding = self._output_padding( input, output_size, self.stride, self.padding, self.kernel_size, self.dilation) # type: ignore @@ -973,7 +973,7 @@ class ConvTranspose3d(_ConvTransposeNd): if self.padding_mode != 'zeros': raise ValueError('Only `zeros` padding mode is supported for ConvTranspose3d') - # One cannot replace List by Tuple or Sequence in "_output_padding" because + # One cannot replace List by Tuple or Sequence in "_output_padding" because # TorchScript does not support `Sequence[T]` or `Tuple[T, ...]`. output_padding = self._output_padding( input, output_size, self.stride, self.padding, self.kernel_size, self.dilation) # type: ignore @@ -1015,7 +1015,7 @@ class _LazyConvXdMixin(LazyModuleMixin): groups: int transposed: bool in_channels: int - out_channels: int + out_channels: int kernel_size: Tuple[int, ...] 
weight: UninitializedParameter bias: UninitializedParameter @@ -1048,7 +1048,7 @@ class _LazyConvXdMixin(LazyModuleMixin): self.reset_parameters() -# LazyConv1d defines weight as a Tensor but derived class defines it as UnitializeParameter +# LazyConv1d defines weight as a Tensor but derived class defines it as UnitializeParameter class LazyConv1d(_LazyConvXdMixin, Conv1d): # type: ignore[misc] r"""A :class:`torch.nn.Conv1d` module with lazy initialization of the ``in_channels`` argument of the :class:`Conv1d` that is inferred from @@ -1107,7 +1107,7 @@ class LazyConv1d(_LazyConvXdMixin, Conv1d): # type: ignore[misc] self.bias = UninitializedParameter() -# LazyConv2d defines weight as a Tensor but derived class defines it as UnitializeParameter +# LazyConv2d defines weight as a Tensor but derived class defines it as UnitializeParameter class LazyConv2d(_LazyConvXdMixin, Conv2d): # type: ignore[misc] r"""A :class:`torch.nn.Conv2d` module with lazy initialization of the ``in_channels`` argument of the :class:`Conv2d` that is inferred from @@ -1166,7 +1166,7 @@ class LazyConv2d(_LazyConvXdMixin, Conv2d): # type: ignore[misc] self.bias = UninitializedParameter() -# LazyConv3d defines weight as a Tensor but derived class defines it as UnitializeParameter +# LazyConv3d defines weight as a Tensor but derived class defines it as UnitializeParameter class LazyConv3d(_LazyConvXdMixin, Conv3d): # type: ignore[misc] r"""A :class:`torch.nn.Conv3d` module with lazy initialization of the ``in_channels`` argument of the :class:`Conv3d` that is inferred from @@ -1225,7 +1225,7 @@ class LazyConv3d(_LazyConvXdMixin, Conv3d): # type: ignore[misc] self.bias = UninitializedParameter() -# LazyConvTranspose1d defines weight as a Tensor but derived class defines it as UnitializeParameter +# LazyConvTranspose1d defines weight as a Tensor but derived class defines it as UnitializeParameter class LazyConvTranspose1d(_LazyConvXdMixin, ConvTranspose1d): # type: ignore[misc] r"""A :class:`torch.nn.ConvTranspose1d` module with lazy initialization of the ``in_channels`` argument of the :class:`ConvTranspose1d` that is inferred from @@ -1283,7 +1283,7 @@ class LazyConvTranspose1d(_LazyConvXdMixin, ConvTranspose1d): # type: ignore[mi self.bias = UninitializedParameter() -# LazyConvTranspose2d defines weight as a Tensor but derived class defines it as UnitializeParameter +# LazyConvTranspose2d defines weight as a Tensor but derived class defines it as UnitializeParameter class LazyConvTranspose2d(_LazyConvXdMixin, ConvTranspose2d): # type: ignore[misc] r"""A :class:`torch.nn.ConvTranspose2d` module with lazy initialization of the ``in_channels`` argument of the :class:`ConvTranspose2d` that is inferred from @@ -1341,7 +1341,7 @@ class LazyConvTranspose2d(_LazyConvXdMixin, ConvTranspose2d): # type: ignore[mi self.bias = UninitializedParameter() -# LazyConvTranspose3d defines weight as a Tensor but derived class defines it as UnitializeParameter +# LazyConvTranspose3d defines weight as a Tensor but derived class defines it as UnitializeParameter class LazyConvTranspose3d(_LazyConvXdMixin, ConvTranspose3d): # type: ignore[misc] r"""A :class:`torch.nn.ConvTranspose3d` module with lazy initialization of the ``in_channels`` argument of the :class:`ConvTranspose3d` that is inferred from diff --git a/torch/nn/modules/dropout.py b/torch/nn/modules/dropout.py index 78af6777ac2..55789bff6bc 100644 --- a/torch/nn/modules/dropout.py +++ b/torch/nn/modules/dropout.py @@ -185,14 +185,14 @@ class AlphaDropout(_DropoutNd): class 
FeatureAlphaDropout(_DropoutNd): - r"""Randomly masks out entire channels (a channel is a feature map, - e.g. the :math:`j`-th channel of the :math:`i`-th sample in the batch input - is a tensor :math:`\text{input}[i, j]`) of the input tensor). Instead of - setting activations to zero, as in regular Dropout, the activations are set + r"""Randomly masks out entire channels (a channel is a feature map, + e.g. the :math:`j`-th channel of the :math:`i`-th sample in the batch input + is a tensor :math:`\text{input}[i, j]`) of the input tensor). Instead of + setting activations to zero, as in regular Dropout, the activations are set to the negative saturation value of the SELU activation function. More details can be found in the paper `Self-Normalizing Neural Networks`_ . - Each element will be masked independently for each sample on every forward + Each element will be masked independently for each sample on every forward call with probability :attr:`p` using samples from a Bernoulli distribution. The elements to be masked are randomized on every forward call, and scaled and shifted to maintain zero mean and unit variance. diff --git a/torch/nn/modules/flatten.py b/torch/nn/modules/flatten.py index dd491ba9962..4167d0044a7 100644 --- a/torch/nn/modules/flatten.py +++ b/torch/nn/modules/flatten.py @@ -53,7 +53,7 @@ class Unflatten(Module): be either `int` or `str` when `Tensor` or `NamedTensor` is used, respectively. * :attr:`unflattened_size` is the new shape of the unflattened dimension of the tensor and it can be - a `tuple` of ints or a `list` of ints or `torch.Size` for `Tensor` input; a `NamedShape` + a `tuple` of ints or a `list` of ints or `torch.Size` for `Tensor` input; a `NamedShape` (tuple of `(name, size)` tuples) for `NamedTensor` input. Shape: @@ -112,7 +112,7 @@ class Unflatten(Module): if (isinstance(input, tuple)): for idx, elem in enumerate(input): if not isinstance(elem, tuple): - raise TypeError("unflattened_size must be tuple of tuples, " + + raise TypeError("unflattened_size must be tuple of tuples, " + "but found element of type {} at pos {}".format(type(elem).__name__, idx)) return raise TypeError("unflattened_size must be a tuple of tuples, " + @@ -122,7 +122,7 @@ class Unflatten(Module): if (isinstance(input, (tuple, list))): for idx, elem in enumerate(input): if not isinstance(elem, int): - raise TypeError("unflattened_size must be tuple of ints, " + + raise TypeError("unflattened_size must be tuple of ints, " + "but found element of type {} at pos {}".format(type(elem).__name__, idx)) return raise TypeError("unflattened_size must be a tuple of ints, but found type {}".format(type(input).__name__)) diff --git a/torch/nn/modules/lazy.py b/torch/nn/modules/lazy.py index eb2e14bfe7a..533953efcc5 100644 --- a/torch/nn/modules/lazy.py +++ b/torch/nn/modules/lazy.py @@ -7,7 +7,7 @@ from ..parameter import is_lazy class _LazyProtocol(Protocol): - """This is to avoid errors with mypy checks for + """This is to avoid errors with mypy checks for The attributes in a mixin: https://mypy.readthedocs.io/en/latest/more_types.html#mixin-classes """ @@ -176,7 +176,7 @@ class LazyModuleMixin: 'so changes to the API or functionality can happen at any moment.') def _save_to_state_dict(self: _LazyProtocol, destination, prefix, keep_vars): - # This should be ideally implemented as a hook, + # This should be ideally implemented as a hook, # but we should override `detach` in the UninitializedParameter to return itself # which is not clean for name, param in self._parameters.items(): @@ 
-242,7 +242,7 @@ class LazyModuleMixin: The module is set into evaluation mode before running the forward pass in order to avoid saving statistics or calculating gradients """ - module.initialize_parameters(*input) + module.initialize_parameters(*input) if module.has_uninitialized_params(): raise RuntimeError('module {} has not been fully initialized'.format(self._get_name())) module._initialize_hook.remove() @@ -255,4 +255,4 @@ class LazyModuleMixin: def _replicate_for_data_parallel(self: _LazyProtocol): raise RuntimeError('Modules with uninitialized parameters can\'t be used with `DataParallel`. ' - 'Run a dummy forward pass to correctly initialize the modules') + 'Run a dummy forward pass to correctly initialize the modules') diff --git a/torch/nn/modules/module.py b/torch/nn/modules/module.py index 01892df1aa2..062fccab1e0 100644 --- a/torch/nn/modules/module.py +++ b/torch/nn/modules/module.py @@ -401,8 +401,8 @@ class Module: ) ) - (The diagram shows an ``nn.Module`` ``A``. ``A`` has a nested - submodule ``net_b``, which itself has two submodules ``net_c`` + (The diagram shows an ``nn.Module`` ``A``. ``A`` has a nested + submodule ``net_b``, which itself has two submodules ``net_c`` and ``linear``. ``net_c`` then has a submodule ``conv``.) To check whether or not we have the ``linear`` submodule, we @@ -411,8 +411,8 @@ class Module: ``get_submodule("net_b.net_c.conv")``. The runtime of ``get_submodule`` is bounded by the degree - of module nesting in ``target``. A query against - ``named_modules`` achieves the same result, but it is O(N) in + of module nesting in ``target``. A query against + ``named_modules`` achieves the same result, but it is O(N) in the number of transitive modules. So, for a simple check to see if some submodule exists, ``get_submodule`` should always be used. diff --git a/torch/nn/modules/normalization.py b/torch/nn/modules/normalization.py index acdd2a8c9d4..3574b2ae0e2 100644 --- a/torch/nn/modules/normalization.py +++ b/torch/nn/modules/normalization.py @@ -141,7 +141,7 @@ class LayerNorm(Module): >>> output = m(input) """ __constants__ = ['normalized_shape', 'eps', 'elementwise_affine'] - normalized_shape: Tuple[int, ...] + normalized_shape: Tuple[int, ...] eps: float elementwise_affine: bool diff --git a/torch/nn/modules/padding.py b/torch/nn/modules/padding.py index 186d89c6fd0..5c6539154d9 100644 --- a/torch/nn/modules/padding.py +++ b/torch/nn/modules/padding.py @@ -209,7 +209,7 @@ class ReflectionPad1d(_ReflectionPadNd): [7., 6., 5., 4., 5., 6., 7., 6.]]]) """ - padding: Tuple[int, int] + padding: Tuple[int, int] def __init__(self, padding: _size_2_t) -> None: super(ReflectionPad1d, self).__init__() diff --git a/torch/nn/modules/transformer.py b/torch/nn/modules/transformer.py index faf59019f2f..c6f50bf29d0 100644 --- a/torch/nn/modules/transformer.py +++ b/torch/nn/modules/transformer.py @@ -95,7 +95,7 @@ class Transformer(Module): positions. If a ByteTensor is provided, the non-zero positions are not allowed to attend while the zero positions will be unchanged. If a BoolTensor is provided, positions with ``True`` are not allowed to attend while ``False`` values will be unchanged. If a FloatTensor - is provided, it will be added to the attention weight. + is provided, it will be added to the attention weight. [src/tgt/memory]_key_padding_mask provides specified elements in the key to be ignored by the attention. If a ByteTensor is provided, the non-zero positions will be ignored while the zero positions will be unchanged. 
If a BoolTensor is provided, the positions with the diff --git a/torch/nn/parallel/scatter_gather.pyi b/torch/nn/parallel/scatter_gather.pyi index a81851152c8..08cc59fe956 100644 --- a/torch/nn/parallel/scatter_gather.pyi +++ b/torch/nn/parallel/scatter_gather.pyi @@ -14,7 +14,7 @@ def scatter(inputs: Tensor, target_gpus: _devices_t, dim: int = ...) -> Tuple[Te # untyped module. Thus to mypy, the first definition of `scatter` looks strictly more general # than this overload. @overload -def scatter(inputs: T, target_gpus: _devices_t, dim: int = ...) -> List[T]: ... # type: ignore +def scatter(inputs: T, target_gpus: _devices_t, dim: int = ...) -> List[T]: ... # type: ignore # TODO More precise types here. diff --git a/torch/nn/parameter.pyi b/torch/nn/parameter.pyi index ff15a375d50..747a4a46629 100644 --- a/torch/nn/parameter.pyi +++ b/torch/nn/parameter.pyi @@ -12,7 +12,7 @@ def is_lazy(param: Tensor): ... class UninitializedParameter(Tensor): def __init__(self, data: Tensor=..., requires_grad: builtins.bool=...): ... - + def materialize(self, shape: Tuple[int, ...], device: Optional[torch.device] = None, dtype: Optional[torch.dtype] = None): ... ... diff --git a/torch/onnx/__init__.py b/torch/onnx/__init__.py index 3087eeb593e..f38e0acbadc 100644 --- a/torch/onnx/__init__.py +++ b/torch/onnx/__init__.py @@ -42,60 +42,60 @@ def export(model, args, f, export_params=True, verbose=False, training=TrainingM Args: model (torch.nn.Module): the model to be exported. - args (tuple of arguments or torch.Tensor, a dictionary consisting of named arguments (optional)): - a dictionary to specify the input to the corresponding named parameter: - - KEY: str, named parameter - - VALUE: corresponding input - args can be structured either as: + args (tuple of arguments or torch.Tensor, a dictionary consisting of named arguments (optional)): + a dictionary to specify the input to the corresponding named parameter: + - KEY: str, named parameter + - VALUE: corresponding input + args can be structured either as: - 1. ONLY A TUPLE OF ARGUMENTS or torch.Tensor:: + 1. ONLY A TUPLE OF ARGUMENTS or torch.Tensor:: - ‘’args = (x, y, z)’' + ‘’args = (x, y, z)’' - The inputs to the model, e.g., such that ``model(*args)`` is a valid invocation + The inputs to the model, e.g., such that ``model(*args)`` is a valid invocation of the model. Any non-Tensor arguments will be hard-coded into the exported model; - any Tensor arguments will become inputs of the exported model, in the order they - occur in args. If args is a Tensor, this is equivalent to having - called it with a 1-ary tuple of that Tensor. + any Tensor arguments will become inputs of the exported model, in the order they + occur in args. If args is a Tensor, this is equivalent to having + called it with a 1-ary tuple of that Tensor. - 2. A TUPLE OF ARGUEMENTS WITH A DICTIONARY OF NAMED PARAMETERS:: + 2. A TUPLE OF ARGUEMENTS WITH A DICTIONARY OF NAMED PARAMETERS:: - ‘’args = (x, - { - ‘y’: input_y, - ‘z’: input_z - }) ‘’ + ‘’args = (x, + { + ‘y’: input_y, + ‘z’: input_z + }) ‘’ - The inputs to the model are structured as a tuple consisting of - non-keyword arguments and the last value of this tuple being a dictionary - consisting of named parameters and the corresponding inputs as key-value pairs. - If certain named argument is not present in the dictionary, it is assigned - the default value, or None if default value is not provided. 
+ The inputs to the model are structured as a tuple consisting of + non-keyword arguments and the last value of this tuple being a dictionary + consisting of named parameters and the corresponding inputs as key-value pairs. + If certain named argument is not present in the dictionary, it is assigned + the default value, or None if default value is not provided. - Cases in which an dictionary input is the last input of the args tuple - would cause a conflict when a dictionary of named parameters is used. - The model below provides such an example. + Cases in which an dictionary input is the last input of the args tuple + would cause a conflict when a dictionary of named parameters is used. + The model below provides such an example. - class Model(torch.nn.Module): - def forward(self, k, x): - ... - return x + class Model(torch.nn.Module): + def forward(self, k, x): + ... + return x - m = Model() - k = torch.randn(2, 3)   - x = {torch.tensor(1.): torch.randn(2, 3)} + m = Model() + k = torch.randn(2, 3)   + x = {torch.tensor(1.): torch.randn(2, 3)} In the previous iteration, the call to export API would look like - torch.onnx.export(model, (k, x), ‘test.onnx’) + torch.onnx.export(model, (k, x), ‘test.onnx’) - This would work as intended. However, the export function - would now assume that the ‘x’ input is intended to represent the optional - dictionary consisting of named arguments. In order to prevent this from being - an issue a constraint is placed to provide an empty dictionary as the last - input in the tuple args in such cases. The new call would look like this. + This would work as intended. However, the export function + would now assume that the ‘x’ input is intended to represent the optional + dictionary consisting of named arguments. In order to prevent this from being + an issue a constraint is placed to provide an empty dictionary as the last + input in the tuple args in such cases. The new call would look like this. - torch.onnx.export(model, (k, x, {}), ‘test.onnx’) + torch.onnx.export(model, (k, x, {}), ‘test.onnx’) f: a file-like object (has to implement fileno that returns a file descriptor) or a string containing a file name. A binary Protobuf will be written diff --git a/torch/onnx/symbolic_helper.py b/torch/onnx/symbolic_helper.py index f9ed767151c..735d8a8ba73 100644 --- a/torch/onnx/symbolic_helper.py +++ b/torch/onnx/symbolic_helper.py @@ -304,7 +304,7 @@ def _generate_wrapped_number(g, scalar): wrapped as 0-dim int64 tensors and floating-point types are wrapped as 0-dim double tensors. - The input to this function is constant value. If the data type + The input to this function is constant value. If the data type is a floating point type, it is converted to a 0-dim double tensor, else it is converted to a 0-dim tensor of its original type """ diff --git a/torch/onnx/symbolic_opset13.py b/torch/onnx/symbolic_opset13.py index e27dee0bd27..cabbfeaa6d9 100644 --- a/torch/onnx/symbolic_opset13.py +++ b/torch/onnx/symbolic_opset13.py @@ -186,8 +186,8 @@ def unsafe_chunk(g, self, chunks, dim, _outputs=None): if leftover: splits.append(leftover) - # TODO: So far we don't have a module using this method. We'll keep - # this as a constant unless we see a request of dynamics in any + # TODO: So far we don't have a module using this method. We'll keep + # this as a constant unless we see a request of dynamics in any # user's modules. 
splits = g.op("Constant", value_t=torch.tensor(splits, dtype=torch.long)) return g.op("Split", self, splits, axis_i=dim, outputs=_outputs) diff --git a/torch/optim/_multi_tensor/adadelta.py b/torch/optim/_multi_tensor/adadelta.py index 7c600fafd45..53779fddfdb 100644 --- a/torch/optim/_multi_tensor/adadelta.py +++ b/torch/optim/_multi_tensor/adadelta.py @@ -57,7 +57,7 @@ class Adadelta(Optimizer): rho, eps = group['rho'], group['eps'] for p in group['params']: - if p.grad is not None: + if p.grad is not None: if p.grad.is_sparse: raise RuntimeError('Adadelta does not support sparse gradients') diff --git a/torch/optim/_multi_tensor/adam.py b/torch/optim/_multi_tensor/adam.py index d539e865d47..c4a111f396a 100644 --- a/torch/optim/_multi_tensor/adam.py +++ b/torch/optim/_multi_tensor/adam.py @@ -107,8 +107,8 @@ class Adam(Optimizer): beta1, beta2 = group['betas'] - bias_correction1 = [1 - beta1 ** state['step'] for state in states] - bias_correction2 = [1 - beta2 ** state['step'] for state in states] + bias_correction1 = [1 - beta1 ** state['step'] for state in states] + bias_correction2 = [1 - beta2 ** state['step'] for state in states] if group['weight_decay'] != 0: grads = torch._foreach_add(grads, params_with_grad, alpha=group['weight_decay']) diff --git a/torch/optim/_multi_tensor/adamw.py b/torch/optim/_multi_tensor/adamw.py index 3670c786b68..dbbeb58ea73 100644 --- a/torch/optim/_multi_tensor/adamw.py +++ b/torch/optim/_multi_tensor/adamw.py @@ -110,8 +110,8 @@ class AdamW(Optimizer): beta1, beta2 = group['betas'] - bias_correction1 = [1 - beta1 ** state['step'] for state in states] - bias_correction2 = [1 - beta2 ** state['step'] for state in states] + bias_correction1 = [1 - beta1 ** state['step'] for state in states] + bias_correction2 = [1 - beta2 ** state['step'] for state in states] # # Decay the first and second moment running average coefficient diff --git a/torch/optim/_multi_tensor/rprop.py b/torch/optim/_multi_tensor/rprop.py index d2a3eca755d..8c30c45ba29 100644 --- a/torch/optim/_multi_tensor/rprop.py +++ b/torch/optim/_multi_tensor/rprop.py @@ -81,7 +81,7 @@ class Rprop(Optimizer): # for dir<0, dfdx=0 # for dir>=0 dfdx=dfdx - for i in range(len(grads)): + for i in range(len(grads)): grads[i] = grads[i].clone(memory_format=torch.preserve_format) grads[i][signs[i].eq(etaminus)] = 0 diff --git a/torch/optim/_multi_tensor/sgd.py b/torch/optim/_multi_tensor/sgd.py index a1f5772871f..fb43025cde2 100644 --- a/torch/optim/_multi_tensor/sgd.py +++ b/torch/optim/_multi_tensor/sgd.py @@ -37,7 +37,7 @@ class SGD(Optimizer): p_{t+1} & = p_{t} - \text{lr} * v_{t+1}, \end{aligned} - where :math:`p`, :math:`g`, :math:`v` and :math:`\mu` denote the + where :math:`p`, :math:`g`, :math:`v` and :math:`\mu` denote the parameters, gradient, velocity, and momentum respectively. This is in contrast to Sutskever et. al. 
and @@ -105,7 +105,7 @@ class SGD(Optimizer): if p.grad.is_sparse: has_sparse_grad = True - if momentum != 0: + if momentum != 0: raise RuntimeError('SGD does not support momentum for sparse gradients') if grads == []: @@ -148,7 +148,7 @@ class SGD(Optimizer): torch._foreach_add_(params_with_grad, grads, alpha=-group['lr']) else: # foreach APIs dont support sparse - for i in range(len(params_with_grad)): + for i in range(len(params_with_grad)): params_with_grad[i].add_(grads[i], alpha=-group['lr']) return loss diff --git a/torch/optim/sgd.py b/torch/optim/sgd.py index 2e2e425b602..772f636141f 100644 --- a/torch/optim/sgd.py +++ b/torch/optim/sgd.py @@ -38,7 +38,7 @@ class SGD(Optimizer): p_{t+1} & = p_{t} - \text{lr} * v_{t+1}, \end{aligned} - where :math:`p`, :math:`g`, :math:`v` and :math:`\mu` denote the + where :math:`p`, :math:`g`, :math:`v` and :math:`\mu` denote the parameters, gradient, velocity, and momentum respectively. This is in contrast to Sutskever et. al. and diff --git a/torch/optim/swa_utils.py b/torch/optim/swa_utils.py index adcb88aa159..4c28c5ab1a0 100644 --- a/torch/optim/swa_utils.py +++ b/torch/optim/swa_utils.py @@ -14,23 +14,23 @@ class AveragedModel(Module): (UAI 2018). AveragedModel class creates a copy of the provided module :attr:`model` - on the device :attr:`device` and allows to compute running averages of the + on the device :attr:`device` and allows to compute running averages of the parameters of the :attr:`model`. Args: model (torch.nn.Module): model to use with SWA device (torch.device, optional): if provided, the averaged model will be - stored on the :attr:`device` - avg_fn (function, optional): the averaging function used to update - parameters; the function must take in the current value of the + stored on the :attr:`device` + avg_fn (function, optional): the averaging function used to update + parameters; the function must take in the current value of the :class:`AveragedModel` parameter, the current value of :attr:`model` - parameter and the number of models already averaged; if None, + parameter and the number of models already averaged; if None, equally weighted average is used (default: None) Example: >>> loader, optimizer, model, loss_fn = ... >>> swa_model = torch.optim.swa_utils.AveragedModel(model) - >>> scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, + >>> scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, >>> T_max=300) >>> swa_start = 160 >>> swa_scheduler = SWALR(optimizer, swa_lr=0.05) @@ -46,7 +46,7 @@ class AveragedModel(Module): >>> scheduler.step() >>> >>> # Update bn statistics for the swa_model at the end - >>> torch.optim.swa_utils.update_bn(loader, swa_model) + >>> torch.optim.swa_utils.update_bn(loader, swa_model) You can also use custom averaging functions with `avg_fn` parameter. If no averaging function is provided, the default is to compute @@ -59,7 +59,7 @@ class AveragedModel(Module): >>> swa_model = torch.optim.swa_utils.AveragedModel(model, avg_fn=ema_avg) .. note:: - When using SWA with models containing Batch Normalization you may + When using SWA with models containing Batch Normalization you may need to update the activation statistics for Batch Normalization. You can do so by using :meth:`torch.optim.swa_utils.update_bn` utility. @@ -67,7 +67,7 @@ class AveragedModel(Module): :attr:`avg_fn` is not saved in the :meth:`state_dict` of the model. .. note:: - When :meth:`update_parameters` is called for the first time (i.e. 
+ When :meth:`update_parameters` is called for the first time (i.e. :attr:`n_averaged` is `0`) the parameters of `model` are copied to the parameters of :class:`AveragedModel`. For every subsequent call of :meth:`update_parameters` the function `avg_fn` is used @@ -80,7 +80,7 @@ class AveragedModel(Module): https://arxiv.org/abs/1806.05594 .. _SWALP: Stochastic Weight Averaging in Low-Precision Training: https://arxiv.org/abs/1904.11943 - .. _Stochastic Weight Averaging in Parallel: Large-Batch Training That + .. _Stochastic Weight Averaging in Parallel: Large-Batch Training That Generalizes Well: https://arxiv.org/abs/2001.02312 """ @@ -130,12 +130,12 @@ def update_bn(loader, model, device=None): Example: >>> loader, model = ... - >>> torch.optim.swa_utils.update_bn(loader, model) + >>> torch.optim.swa_utils.update_bn(loader, model) .. note:: The `update_bn` utility assumes that each data batch in :attr:`loader` - is either a tensor or a list or tuple of tensors; in the latter case it - is assumed that :meth:`model.forward()` should be called on the first + is either a tensor or a list or tuple of tensors; in the latter case it + is assumed that :meth:`model.forward()` should be called on the first element of the list or tuple corresponding to the data batch. """ momenta = {} @@ -170,30 +170,30 @@ def update_bn(loader, model, device=None): class SWALR(_LRScheduler): r"""Anneals the learning rate in each parameter group to a fixed value. - This learning rate scheduler is meant to be used with Stochastic Weight + This learning rate scheduler is meant to be used with Stochastic Weight Averaging (SWA) method (see `torch.optim.swa_utils.AveragedModel`). Args: optimizer (torch.optim.Optimizer): wrapped optimizer swa_lrs (float or list): the learning rate value for all param groups together or separately for each group. - annealing_epochs (int): number of epochs in the annealing phase + annealing_epochs (int): number of epochs in the annealing phase (default: 10) - annealing_strategy (str): "cos" or "linear"; specifies the annealing + annealing_strategy (str): "cos" or "linear"; specifies the annealing strategy: "cos" for cosine annealing, "linear" for linear annealing (default: "cos") last_epoch (int): the index of the last epoch (default: 'cos') The :class:`SWALR` scheduler is can be used together with other - schedulers to switch to a constant learning rate late in the training + schedulers to switch to a constant learning rate late in the training as in the example below. Example: >>> loader, optimizer, model = ... 
>>> lr_lambda = lambda epoch: 0.9 - >>> scheduler = torch.optim.lr_scheduler.MultiplicativeLR(optimizer, + >>> scheduler = torch.optim.lr_scheduler.MultiplicativeLR(optimizer, >>> lr_lambda=lr_lambda) - >>> swa_scheduler = torch.optim.swa_utils.SWALR(optimizer, + >>> swa_scheduler = torch.optim.swa_utils.SWALR(optimizer, >>> anneal_strategy="linear", anneal_epochs=20, swa_lr=0.05) >>> swa_start = 160 >>> for i in range(300): @@ -222,7 +222,7 @@ class SWALR(_LRScheduler): self.anneal_func = self._linear_anneal if not isinstance(anneal_epochs, int) or anneal_epochs < 0: raise ValueError("anneal_epochs must be equal or greater than 0, got {}".format( - anneal_epochs)) + anneal_epochs)) self.anneal_epochs = anneal_epochs super(SWALR, self).__init__(optimizer, last_epoch) @@ -266,5 +266,5 @@ class SWALR(_LRScheduler): for group in self.optimizer.param_groups] t = max(0, min(1, step / max(1, self.anneal_epochs))) alpha = self.anneal_func(t) - return [group['swa_lr'] * alpha + lr * (1 - alpha) + return [group['swa_lr'] * alpha + lr * (1 - alpha) for group, lr in zip(self.optimizer.param_groups, prev_lrs)] diff --git a/torch/optim/swa_utils.pyi b/torch/optim/swa_utils.pyi index d3500c86840..3a6fd451608 100644 --- a/torch/optim/swa_utils.pyi +++ b/torch/optim/swa_utils.pyi @@ -5,7 +5,7 @@ from .. import device, Tensor from typing import Iterable, Any, Optional, Callable, Union, List class AveragedModel(Module): - def __init__(self, model: Module, device: Union[int, device]=..., + def __init__(self, model: Module, device: Union[int, device]=..., avg_fn: Callable[[Tensor, Tensor, int], Tensor]=...) -> None:... def update_parameters(self, model: Module) -> None:... @@ -13,5 +13,5 @@ class AveragedModel(Module): def update_bn(loader: Iterable, model: Module, device: Union[int, device]=...) -> None:... class SWALR(_LRScheduler): - def __init__(self, optimizer: Optimizer, swa_lr: float, anneal_epochs: int, + def __init__(self, optimizer: Optimizer, swa_lr: float, anneal_epochs: int, anneal_strategy: str, last_epoch: int=...) -> None:... 
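The torch/optim/sgd.py hunk above reflows the docstring that states the implemented update rule, v_{t+1} = mu * v_t + g_{t+1} and p_{t+1} = p_t - lr * v_{t+1}. As a quick illustration of that rule (a minimal sketch with arbitrary toy values, not anything taken from this patch), the following replays a few steps by hand and compares them with torch.optim.SGD:

```python
import torch

# Toy parameter and a plain quadratic loss; the values below are illustrative only.
p = torch.tensor([1.0, 2.0], requires_grad=True)
opt = torch.optim.SGD([p], lr=0.1, momentum=0.9)

p_ref = p.detach().clone()     # manual copy of the parameter
v = torch.zeros_like(p_ref)    # manual velocity buffer

for _ in range(3):
    opt.zero_grad()
    (p ** 2).sum().backward()
    g = p.grad.detach().clone()
    opt.step()

    # documented update: v <- mu * v + g,  p <- p - lr * v
    v = 0.9 * v + g
    p_ref = p_ref - 0.1 * v

print(torch.allclose(p, p_ref))  # expected: True
```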
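The torch/optim/swa_utils.py hunks above trim trailing spaces in the AveragedModel, update_bn, and SWALR docstrings. The workflow those docstrings describe fits in one self-contained sketch; the toy model, synthetic data, epoch counts, and learning rates below are illustrative assumptions, not values from the patch:

```python
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset

# Hypothetical toy regression setup (not part of the patch).
model = nn.Sequential(nn.Linear(10, 32), nn.BatchNorm1d(32), nn.ReLU(), nn.Linear(32, 1))
loader = DataLoader(TensorDataset(torch.randn(256, 10), torch.randn(256, 1)), batch_size=32)
loss_fn = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)

swa_model = torch.optim.swa_utils.AveragedModel(model)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=30)
swa_scheduler = torch.optim.swa_utils.SWALR(optimizer, swa_lr=0.05)
swa_start = 20

for epoch in range(30):
    for x, y in loader:
        optimizer.zero_grad()
        loss_fn(model(x), y).backward()
        optimizer.step()
    if epoch > swa_start:
        swa_model.update_parameters(model)  # fold the current weights into the running average
        swa_scheduler.step()
    else:
        scheduler.step()

# Recompute BatchNorm statistics for the averaged model before evaluation.
torch.optim.swa_utils.update_bn(loader, swa_model)

# An exponential moving average can be used instead via `avg_fn`, e.g.:
# ema_avg = lambda avg_p, p, n: 0.1 * avg_p + 0.9 * p
# ema_model = torch.optim.swa_utils.AveragedModel(model, avg_fn=ema_avg)
```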
diff --git a/torch/package/_importlib.py b/torch/package/_importlib.py index 1b521ca1a96..0d183ec27be 100644 --- a/torch/package/_importlib.py +++ b/torch/package/_importlib.py @@ -1,6 +1,6 @@ import _warnings import os.path -# note: implementations +# note: implementations # copied from cpython's import code diff --git a/torch/sparse/__init__.py b/torch/sparse/__init__.py index 9ed1b0dc02a..c3f3318459e 100644 --- a/torch/sparse/__init__.py +++ b/torch/sparse/__init__.py @@ -53,10 +53,10 @@ def mm(mat1: Tensor, mat2: Tensor) -> Tensor: Args: mat1 (SparseTensor): the first sparse matrix to be multiplied - mat2 (Tensor): the second matrix to be multiplied, which could be sparse or dense + mat2 (Tensor): the second matrix to be multiplied, which could be sparse or dense Shape: - The format of the output tensor of this function follows: + The format of the output tensor of this function follows: - sparse x sparse -> sparse - sparse x dense -> dense diff --git a/torch/testing/_internal/common_cuda.py b/torch/testing/_internal/common_cuda.py index d3af775d228..b7acff8cfec 100644 --- a/torch/testing/_internal/common_cuda.py +++ b/torch/testing/_internal/common_cuda.py @@ -66,7 +66,7 @@ def tf32_off(): try: torch.backends.cuda.matmul.allow_tf32 = False with torch.backends.cudnn.flags(enabled=None, benchmark=None, deterministic=None, allow_tf32=False): - yield + yield finally: torch.backends.cuda.matmul.allow_tf32 = old_allow_tf32_matmul @@ -79,7 +79,7 @@ def tf32_on(self, tf32_precision=1e-5): torch.backends.cuda.matmul.allow_tf32 = True self.precision = tf32_precision with torch.backends.cudnn.flags(enabled=None, benchmark=None, deterministic=None, allow_tf32=True): - yield + yield finally: torch.backends.cuda.matmul.allow_tf32 = old_allow_tf32_matmul self.precision = old_precison diff --git a/torch/testing/_internal/common_jit.py b/torch/testing/_internal/common_jit.py index a93e13b665b..0241a6ed4bb 100644 --- a/torch/testing/_internal/common_jit.py +++ b/torch/testing/_internal/common_jit.py @@ -143,15 +143,15 @@ class JitCommonTestCase(TestCase): torch.jit.save(imported, fname) return torch.jit.load(fname, map_location=map_location) - def autoDiffErrorMessage(self, should_autodiff_node, nodes_not_in_diff_graph, - fusion_nodes_not_found, non_fusible_nodes_being_fused, + def autoDiffErrorMessage(self, should_autodiff_node, nodes_not_in_diff_graph, + fusion_nodes_not_found, non_fusible_nodes_being_fused, fusion_nodes_found, nodes_in_diff_graph): err_msg = "\nFailure in testing nodes' autodifferentiation. " if should_autodiff_node: err_msg += "One or more nodes were expected to be autodiffed, " \ "but were not found in specified fusible/nonfusible " \ "DifferentiableGraph groups. \nSpecifically:" - # The node is intended to appear in a differentiable graph but doesn't + # The node is intended to appear in a differentiable graph but doesn't diff_nodes_missing = [] # The node is intended to appear in a differentiable graph # outside of a fusion group but instead is in a fusion group @@ -196,7 +196,7 @@ class JitCommonTestCase(TestCase): "Did you intend for these nodes to be fused? If not, you should " \ "move these nodes into the test's nonfusible nodes. Otherwise your " \ "autodifferentiation logic might be wrong." - else: + else: err_msg += "One or more nodes were not expected to be autodiffed " \ "but were found in a DifferentiableGraph or in a FusionGroup " \ "of a DifferentiableGraph. 
Did you intend for these nodes to be " \ @@ -226,7 +226,7 @@ class JitCommonTestCase(TestCase): for node in nonfusible_nodes: if any(g.findNode(node) is not None for g in diff_subgraphs): nodes_in_diff_graph.append(node) - else: + else: nodes_not_in_diff_graph.append(node) if any(g.findNode(node) is not None for g in fusion_subgraphs): non_fusible_nodes_being_fused.append(node) @@ -239,14 +239,14 @@ class JitCommonTestCase(TestCase): if any(g.findNode(node) is not None for g in fusion_subgraphs): fusion_nodes_found.append(node) else: - fusion_nodes_not_found.append(node) - found_all_fusible_nodes = len(fusion_nodes_found) == len(fusible_nodes) + fusion_nodes_not_found.append(node) + found_all_fusible_nodes = len(fusion_nodes_found) == len(fusible_nodes) - err_msg = self.autoDiffErrorMessage(should_autodiff_node, - nodes_not_in_diff_graph, - fusion_nodes_not_found, + err_msg = self.autoDiffErrorMessage(should_autodiff_node, + nodes_not_in_diff_graph, + fusion_nodes_not_found, non_fusible_nodes_being_fused, - fusion_nodes_found, + fusion_nodes_found, nodes_in_diff_graph) - self.assertEqual(should_autodiff_node, - found_all_nonfusible_nodes and found_all_fusible_nodes, err_msg) + self.assertEqual(should_autodiff_node, + found_all_nonfusible_nodes and found_all_fusible_nodes, err_msg) diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py index f1c217b7c30..d0b98011f23 100644 --- a/torch/testing/_internal/common_utils.py +++ b/torch/testing/_internal/common_utils.py @@ -1526,7 +1526,7 @@ def retry(ExceptionToCheck, tries=3, delay=3, skip_after_retries=False): # Methods for matrix and tensor generation # Used in test_autograd.py and test_torch.py -def make_tensor(size, device: torch.device, dtype: torch.dtype, *, low=None, high=None, +def make_tensor(size, device: torch.device, dtype: torch.dtype, *, low=None, high=None, requires_grad: bool = False, discontiguous: bool = False) -> torch.Tensor: """ Creates a random tensor with the given size, device and dtype. diff --git a/torch/testing/_internal/jit_metaprogramming_utils.py b/torch/testing/_internal/jit_metaprogramming_utils.py index f38bab08695..9e2fe2a8859 100644 --- a/torch/testing/_internal/jit_metaprogramming_utils.py +++ b/torch/testing/_internal/jit_metaprogramming_utils.py @@ -296,7 +296,7 @@ def gen_script_fn_and_args(method_name, func_type, *args, **kwargs): # returns a function takes in (args, kwargs) and runs the compiled function def create_script_fn(self, method_name, func_type): # function returns tuple containing original output and - # filtered output to be used in checking gradients + # filtered output to be used in checking gradients def script_fn(*args, **kwargs): fn, tensors = gen_script_fn_and_args(method_name, func_type, *args, **kwargs) self.assertExportImport(fn.graph, tensors) diff --git a/torch/testing/_internal/print_test_stats.py b/torch/testing/_internal/print_test_stats.py index 50e0345205a..c3222ebd238 100755 --- a/torch/testing/_internal/print_test_stats.py +++ b/torch/testing/_internal/print_test_stats.py @@ -708,7 +708,7 @@ class TestFile: if suite_name not in self.test_suites: self.test_suites[suite_name] = TestSuite(suite_name) if test_case.name in self.test_suites[suite_name].test_cases: - # We expect duplicate tests for test_cpp_extensions_aot, distributed/test_distributed_fork, + # We expect duplicate tests for test_cpp_extensions_aot, distributed/test_distributed_fork, # and distributed/test_distributed_spawn. 
In these cases, we store the test case that took the longest, # as in these jobs, the duplicate tests are run in parallel. # For other unexpected cases, we should raise a warning. @@ -718,7 +718,7 @@ class TestFile: self.name == 'cpp': # The caffe2 cpp tests spawn duplicate test cases as well. time_difference = self.test_suites[suite_name].replace(test_case) self.total_time += time_difference - else: + else: raise RuntimeWarning(f'Duplicate test case {test_case.name} in suite {suite_name} called from {self.name}') else: self.test_suites[suite_name].append(test_case) @@ -818,8 +818,8 @@ def assemble_s3_object( 'cases': { name: { 'seconds': case.time, - 'status': 'skipped' if case.skipped else - 'errored' if case.errored else + 'status': 'skipped' if case.skipped else + 'errored' if case.errored else 'failed' if case.failed else None } for name, case in suite.test_cases.items() diff --git a/torch/utils/benchmark/utils/sparse_fuzzer.py b/torch/utils/benchmark/utils/sparse_fuzzer.py index fa984e41c9c..1b2e884ce95 100644 --- a/torch/utils/benchmark/utils/sparse_fuzzer.py +++ b/torch/utils/benchmark/utils/sparse_fuzzer.py @@ -2,7 +2,7 @@ from typing import Optional, Tuple, Union from numbers import Number import torch from torch.utils.benchmark import FuzzedTensor -import math +import math class FuzzedSparseTensor(FuzzedTensor): def __init__( @@ -36,13 +36,13 @@ class FuzzedSparseTensor(FuzzedTensor): The length of `size` will be truncated to this value. This allows Tensors of varying dimensions to be generated by the Fuzzer. - sparse_dim: + sparse_dim: The number of sparse dimensions in a sparse tensor. - density: - This value allows tensors of varying sparsities to be generated by the Fuzzer. - coalesced: + density: + This value allows tensors of varying sparsities to be generated by the Fuzzer. + coalesced: The sparse tensor format permits uncoalesced sparse tensors, - where there may be duplicate coordinates in the indices. + where there may be duplicate coordinates in the indices. dtype: The PyTorch dtype of the generated Tensor. cuda: @@ -60,11 +60,11 @@ class FuzzedSparseTensor(FuzzedTensor): Note that when `is_coalesced` is False, the number of elements is doubled but the number of indices represents the same number of non-zeros `nnz`, i.e., this is virtually the same tensor - with the same sparsity pattern. Moreover, most of the sparse operation will use coalesce() method - and what we want here is to get a sparse tensor with the same `nnz` even if this is coalesced or not. + with the same sparsity pattern. Moreover, most sparse operations will call the coalesce() method, + and what we want here is to get a sparse tensor with the same `nnz` whether or not it is coalesced. - In the other hand when `is_coalesced` is True the number of elements is reduced in the coalescing process - by an unclear amount however the probability to generate duplicates indices are low for most of the cases. + On the other hand, when `is_coalesced` is True the number of elements is reduced in the coalescing process + by an unclear amount; however, the probability of generating duplicate indices is low in most cases. This decision was made on purpose to keep the construction cost as low as possible.
""" if isinstance(size, Number): @@ -80,14 +80,14 @@ class FuzzedSparseTensor(FuzzedTensor): i.mul_(torch.tensor(size[:sparse_dim]).unsqueeze(1).to(i)) i = i.to(torch.long) - if not is_coalesced: + if not is_coalesced: v = torch.cat([v, torch.randn_like(v)], 0) i = torch.cat([i, i], 1) x = torch.sparse_coo_tensor(i, v, torch.Size(size)) if is_coalesced: x = x.coalesce() - return x + return x def _make_tensor(self, params, state): size, _, _ = self._get_size_and_steps(params) diff --git a/torch/utils/hipify/cuda_to_hip_mappings.py b/torch/utils/hipify/cuda_to_hip_mappings.py index 9b7bab90cce..6a5854aec95 100644 --- a/torch/utils/hipify/cuda_to_hip_mappings.py +++ b/torch/utils/hipify/cuda_to_hip_mappings.py @@ -7773,7 +7773,7 @@ CUDA_SPARSE_MAP = collections.OrderedDict( ), ("cusparseCreateCsrgemm2Info", ("hipsparseCreateCsrgemm2Info", CONV_MATH_FUNC, API_SPARSE)), ( - "cusparseDestroyCsrgemm2Info", + "cusparseDestroyCsrgemm2Info", ("hipsparseDestroyCsrgemm2Info", CONV_MATH_FUNC, API_SPARSE), ), ("cusparseXcsrgemm2Nnz", ("hipsparseXcsrgemm2Nnz", CONV_MATH_FUNC, API_SPARSE)),