From b9cb1abf656cc145cebb30e47bf64fe507dd51ea Mon Sep 17 00:00:00 2001
From: cyy
Date: Fri, 2 Aug 2024 13:46:44 +0000
Subject: [PATCH] [12/N] Use std::optional (#132361)

Follows #132396

Pull Request resolved: https://github.com/pytorch/pytorch/pull/132361
Approved by: https://github.com/eqy
---
 aten/src/ATen/core/NamedTensor.cpp | 2 +-
 .../impl/make_boxed_from_unboxed_functor.h | 2 +-
 aten/src/ATen/core/op_registration/adaption.h | 8 +-
 .../ATen/functorch/BatchRulesActivation.cpp | 4 +-
 .../ATen/functorch/BatchRulesBinaryOps.cpp | 14 +-
 .../ATen/functorch/BatchRulesConvolution.cpp | 6 +-
 aten/src/ATen/functorch/BatchRulesFactory.cpp | 20 +--
 aten/src/ATen/functorch/BatchRulesHelper.h | 10 +-
 aten/src/ATen/functorch/BatchRulesLoss.cpp | 8 +-
 aten/src/ATen/functorch/BatchRulesModules.cpp | 10 +-
 aten/src/ATen/functorch/BatchRulesNorm.cpp | 16 +-
 aten/src/ATen/functorch/BatchRulesPooling.cpp | 6 +-
 .../ATen/functorch/BatchRulesReduceOps.cpp | 6 +-
 .../ATen/functorch/BatchRulesScatterOps.cpp | 116 +++++++++---------
 .../src/ATen/functorch/BatchRulesUnaryOps.cpp | 4 +-
 aten/src/ATen/functorch/BatchRulesViews.cpp | 26 ++--
 .../ATen/functorch/BatchingMetaprogramming.h | 4 +-
 aten/src/ATen/functorch/DynamicLayer.cpp | 2 +-
 aten/src/ATen/functorch/PlumbingHelper.cpp | 2 +-
 aten/src/ATen/functorch/PlumbingHelper.h | 2 +-
 aten/src/ATen/native/LinearAlgebra.cpp | 2 +-
 aten/src/ATen/native/TensorConversions.cpp | 2 +-
 .../native/transformers/cuda/attention.cu | 2 +-
 .../cuda/flash_attn/flash_api.cpp | 2 +-
 .../transformers/cuda/flash_attn/flash_api.h | 2 +-
 c10/core/DeviceGuard.h | 2 +-
 c10/core/StreamGuard.h | 2 +-
 c10/core/impl/InlineDeviceGuard.h | 5 +-
 c10/core/impl/InlineStreamGuard.h | 3 +-
 c10/cuda/CUDAGuard.h | 6 +-
 c10/util/Optional.h | 2 +-
 .../api/include/torch/data/dataloader/base.h | 4 +-
 .../include/torch/data/dataloader_options.h | 4 +-
 .../api/include/torch/data/datasets/base.h | 5 +-
 .../api/include/torch/data/detail/queue.h | 2 +-
 .../include/torch/data/detail/sequencers.h | 6 +-
 torch/csrc/api/include/torch/data/iterator.h | 2 +-
 .../api/include/torch/data/samplers/base.h | 2 +-
 .../include/torch/data/samplers/distributed.h | 4 +-
 .../api/include/torch/data/samplers/random.h | 2 +-
 .../include/torch/data/samplers/sequential.h | 2 +-
 .../api/include/torch/data/samplers/stream.h | 2 +-
 .../include/torch/nn/modules/container/any.h | 4 +-
 torch/csrc/api/src/data/datasets/mnist.cpp | 3 +-
 .../api/src/data/samplers/distributed.cpp | 9 +-
 torch/csrc/api/src/data/samplers/random.cpp | 2 +-
 .../csrc/api/src/data/samplers/sequential.cpp | 4 +-
 torch/csrc/api/src/data/samplers/stream.cpp | 4 +-
 torch/csrc/autograd/python_engine.cpp | 2 +-
 torch/csrc/distributed/c10d/Store.hpp | 2 +-
 .../inductor/aoti_eager/kernel_holder.cpp | 2 +-
 torch/csrc/jit/mobile/train/random.cpp | 4 +-
 torch/csrc/jit/mobile/train/random.h | 2 +-
 torch/csrc/jit/mobile/train/sequential.cpp | 2 +-
 torch/csrc/jit/mobile/train/sequential.h | 2 +-
 torch/csrc/profiler/unwind/debug_info.h | 2 +-
 .../profiler/unwind/dwarf_symbolize_enums.h | 4 +-
 .../profiler/unwind/line_number_program.h | 2 +-
 torch/csrc/profiler/unwind/range_table.h | 9 +-
 torch/csrc/profiler/unwind/sections.h | 2 +-
 torch/csrc/profiler/unwind/unwind_error.h | 2 -
 torchgen/executorch/api/unboxing.py | 12 +-
 torchgen/gen_vmap_plumbing.py | 4 +-
 63 files changed, 204 insertions(+), 208 deletions(-)
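Illustrative sketch, not part of the patch itself: the mechanical change applied across all 63 files is to spell the bare `optional` (the old c10 alias) as `std::optional`. The snippet below shows the typical signature shape involved; `my_batch_rule` and the stand-in `Tensor` struct are hypothetical names used only so the example is self-contained.

    #include <cstdint>
    #include <optional>
    #include <tuple>

    struct Tensor {};  // stand-in type so the sketch compiles on its own

    // Before this series the return and argument types were written as
    // std::tuple<Tensor, optional<int64_t>> and optional<int64_t>; afterwards
    // the std:: qualification is explicit everywhere.
    std::tuple<Tensor, std::optional<int64_t>> my_batch_rule(
        const Tensor& self, std::optional<int64_t> self_bdim) {
      // Batch rules return the result plus the (possibly absent) batch-dim
      // index; std::nullopt means "this output is not batched".
      std::optional<int64_t> out_bdim;
      if (self_bdim.has_value()) {
        out_bdim = 0;  // batched output keeps its batch dim at position 0
      }
      return std::make_tuple(self, out_bdim);
    }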
diff --git a/aten/src/ATen/core/NamedTensor.cpp b/aten/src/ATen/core/NamedTensor.cpp
index 776a4a55460..b1126e21226 100644
--- a/aten/src/ATen/core/NamedTensor.cpp
+++ b/aten/src/ATen/core/NamedTensor.cpp
@@ -127,7 +127,7 @@ void internal_set_names_inplace(TensorImpl* impl, std::vector&& names,
   }
 }
 
-optional get_opt_names(const TensorImpl* impl) {
+std::optional get_opt_names(const TensorImpl* impl) {
   const auto* meta = get_named_tensor_meta(impl);
   if (meta == nullptr) {
     return std::nullopt;
diff --git a/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h b/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h
index c771dd548ea..729691c1cd8 100644
--- a/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h
+++ b/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h
@@ -392,7 +392,7 @@ namespace impl {
     }
   };
   template
-  struct ivalue_to_arg>, AllowDeprecatedTypes> final {
+  struct ivalue_to_arg>, AllowDeprecatedTypes> final {
     // If an argument is std::optional>, convert the IValue to an std::optional> and pass that
     // to the operator. OptionalArray is basically a std::optional> but implicitly convertible
     // to std::optional>.
diff --git a/aten/src/ATen/core/op_registration/adaption.h b/aten/src/ATen/core/op_registration/adaption.h
index 66a5c36ba65..035a816f84e 100644
--- a/aten/src/ATen/core/op_registration/adaption.h
+++ b/aten/src/ATen/core/op_registration/adaption.h
@@ -45,7 +45,7 @@ namespace impl {
 
 TORCH_API void common_device_check_failure(Device common_device, const at::Tensor& tensor, at::CheckedFrom methodName, at::CheckedFrom argName);
 
-inline void check_and_update_common_device(optional& common_device, const at::Tensor& tensor, at::CheckedFrom methodName, at::CheckedFrom argName) {
+inline void check_and_update_common_device(std::optional& common_device, const at::Tensor& tensor, at::CheckedFrom methodName, at::CheckedFrom argName) {
   // TODO: Remove this once the following issue is addressed:
   // https://github.com/pytorch/pytorch/issues/57380
   if (!tensor.defined()) {
@@ -62,19 +62,19 @@ inline void check_and_update_common_device(optional& common_device, cons
   }
 }
 
-inline void check_and_update_common_device(optional& common_device, const std::optional& tensor, at::CheckedFrom methodName, at::CheckedFrom argName) {
+inline void check_and_update_common_device(std::optional& common_device, const std::optional& tensor, at::CheckedFrom methodName, at::CheckedFrom argName) {
   if (tensor.has_value()) {
     check_and_update_common_device(common_device, tensor.value(), methodName, argName);
   }
 }
 
-inline void check_and_update_common_device(optional& common_device, at::ITensorListRef tensors, at::CheckedFrom methodName, at::CheckedFrom argName) {
+inline void check_and_update_common_device(std::optional& common_device, at::ITensorListRef tensors, at::CheckedFrom methodName, at::CheckedFrom argName) {
   for (const auto& tensor : tensors) {
     check_and_update_common_device(common_device, tensor, methodName, argName);
   }
 }
 
-inline void check_and_update_common_device(optional& common_device, const List>& tensors, at::CheckedFrom methodName, at::CheckedFrom argName) {
+inline void check_and_update_common_device(std::optional& common_device, const List>& tensors, at::CheckedFrom methodName, at::CheckedFrom argName) {
   for (const auto& tensor : tensors) {
     check_and_update_common_device(common_device, tensor, methodName, argName);
   }
diff --git a/aten/src/ATen/functorch/BatchRulesActivation.cpp b/aten/src/ATen/functorch/BatchRulesActivation.cpp
index ce71e4459e5..dbcc6738040 100644
--- a/aten/src/ATen/functorch/BatchRulesActivation.cpp
+++ b/aten/src/ATen/functorch/BatchRulesActivation.cpp
@@ -11,7 +11,7 @@ // NB: most activation
functions fit pointwise unary or binary rules. // These are only the ones that have special batch rules to help with organization namespace at::functorch { -static std::tuple> +static std::tuple> glu_batch_rule(const Tensor& self, std::optional self_bdim, int64_t dim) { // repeated error message from glu because 0D -> 1D when batched // this can't pass anyway because a 0-dimensional tensor has "size" 1, which @@ -27,7 +27,7 @@ glu_batch_rule(const Tensor& self, std::optional self_bdim, int64_t dim return std::make_tuple(res, 0); } -static std::tuple> glu_backward_batch_rule( +static std::tuple> glu_backward_batch_rule( const Tensor& grad_output, std::optional grad_output_bdim, const Tensor& self, std::optional self_bdim, int64_t dim) { if (self_bdim) { diff --git a/aten/src/ATen/functorch/BatchRulesBinaryOps.cpp b/aten/src/ATen/functorch/BatchRulesBinaryOps.cpp index 0d562b55fe8..0580be792a3 100644 --- a/aten/src/ATen/functorch/BatchRulesBinaryOps.cpp +++ b/aten/src/ATen/functorch/BatchRulesBinaryOps.cpp @@ -14,7 +14,7 @@ namespace at::functorch { template -std::tuple> _binary_pointwise_batch_rule( +std::tuple> _binary_pointwise_batch_rule( const Tensor& tensor, std::optional tensor_batch_dim, const Tensor& other, std::optional other_batch_dim, ExtraArgs... extra_args) { @@ -33,7 +33,7 @@ struct BinaryPointwiseBatchRuleHelper; template struct BinaryPointwiseBatchRuleHelper> { - static std::tuple> apply( + static std::tuple> apply( const Tensor& tensor, std::optional tensor_batch_dim, const Tensor& other, std::optional other_batch_dim, T... extra_args) { @@ -120,7 +120,7 @@ void binary_pointwise_inplace_batch_rule( } template -std::tuple> comparison_pointwise_batch_rule( +std::tuple> comparison_pointwise_batch_rule( const Tensor& tensor, std::optional tensor_batch_dim, const Tensor& other, std::optional other_batch_dim) { // compute max logical rank @@ -142,7 +142,7 @@ std::tuple> comparison_pointwise_batch_rule( return std::make_tuple( std::move(result), 0 ); } -static std::tuple> where_self_batch_rule( +static std::tuple> where_self_batch_rule( const Tensor& condition, std::optional condition_bdim, const Tensor& self, std::optional self_bdim, const Tensor& other, std::optional other_bdim) { auto condition_logical_rank = rankWithoutBatchDim(condition, condition_bdim); @@ -177,7 +177,7 @@ static std::tuple> gelu_backward_batch_rule( return std::make_tuple(at::gelu_backward(grad_out_, input_, approximate), 0); } -static std::tuple> masked_select_batch_rule( +static std::tuple> masked_select_batch_rule( const Tensor& self, std::optional self_bdim, const Tensor& mask, std::optional mask_bdim) { TORCH_CHECK(!mask_bdim.has_value(), @@ -196,7 +196,7 @@ static std::tuple> masked_select_batch_rule( return std::make_tuple(result, 0); } -static std::tuple> masked_select_backward_batch_rule( +static std::tuple> masked_select_backward_batch_rule( const Tensor& grad, std::optional grad_bdim, const Tensor& self, std::optional self_bdim, const Tensor& mask, std::optional mask_bdim) { @@ -221,7 +221,7 @@ static std::tuple> masked_select_backward_batch_rule( return std::make_tuple(result, 0); } -static std::tuple> cdist_backward_batch_rule( +static std::tuple> cdist_backward_batch_rule( const Tensor& grad, std::optional grad_bdim, const Tensor& x1, std::optional x1_bdim, const Tensor& x2, std::optional x2_bdim, diff --git a/aten/src/ATen/functorch/BatchRulesConvolution.cpp b/aten/src/ATen/functorch/BatchRulesConvolution.cpp index d62fa092fd3..c3db153cdcd 100644 --- 
a/aten/src/ATen/functorch/BatchRulesConvolution.cpp +++ b/aten/src/ATen/functorch/BatchRulesConvolution.cpp @@ -16,7 +16,7 @@ namespace at::functorch { // PyTorch's convolution is different from JAX's conv_general_dilated: // we do not support batch_group_count (which is needed for convolution backwards). // Instead, there's a convolution_backward op that needs a batching rule. -static std::tuple> +static std::tuple> convolution_batch_rule(const Tensor& lhs, std::optional lhs_bdim, const Tensor& rhs, std::optional rhs_bdim, const std::optional& bias, std::optional bias_bdim, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups) { DimVector lhs_spec(stride.size() + 2); std::iota(lhs_spec.begin(), lhs_spec.end(), 0); @@ -239,7 +239,7 @@ static Tensor make_dummy( return tensor_.new_empty({}).expand(expand_shape); } -static std::tuple> +static std::tuple> convolution_backward_input_batch_rule( const Tensor& grad_output, std::optional grad_output_bdim, const Tensor& input, std::optional input_bdim, @@ -320,7 +320,7 @@ convolution_backward_input_batch_rule( return std::make_tuple(std::get<0>(result), std::nullopt); } } -static std::tuple> +static std::tuple> convolution_backward_weight_batch_rule( const Tensor& grad_output, std::optional grad_output_bdim, const Tensor& input, std::optional input_bdim, diff --git a/aten/src/ATen/functorch/BatchRulesFactory.cpp b/aten/src/ATen/functorch/BatchRulesFactory.cpp index 5f012712e47..73fa5bc20cd 100644 --- a/aten/src/ATen/functorch/BatchRulesFactory.cpp +++ b/aten/src/ATen/functorch/BatchRulesFactory.cpp @@ -14,7 +14,7 @@ struct NewBlahBatchRuleHelperSymInt; template struct NewBlahBatchRuleHelperSymInt> { - static std::tuple> apply( + static std::tuple> apply( const Tensor& tensor, std::optional batch_dim, SymIntArrayRef shape, @@ -33,7 +33,7 @@ struct NewBlahBatchRuleHelper; template struct NewBlahBatchRuleHelper> { - static std::tuple> apply( + static std::tuple> apply( const Tensor& tensor, std::optional batch_dim, IntArrayRef shape, @@ -62,7 +62,7 @@ struct NewBlahBatchRuleHelper> { &fn,\ c10::guts::function_traits::parameter_types>::apply) -static std::tuple> _new_zeros_with_same_feature_meta_batch_rule( +static std::tuple> _new_zeros_with_same_feature_meta_batch_rule( const Tensor& self, std::optional self_bdim, const Tensor& other, std::optional other_bdim, int64_t self_num_batch_dims) { @@ -103,7 +103,7 @@ static std::tuple> _new_zeros_with_same_feature_meta_ba return std::make_tuple(result, 0); } -static std::tuple> linspace_logspace_batch_rule_helper( +static std::tuple> linspace_logspace_batch_rule_helper( const at::Tensor& start, std::optional start_bdim, const at::Tensor& end, std::optional end_bdim, int64_t steps, @@ -141,7 +141,7 @@ static std::tuple> linspace_logspace_batch_rule_helper( return std::make_tuple(result, 0); } -static std::tuple> linspace_Tensor_Tensor_batch_rule( +static std::tuple> linspace_Tensor_Tensor_batch_rule( const at::Tensor& start, std::optional start_bdim, const at::Tensor& end, std::optional end_bdim, int64_t steps, @@ -152,7 +152,7 @@ static std::tuple> linspace_Tensor_Tensor_batch_rule( return linspace_logspace_batch_rule_helper(start, start_bdim, end, end_bdim, steps, std::nullopt, dtype, layout, device, pin_memory); } -static std::tuple> linspace_Tensor_Scalar_batch_rule( +static std::tuple> linspace_Tensor_Scalar_batch_rule( const at::Tensor& start, std::optional start_bdim, const at::Scalar& end, 
int64_t steps, @@ -165,7 +165,7 @@ static std::tuple> linspace_Tensor_Scalar_batch_rule( return linspace_logspace_batch_rule_helper(start, start_bdim, end_t, std::nullopt, steps, std::nullopt, dtype, layout, device, pin_memory); } -static std::tuple> linspace_Scalar_Tensor_batch_rule( +static std::tuple> linspace_Scalar_Tensor_batch_rule( const at::Scalar& start, const at::Tensor& end, std::optional end_bdim, int64_t steps, @@ -178,7 +178,7 @@ static std::tuple> linspace_Scalar_Tensor_batch_rule( return linspace_logspace_batch_rule_helper(start_t, std::nullopt, end, end_bdim, steps, std::nullopt, dtype, layout, device, pin_memory); } -static std::tuple> logspace_Tensor_Tensor_batch_rule( +static std::tuple> logspace_Tensor_Tensor_batch_rule( const at::Tensor& start, std::optional start_bdim, const at::Tensor& end, std::optional end_bdim, int64_t steps, @@ -190,7 +190,7 @@ static std::tuple> logspace_Tensor_Tensor_batch_rule( return linspace_logspace_batch_rule_helper(start, start_bdim, end, end_bdim, steps, std::make_optional(base), dtype, layout, device, pin_memory); } -static std::tuple> logspace_Tensor_Scalar_batch_rule( +static std::tuple> logspace_Tensor_Scalar_batch_rule( const at::Tensor& start, std::optional start_bdim, const at::Scalar& end, int64_t steps, @@ -204,7 +204,7 @@ static std::tuple> logspace_Tensor_Scalar_batch_rule( return linspace_logspace_batch_rule_helper(start, start_bdim, end_t, std::nullopt, steps, std::make_optional(base), dtype, layout, device, pin_memory); } -static std::tuple> logspace_Scalar_Tensor_batch_rule( +static std::tuple> logspace_Scalar_Tensor_batch_rule( const at::Scalar& start, const at::Tensor& end, std::optional end_bdim, int64_t steps, diff --git a/aten/src/ATen/functorch/BatchRulesHelper.h b/aten/src/ATen/functorch/BatchRulesHelper.h index 30b3e8154fd..f95e5c8d66d 100644 --- a/aten/src/ATen/functorch/BatchRulesHelper.h +++ b/aten/src/ATen/functorch/BatchRulesHelper.h @@ -33,7 +33,7 @@ TORCH_API Tensor reshape_dim_outof_symint(int64_t src, const c10::SymInt& size1, Tensor moveBatchDimToFront(const Tensor& tensor, std::optional maybe_batch_dim); int64_t rankWithoutBatchDim(const Tensor& tensor, std::optional maybe_batch_dim); int64_t numelWithoutBatchDim(const Tensor& tensor, std::optional maybe_batch_dim); -optional valIfNonempty(optional maybe_empty, int64_t new_val); +std::optional valIfNonempty(std::optional maybe_empty, int64_t new_val); int64_t getPhysicalDim(const Tensor& tensor, bool has_batch_dim, int64_t logical_dim); VmapDimVector getPhysicalDims(const Tensor& tensor, bool has_batch_dim, IntArrayRef logical_dims); @@ -71,7 +71,7 @@ struct BasicUnaryBatchRuleHelper; template struct BasicUnaryBatchRuleHelper> { - static std::tuple> apply( + static std::tuple> apply( const Tensor& tensor, std::optional batch_dim, T... extra_args) { @@ -96,7 +96,7 @@ struct VariadicBdimsBatchRuleHelper; template struct VariadicBdimsBatchRuleHelper> { - static std::tuple> apply( + static std::tuple> apply( const Tensor& tensor, std::optional batch_dim, T... extra_args) { @@ -201,7 +201,7 @@ inline void handle_variadic_bdims(std::vector>()); -using UnpackedBatchedTensor = std::tuple>; +using UnpackedBatchedTensor = std::tuple>; inline void find_and_unpack_tensors( const torch::jit::Stack* stack, @@ -384,7 +384,7 @@ struct ExistingBdimBatchRuleHelper; template struct ExistingBdimBatchRuleHelper> { - static std::tuple> apply( + static std::tuple> apply( const Tensor& self, std::optional self_bdim, T... 
extra_args) { diff --git a/aten/src/ATen/functorch/BatchRulesLoss.cpp b/aten/src/ATen/functorch/BatchRulesLoss.cpp index b897d78d27a..e920378ab3f 100644 --- a/aten/src/ATen/functorch/BatchRulesLoss.cpp +++ b/aten/src/ATen/functorch/BatchRulesLoss.cpp @@ -27,7 +27,7 @@ static at::Tensor flatten_logical(const Tensor& tensor, std::optional b // Useful for many loss functions template -static std::tuple> +static std::tuple> loss_batch_rule_helper(const at::Tensor& self, std::optional self_bdim, const at::Tensor& target, std::optional target_bdim, int64_t reduction, Func loss_fn) { @@ -49,7 +49,7 @@ loss_batch_rule_helper(const at::Tensor& self, std::optional self_bdim, TORCH_INTERNAL_ASSERT(false); }; -static std::tuple> +static std::tuple> mse_loss_batch_rule(const at::Tensor& self, std::optional self_bdim, const at::Tensor& target, std::optional target_bdim, int64_t reduction) { return loss_batch_rule_helper(self, self_bdim, target, target_bdim, @@ -58,7 +58,7 @@ mse_loss_batch_rule(const at::Tensor& self, std::optional self_bdim, co }); }; -static std::tuple> +static std::tuple> huber_loss_batch_rule(const at::Tensor& self, std::optional self_bdim, const at::Tensor& target, std::optional target_bdim, int64_t reduction, double delta) { return loss_batch_rule_helper(self, self_bdim, target, target_bdim, @@ -67,7 +67,7 @@ huber_loss_batch_rule(const at::Tensor& self, std::optional self_bdim, }); }; -static std::tuple> +static std::tuple> smooth_l1_loss_batch_rule(const at::Tensor& self, std::optional self_bdim, const at::Tensor& target, std::optional target_bdim, int64_t reduction, double beta) { return loss_batch_rule_helper(self, self_bdim, target, target_bdim, diff --git a/aten/src/ATen/functorch/BatchRulesModules.cpp b/aten/src/ATen/functorch/BatchRulesModules.cpp index 7b0f8bd7c42..99a5a434d54 100644 --- a/aten/src/ATen/functorch/BatchRulesModules.cpp +++ b/aten/src/ATen/functorch/BatchRulesModules.cpp @@ -20,7 +20,7 @@ static Tensor getStepTensor(const Tensor& indices, const c10::SymInt& bdim_size, return range.view_symint(view_shape); } -static std::tuple> embedding_batch_rule( +static std::tuple> embedding_batch_rule( const Tensor& weight, std::optional weight_bdim, const Tensor& indices, std::optional indices_bdim, c10::SymInt padding_idx, bool scale_grad_by_freq, bool sparse) { @@ -50,7 +50,7 @@ static std::tuple> embedding_batch_rule( return std::make_tuple(std::move(result), 0); } -static std::tuple> +static std::tuple> embedding_dense_backward_batch_rule( const Tensor& grad_, std::optional grad_bdim, const Tensor& indices_, std::optional indices_bdim, @@ -109,7 +109,7 @@ embedding_dense_backward_batch_rule( * output: (BN)CD_{out}H_{out}W_{out} */ template -std::tuple> +std::tuple> grid_sample_batch_rule(const Tensor& input, std::optional input_bdim, const Tensor& grid, std::optional grid_bdim, ExtraArgs... extra_args) { std::tuple> result; if (input_bdim && !grid_bdim) { @@ -256,7 +256,7 @@ struct UpsampleBackwardBatchRuleHelper; template struct UpsampleBackwardBatchRuleHelper> { - static std::tuple> apply( + static std::tuple> apply( const Tensor& grad_output, std::optional grad_output_bdim, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, T... extra_args) { @@ -282,7 +282,7 @@ struct GridSampleBatchRuleHelper; template struct GridSampleBatchRuleHelper> { - static std::tuple> apply( + static std::tuple> apply( const Tensor& input, std::optional input_batch_dim, const Tensor& grid, std::optional grid_batch_dim, T... 
extra_args) { diff --git a/aten/src/ATen/functorch/BatchRulesNorm.cpp b/aten/src/ATen/functorch/BatchRulesNorm.cpp index ceea3328108..609c87486f2 100644 --- a/aten/src/ATen/functorch/BatchRulesNorm.cpp +++ b/aten/src/ATen/functorch/BatchRulesNorm.cpp @@ -42,7 +42,7 @@ static Tensor padRight(const Tensor& tensor, std::optional has_bdim, in } template -std::tuple,Tensor,optional,Tensor,optional> +std::tuple,Tensor, std::optional,Tensor, std::optional> batch_norm_batch_rule( const Tensor& input, std::optional input_bdim, const std::optional& weight_opt, std::optional weight_bdim, @@ -124,7 +124,7 @@ batch_norm_batch_rule( } template -std::tuple> batch_norm_backward_no_weight_bias_batch_rule( +std::tuple> batch_norm_backward_no_weight_bias_batch_rule( const at::Tensor & grad_out, std::optional grad_out_bdim, const at::Tensor & input, std::optional input_bdim, const std::optional & running_mean_opt, std::optional running_mean_bdim, @@ -337,7 +337,7 @@ static std::tuple native_group_norm_plumbing( return std::make_tuple(result0, mean, rstd); } -static std::tuple> group_norm_backward_no_weight_bias_batch_rule( +static std::tuple> group_norm_backward_no_weight_bias_batch_rule( const at::Tensor & grad_out, std::optional grad_out_bdim, const at::Tensor & input, std::optional input_bdim, const at::Tensor & mean, std::optional mean_bdim, @@ -484,7 +484,7 @@ C10_ALWAYS_INLINE void _check_layer_norm_inputs( check_same_shape(bias, bias_bdim, normalized_shape, "weight"); } -static std::tuple,Tensor,optional,Tensor,optional> +static std::tuple,Tensor, std::optional,Tensor, std::optional> native_layer_norm_batch_rule( const Tensor& input, std::optional input_bdim, c10::SymIntArrayRef normalized_shape, @@ -530,7 +530,7 @@ native_layer_norm_batch_rule( return std::make_tuple(result0, 0, mean, stats_bdim, rstd, stats_bdim); } -static std::tuple> native_layer_norm_backward_no_weight_bias_batch_rule( +static std::tuple> native_layer_norm_backward_no_weight_bias_batch_rule( const at::Tensor & grad_out, std::optional grad_out_bdim, const at::Tensor & input, std::optional input_bdim, at::IntArrayRef normalized_shape, @@ -651,7 +651,7 @@ static std::tuple native_layer_norm_backward_p template struct NativeBatchNormBatchRuleHelper { - static std::tuple,Tensor,optional,Tensor,optional> apply( + static std::tuple,Tensor, std::optional,Tensor, std::optional> apply( const Tensor& input, std::optional input_bdim, const std::optional& weight_opt, std::optional weight_bdim, const std::optional& bias_opt, std::optional bias_bdim, @@ -666,7 +666,7 @@ struct NativeBatchNormBatchRuleHelper { template struct CudnnBatchNormBatchRuleHelper { - static std::tuple,Tensor,optional,Tensor,optional,Tensor,optional> apply( + static std::tuple,Tensor, std::optional,Tensor, std::optional,Tensor, std::optional> apply( const Tensor& input, std::optional input_bdim, const Tensor& weight_opt, std::optional weight_bdim, const std::optional& bias_opt, std::optional bias_bdim, @@ -683,7 +683,7 @@ struct CudnnBatchNormBatchRuleHelper { template struct MiopenBatchNormBatchRuleHelper { - static std::tuple,Tensor,optional,Tensor,optional> apply( + static std::tuple,Tensor, std::optional,Tensor, std::optional> apply( const Tensor& input, std::optional input_bdim, const Tensor& weight_opt, std::optional weight_bdim, const std::optional& bias_opt, std::optional bias_bdim, diff --git a/aten/src/ATen/functorch/BatchRulesPooling.cpp b/aten/src/ATen/functorch/BatchRulesPooling.cpp index 93013f38311..cafd7bbee0e 100644 --- 
a/aten/src/ATen/functorch/BatchRulesPooling.cpp +++ b/aten/src/ATen/functorch/BatchRulesPooling.cpp @@ -12,7 +12,7 @@ namespace at::functorch { template -std::tuple,Tensor,optional> +std::tuple,Tensor, std::optional> max_pool_with_indices_batch_rule_helper( const Tensor& self, std::optional self_bdim, IntArrayRef kernel_size, IntArrayRef stride, @@ -37,7 +37,7 @@ max_pool_with_indices_batch_rule_helper( reshape_dim_outof(0, bdim_size, std::get<1>(result)), 0); } -static std::tuple,Tensor,optional> +static std::tuple,Tensor, std::optional> max_pool3d_with_indices_batch_rule( const Tensor& self, std::optional self_bdim, IntArrayRef kernel_size, IntArrayRef stride, @@ -45,7 +45,7 @@ max_pool3d_with_indices_batch_rule( return max_pool_with_indices_batch_rule_helper(self, self_bdim, kernel_size, stride, padding, dilation, ceil_mode, 3, at::max_pool3d_with_indices); } -static std::tuple,Tensor,optional> +static std::tuple,Tensor, std::optional> max_pool2d_with_indices_batch_rule( const Tensor& self, std::optional self_bdim, IntArrayRef kernel_size, IntArrayRef stride, diff --git a/aten/src/ATen/functorch/BatchRulesReduceOps.cpp b/aten/src/ATen/functorch/BatchRulesReduceOps.cpp index 178e73ca8d9..6c2a9f98471 100644 --- a/aten/src/ATen/functorch/BatchRulesReduceOps.cpp +++ b/aten/src/ATen/functorch/BatchRulesReduceOps.cpp @@ -256,7 +256,7 @@ static std::tuple expand_bdims( b_has_bdim ? b : b.expand_as(flagpole)); } -static std::tuple> _softmax_backward_batch_rule( +static std::tuple> _softmax_backward_batch_rule( const Tensor& grad_output, std::optional grad_output_bdim, const Tensor& output, std::optional output_bdim, int64_t dim, @@ -286,7 +286,7 @@ static std::tuple> _softmax_backward_batch_rule( return std::make_tuple(at::_softmax_backward_data(grad_output_, output_.contiguous(), dim, input_dtype), 0); } -static std::tuple> _log_softmax_backward_batch_rule( +static std::tuple> _log_softmax_backward_batch_rule( const Tensor& grad_output, std::optional grad_output_bdim, const Tensor& output, std::optional output_bdim, int64_t dim, @@ -314,7 +314,7 @@ static std::tuple> _log_softmax_backward_batch_rule( return std::make_tuple(at::_log_softmax_backward_data(grad_output_, output_, dim, input_dtype), 0); } -static std::tuple> searchsorted_batch_rule( +static std::tuple> searchsorted_batch_rule( const Tensor& sorted_sequence, std::optional sorted_sequence_bdim, const Tensor& self, diff --git a/aten/src/ATen/functorch/BatchRulesScatterOps.cpp b/aten/src/ATen/functorch/BatchRulesScatterOps.cpp index 0b00233eae3..8626f4eb9fe 100644 --- a/aten/src/ATen/functorch/BatchRulesScatterOps.cpp +++ b/aten/src/ATen/functorch/BatchRulesScatterOps.cpp @@ -17,7 +17,7 @@ namespace at::functorch { namespace { -static bool any_has_value(ArrayRef> bdims) { +static bool any_has_value(ArrayRef> bdims) { for (const auto& bdim : bdims) { if (bdim.has_value()) { return true; @@ -26,7 +26,7 @@ static bool any_has_value(ArrayRef> bdims) { return false; } -static int64_t get_num_leading_nones(ArrayRef> indices) { +static int64_t get_num_leading_nones(ArrayRef> indices) { int64_t result = 0; for (const auto& idx : indices) { if (!idx.has_value() || !idx->defined()) { @@ -39,8 +39,8 @@ static int64_t get_num_leading_nones(ArrayRef> indices) { } static int64_t get_max_index_logical_dim( - ArrayRef> indices, - ArrayRef> indices_bdims) { + ArrayRef> indices, + ArrayRef> indices_bdims) { int64_t max_logical_dim = -1; TORCH_INTERNAL_ASSERT(indices.size() == indices_bdims.size()); TORCH_INTERNAL_ASSERT(!indices.empty()); @@ -55,9 +55,9 
@@ static int64_t get_max_index_logical_dim( return max_logical_dim; } -static std::vector> batchIndices( - ArrayRef> indices, - ArrayRef> indices_bdims, +static std::vector> batchIndices( + ArrayRef> indices, + ArrayRef> indices_bdims, int64_t batch_size, std::optional self_bdim, std::optional values_bdim = std::nullopt) { @@ -82,7 +82,7 @@ static std::vector> batchIndices( // There is one more case worth mentioning - boolean tensor indices. If we // have "batched" boolean tensor indices, that is unrepresentable, as each // batch would result in a tensor with different values. - std::vector> indices_; + std::vector> indices_; int64_t maxLogicalRank = get_max_index_logical_dim(indices, indices_bdims); bool indices_batched = any_has_value(indices_bdims); @@ -133,7 +133,7 @@ static bool is_advanced_index(const std::optional& idx) { } // See NOTE: [advanced indices adjacent] for definition -static bool are_advanced_indices_adjacent(ArrayRef> indices) { +static bool are_advanced_indices_adjacent(ArrayRef> indices) { int64_t num_advanced_indices_regions = 0; bool in_advanced_indices_region = false; for (const auto& idx : indices) { @@ -171,11 +171,11 @@ static Tensor swap_regions(const Tensor& tensor, int64_t first_region_size, int6 return tensor.permute(permutation); } -std::tuple> index_batch_rule( +std::tuple> index_batch_rule( const Tensor& self, std::optional self_bdim, - ArrayRef> indices, - ArrayRef> indices_bdims) { + ArrayRef> indices, + ArrayRef> indices_bdims) { // NOTE: [advanced indexing (index.Tensor) batch rule] // @@ -240,7 +240,7 @@ std::tuple> index_batch_rule( auto max_index_dim = get_max_index_logical_dim(indices, indices_bdims); // Step 2 - auto res = at::index(self_, List>(batched_indices)); + auto res = at::index(self_, List>(batched_indices)); // Step 3: There are three cases (these match the cases outlined in batchIndices) bool self_batched = self_bdim.has_value(); @@ -315,8 +315,8 @@ std::tuple> index_batch_rule( return std::make_tuple(swap_regions(res, max_index_dim, num_leading_nones), 0); } -// plumbing done since we don't support List> in codegen -Tensor index_plumbing(const Tensor & self, const List> & indices +// plumbing done since we don't support List> in codegen +Tensor index_plumbing(const Tensor & self, const List> & indices ) { c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchBatched); auto maybe_layer = maybeCurrentDynamicLayer(); @@ -326,8 +326,8 @@ Tensor index_plumbing(const Tensor & self, const List> & indice return at::index(self, indices); } auto [self_value, self_bdim] = unwrapTensorAtLevel(self, cur_level); - std::vector> indices_value; - std::vector> indices_bdims; + std::vector> indices_value; + std::vector> indices_bdims; for (const auto&& indRef : indices) { std::optional ind = indRef; std::optional index; @@ -399,11 +399,11 @@ namespace { return compute_indexed_shape(self, indices); } - std::tuple>, Tensor> + std::tuple>, Tensor> index_put_batch_rule_helper(const Tensor &self, std::optional self_bdim, - ArrayRef> indices, - ArrayRef> indices_bdims, + ArrayRef> indices, + ArrayRef> indices_bdims, const Tensor &values, std::optional values_bdim, std::optional opt_batch_size = {}) { @@ -420,7 +420,7 @@ namespace { // we've already made sure that self has bdim at 0. 
const auto indices_ = batchIndices(indices, indices_bdims, batch_size, /*self_bdim=*/0, values_bdim); - auto indexed_shape = get_indexed_shape(self_, List>(indices_)); + auto indexed_shape = get_indexed_shape(self_, List>(indices_)); // handle broadcasting support for values // Eg. Given `indexed_shape.size()` is 5 and @@ -452,12 +452,12 @@ namespace { } auto unpackSelfAndIndicesAndValuesAtCurrentLevel(const Tensor &self, - const List> &indices, + const List> &indices, const Tensor &values, int64_t cur_level) { auto [self_value, self_bdim] = unwrapTensorAtLevel(self, cur_level); - std::vector> indices_value; - std::vector> indices_bdims; + std::vector> indices_value; + std::vector> indices_bdims; for (const auto &&indRef : indices) { std::optional ind = indRef; @@ -478,8 +478,8 @@ namespace { void index_put__batch_rule( const Tensor& self, std::optional self_bdim, - ArrayRef> indices, - ArrayRef> indices_bdims, + ArrayRef> indices, + ArrayRef> indices_bdims, const Tensor& values, std::optional values_bdim, bool accumulate) { @@ -488,11 +488,11 @@ void index_put__batch_rule( } auto [self_, indices_, values_] = index_put_batch_rule_helper( self, self_bdim, indices, indices_bdims, values, values_bdim); - at::index_put_(self_, List>(indices_), values_, accumulate); + at::index_put_(self_, List>(indices_), values_, accumulate); } -// plumbing done since we don't support List> in codegen -Tensor& index_put__plumbing(Tensor & self, const List> & indices +// plumbing done since we don't support List> in codegen +Tensor& index_put__plumbing(Tensor & self, const List> & indices , const Tensor & values, bool accumulate) { c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchBatched); auto maybe_layer = maybeCurrentDynamicLayer(); @@ -517,8 +517,8 @@ Tensor& index_put__plumbing(Tensor & self, const List> & indice void _index_put_impl__batch_rule( const Tensor& self, std::optional self_bdim, - ArrayRef> indices, - ArrayRef> indices_bdims, + ArrayRef> indices, + ArrayRef> indices_bdims, const Tensor& values, std::optional values_bdim, bool accumulate, @@ -528,11 +528,11 @@ void _index_put_impl__batch_rule( } auto [self_, indices_, values_] = index_put_batch_rule_helper( self, self_bdim, indices, indices_bdims, values, values_bdim); - at::_index_put_impl_(self_, List>(indices_), values_, accumulate, unsafe); + at::_index_put_impl_(self_, List>(indices_), values_, accumulate, unsafe); } -// plumbing done since we don't support List> in codegen -Tensor &_index_put_impl__plumbing(Tensor &self, const List> &indices, +// plumbing done since we don't support List> in codegen +Tensor &_index_put_impl__plumbing(Tensor &self, const List> &indices, const Tensor &values, bool accumulate, bool unsafe) { c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchBatched); auto maybe_layer = maybeCurrentDynamicLayer(); @@ -549,8 +549,8 @@ Tensor &_index_put_impl__plumbing(Tensor &self, const List> &in static Tensor maybe_permute_values( const Tensor& values, - ArrayRef> orig_indices, - ArrayRef> orig_indices_bdims) { + ArrayRef> orig_indices, + ArrayRef> orig_indices_bdims) { bool indices_batched = any_has_value(orig_indices_bdims); bool advanced_indices_are_adjacent = are_advanced_indices_adjacent(orig_indices); auto num_leading_nones = get_num_leading_nones(orig_indices); @@ -602,11 +602,11 @@ static Tensor maybe_permute_values( return swap_regions(values, num_leading_nones, max_index_dim); } -std::tuple> index_put_batch_rule( +std::tuple> index_put_batch_rule( const Tensor& self, std::optional 
self_bdim, - ArrayRef> indices, - ArrayRef> indices_bdims, + ArrayRef> indices, + ArrayRef> indices_bdims, const Tensor& values, std::optional values_bdim, bool accumulate) { @@ -641,12 +641,12 @@ std::tuple> index_put_batch_rule( // and the batched `indices_` might change the "have adjacent advanced indices" property values_ = maybe_permute_values(values_, indices, indices_bdims); - auto result = at::index_put(self_, List>(indices_), values_, accumulate); + auto result = at::index_put(self_, List>(indices_), values_, accumulate); return std::make_tuple(result, 0); } -// plumbing done since we don't support List> in codegen -Tensor index_put_plumbing(const Tensor & self, const List> & indices, +// plumbing done since we don't support List> in codegen +Tensor index_put_plumbing(const Tensor & self, const List> & indices, const Tensor & values, bool accumulate) { c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchBatched); auto maybe_layer = maybeCurrentDynamicLayer(); @@ -671,7 +671,7 @@ Tensor index_put_plumbing(const Tensor & self, const List> & in namespace { template -std::tuple> scatter_batch_rule( +std::tuple> scatter_batch_rule( Func f, const Tensor& self, std::optional self_bdim, int64_t dim, @@ -703,7 +703,7 @@ std::tuple> scatter_batch_rule( } template -inline std::tuple> scatter_batch_rule( +inline std::tuple> scatter_batch_rule( Func f, const Tensor& self, std::optional self_bdim, int64_t dim, @@ -742,7 +742,7 @@ inline std::tuple> scatter_batch_rule( } // namespace -std::tuple> scatter_value_batch_rule( +std::tuple> scatter_value_batch_rule( const Tensor& self, std::optional self_bdim, int64_t dim, const Tensor& index, std::optional index_bdim, @@ -751,7 +751,7 @@ std::tuple> scatter_value_batch_rule( self, self_bdim, dim, index, index_bdim, value); } -std::tuple> scatter_src_batch_rule( +std::tuple> scatter_src_batch_rule( const Tensor& self, std::optional self_bdim, int64_t dim, const Tensor& index, std::optional index_bdim, @@ -760,7 +760,7 @@ std::tuple> scatter_src_batch_rule( self, self_bdim, dim, index, index_bdim, src, src_bdim); } -std::tuple> scatter_add_batch_rule( +std::tuple> scatter_add_batch_rule( const Tensor& self, std::optional self_bdim, int64_t dim, const Tensor& index, std::optional index_bdim, @@ -769,7 +769,7 @@ std::tuple> scatter_add_batch_rule( self, self_bdim, dim, index, index_bdim, src, src_bdim); } -std::tuple> scatter_reduce_batch_rule( +std::tuple> scatter_reduce_batch_rule( const Tensor& self, std::optional self_bdim, int64_t dim, const Tensor& index, std::optional index_bdim, @@ -779,7 +779,7 @@ std::tuple> scatter_reduce_batch_rule( self, self_bdim, dim, index, index_bdim, src, src_bdim, reduce); } -std::tuple> scatter_value_reduce_batch_rule( +std::tuple> scatter_value_reduce_batch_rule( const Tensor& self, std::optional self_bdim, int64_t dim, const Tensor& index, std::optional index_bdim, @@ -789,7 +789,7 @@ std::tuple> scatter_value_reduce_batch_rule( self, self_bdim, dim, index, index_bdim, src, reduce); } -std::tuple> gather_batch_rule( +std::tuple> gather_batch_rule( const Tensor& self, std::optional self_bdim, int64_t dim, const Tensor& index, std::optional index_bdim, @@ -922,7 +922,7 @@ std::tuple> diagonal_scatter_batch_rule( return std::make_tuple(at::diagonal_scatter(self_, src_, offset, dim1, dim2), 0); } -std::tuple> index_add_batch_rule_impl( +std::tuple> index_add_batch_rule_impl( Tensor& self, std::optional self_bdim, int64_t dim, const Tensor& index, std::optional index_bdim, @@ -1004,7 +1004,7 @@ void 
index_add__batch_rule( other_bdim, alpha, true); } -std::tuple> index_add_batch_rule( +std::tuple> index_add_batch_rule( Tensor& self, std::optional self_bdim, int64_t dim, const Tensor& index, std::optional index_bdim, @@ -1038,7 +1038,7 @@ static std::tuple binary_pointwise_align( return std::make_tuple(tensor_, other_); } -std::tuple> masked_fill_scalar_batch_rule( +std::tuple> masked_fill_scalar_batch_rule( const Tensor & self, std::optional self_bdim, const Tensor & mask, @@ -1049,7 +1049,7 @@ std::tuple> masked_fill_scalar_batch_rule( return std::make_tuple(result, 0); } -std::tuple> index_fill_batch_rule_helper( +std::tuple> index_fill_batch_rule_helper( int64_t batch_size, int64_t self_logical_rank, int64_t index_logical_rank, @@ -1085,7 +1085,7 @@ std::tuple> index_fill_batch_rule_helper( return std::make_tuple(self_, 0); } -std::tuple> index_fill_int_scalar_batch_rule_impl( +std::tuple> index_fill_int_scalar_batch_rule_impl( Tensor & self, std::optional self_bdim, int64_t dim, const Tensor & index, std::optional index_bdim, @@ -1136,7 +1136,7 @@ std::tuple> index_fill_int_scalar_batch_rule_impl( return index_fill_batch_rule_helper(batch_size, self_logical_rank, index_logical_rank, self_, dim, index_, value); } -std::tuple> index_fill_int_tensor_batch_rule_impl( +std::tuple> index_fill_int_tensor_batch_rule_impl( Tensor & self, std::optional self_bdim, int64_t dim, const Tensor & index, std::optional index_bdim, @@ -1207,7 +1207,7 @@ void index_fill__int_tensor_batch_rule( index_fill_int_tensor_batch_rule_impl(self, self_bdim, dim, index, index_bdim, value, value_bdim, true); } -std::tuple> index_fill_int_scalar_batch_rule( +std::tuple> index_fill_int_scalar_batch_rule( const Tensor & self, std::optional self_bdim, int64_t dim, const Tensor & index, std::optional index_bdim, @@ -1216,7 +1216,7 @@ std::tuple> index_fill_int_scalar_batch_rule( return index_fill_int_scalar_batch_rule_impl(self_, self_bdim, dim, index, index_bdim, value, false); } -std::tuple> index_fill_int_tensor_batch_rule( +std::tuple> index_fill_int_tensor_batch_rule( const Tensor & self, std::optional self_bdim, int64_t dim, const Tensor & index, std::optional index_bdim, diff --git a/aten/src/ATen/functorch/BatchRulesUnaryOps.cpp b/aten/src/ATen/functorch/BatchRulesUnaryOps.cpp index a4a4f378bdf..b26d2c4a419 100644 --- a/aten/src/ATen/functorch/BatchRulesUnaryOps.cpp +++ b/aten/src/ATen/functorch/BatchRulesUnaryOps.cpp @@ -10,7 +10,7 @@ namespace at::functorch { namespace{ -std::tuple> +std::tuple> clone_batch_rule( const Tensor& self, std::optional self_bdim, @@ -48,7 +48,7 @@ clone_batch_rule( return std::make_tuple(result, self_bdim); } -std::tuple> +std::tuple> view_as_complex_batch_rule(const Tensor& self, std::optional self_bdim) { // guard against the user passing in a batch of scalar tensors with batch // size equal to 2. diff --git a/aten/src/ATen/functorch/BatchRulesViews.cpp b/aten/src/ATen/functorch/BatchRulesViews.cpp index 5c6455f1682..966cad36354 100644 --- a/aten/src/ATen/functorch/BatchRulesViews.cpp +++ b/aten/src/ATen/functorch/BatchRulesViews.cpp @@ -36,7 +36,7 @@ namespace at::functorch { // `Tensor sum(const Tensor& self, int64_t dim)`. The signature of the // batch rule has an additional std::optional argument after each // Tensor argument and return. 
So, in this case, the batch rule has signature -// tuple> sum_batch_rule( +// tuple> sum_batch_rule( // const Tensor& self, std::optional self_bdim, int64_t dim); // // The vmap call above invokes the batch rule with `self = tensor`, @@ -90,7 +90,7 @@ namespace at::functorch { namespace{ -std::tuple> unsqueeze_batch_rule( +std::tuple> unsqueeze_batch_rule( const Tensor& self, std::optional self_bdim, int64_t dim) { @@ -101,7 +101,7 @@ std::tuple> unsqueeze_batch_rule( } // NB: repeat is not actually a view, but it is in this file -std::tuple> repeat_batch_rule( +std::tuple> repeat_batch_rule( const Tensor& self, std::optional self_bdim, c10::SymIntArrayRef sizes) { @@ -116,7 +116,7 @@ std::tuple> repeat_batch_rule( } -std::tuple> _unsafe_view_batch_rule( +std::tuple> _unsafe_view_batch_rule( const Tensor& self, std::optional self_bdim, c10::SymIntArrayRef size) { @@ -137,7 +137,7 @@ std::tuple> _unsafe_view_batch_rule( return std::make_tuple(at::_unsafe_view_symint(self_, view_size), 0); } -std::tuple> flip_batch_rule(const Tensor& self, std::optional self_bdim, IntArrayRef dims) { +std::tuple> flip_batch_rule(const Tensor& self, std::optional self_bdim, IntArrayRef dims) { auto self_ = moveBatchDimToFront(self, self_bdim); VmapDimVector new_dims; for (auto i: dims) { @@ -317,7 +317,7 @@ std::tuple> diagonal_batching_rule( return std::make_tuple(std::move(result), 0); } -std::tuple> diagonal_backward_batch_rule( +std::tuple> diagonal_backward_batch_rule( const Tensor& grad_input, std::optional grad_input_bdim, c10::SymIntArrayRef input_sizes, int64_t offset, int64_t dim1, int64_t dim2) { auto logical_rank = rankWithoutBatchDim(grad_input, grad_input_bdim); @@ -331,7 +331,7 @@ std::tuple> diagonal_backward_batch_rule( return std::make_tuple(std::move(result), 0); } -std::tuple> slice_batch_rule( +std::tuple> slice_batch_rule( const Tensor& self, std::optional self_bdim, int64_t dim, @@ -349,7 +349,7 @@ static bool is_allowed_dim_on_scalar_tensor(int64_t dim) { return dim == 0 || dim == -1; } -std::tuple> +std::tuple> transpose_int_batch_rule( const Tensor& self, std::optional self_bdim, @@ -389,7 +389,7 @@ std::tuple> permute_batching_rule( return std::make_tuple(self_.permute(dims_), 0); } -std::tuple> select_backward_batch_rule( +std::tuple> select_backward_batch_rule( const Tensor& grad_input, std::optional grad_input_bdim, c10::SymIntArrayRef input_sizes, int64_t dim, c10::SymInt index) { auto logical_rank = rankWithoutBatchDim(grad_input, grad_input_bdim); @@ -402,7 +402,7 @@ std::tuple> select_backward_batch_rule( return std::make_tuple(std::move(result), 0); } -std::tuple> slice_backward_batch_rule( +std::tuple> slice_backward_batch_rule( const Tensor& grad_input, std::optional grad_input_bdim, SymIntArrayRef input_sizes, int64_t dim, c10::SymInt start, c10::SymInt end, c10::SymInt step) { auto logical_rank = rankWithoutBatchDim(grad_input, grad_input_bdim); @@ -427,7 +427,7 @@ std::tuple> view_batching_rule( return std::make_tuple(self_.view_symint(size_), 0); } -std::tuple> view_copy_batch_rule( +std::tuple> view_copy_batch_rule( const Tensor& self, std::optional self_bdim, c10::SymIntArrayRef size) { @@ -530,7 +530,7 @@ Tensor trace_decomp(const Tensor& tensor) { return tensor.diagonal().sum(); } -std::tuple> tril_batch_rule( +std::tuple> tril_batch_rule( const Tensor& self, std::optional self_bdim, int64_t diagonal = 0) { @@ -540,7 +540,7 @@ std::tuple> tril_batch_rule( return std::make_tuple(std::move(result), 0); } -std::tuple> triu_batch_rule( +std::tuple> triu_batch_rule( 
const Tensor& self, std::optional self_bdim, int64_t diagonal = 0) { diff --git a/aten/src/ATen/functorch/BatchingMetaprogramming.h b/aten/src/ATen/functorch/BatchingMetaprogramming.h index 7d1694466f7..7b9c2aa151e 100644 --- a/aten/src/ATen/functorch/BatchingMetaprogramming.h +++ b/aten/src/ATen/functorch/BatchingMetaprogramming.h @@ -53,7 +53,7 @@ struct IfFirstIsTensorAndSecondisBatchDimThenTailElseNext -struct IfFirstIsTensorAndSecondisBatchDimThenTailElseNext, std::optional, Next, Tail> { +struct IfFirstIsTensorAndSecondisBatchDimThenTailElseNext, std::optional, Next, Tail> { using type = Tail; }; template @@ -61,7 +61,7 @@ struct IfFirstIsTensorAndSecondisBatchDimThenTailElseNext -struct IfFirstIsTensorAndSecondisBatchDimThenTailElseNext&, std::optional, Next, Tail> { +struct IfFirstIsTensorAndSecondisBatchDimThenTailElseNext&, std::optional, Next, Tail> { using type = Tail; }; template diff --git a/aten/src/ATen/functorch/DynamicLayer.cpp b/aten/src/ATen/functorch/DynamicLayer.cpp index bda774106a3..e369c4a590c 100644 --- a/aten/src/ATen/functorch/DynamicLayer.cpp +++ b/aten/src/ATen/functorch/DynamicLayer.cpp @@ -175,7 +175,7 @@ const std::shared_ptr& getLifeHandleForLevel(int64_t level) { return dynamic_layer.interpreter().is_alive_ptr(); } -optional maybeCurrentDynamicLayer() { +std::optional maybeCurrentDynamicLayer() { auto& dynamicLayerStack = dynamicLayerStackAccessor(); if (dynamicLayerStack.empty()) { return {}; diff --git a/aten/src/ATen/functorch/PlumbingHelper.cpp b/aten/src/ATen/functorch/PlumbingHelper.cpp index 95c9ea5f350..197d43fc9b8 100644 --- a/aten/src/ATen/functorch/PlumbingHelper.cpp +++ b/aten/src/ATen/functorch/PlumbingHelper.cpp @@ -82,7 +82,7 @@ bool isBatchedAtLevel(const c10::List>& maybe_tensors, int return false; } -bool areAnyBatchedAtLevel(ArrayRef> maybe_tensors, int64_t level) { +bool areAnyBatchedAtLevel(ArrayRef> maybe_tensors, int64_t level) { for (const auto& maybe_tensor : maybe_tensors) { if (isBatchedAtLevel(maybe_tensor, level)) { return true; diff --git a/aten/src/ATen/functorch/PlumbingHelper.h b/aten/src/ATen/functorch/PlumbingHelper.h index 5de6e90701e..c22793ef49d 100644 --- a/aten/src/ATen/functorch/PlumbingHelper.h +++ b/aten/src/ATen/functorch/PlumbingHelper.h @@ -47,7 +47,7 @@ TORCH_API bool isBatchedAtLevel(const Tensor& tensor, int64_t level); TORCH_API bool isBatchedAtLevel(const std::optional& maybe_tensor, int64_t level); // Convenience helper. 
Returns true if any tensor is batched at level -TORCH_API bool areAnyBatchedAtLevel(ArrayRef> maybe_tensors, int64_t level); +TORCH_API bool areAnyBatchedAtLevel(ArrayRef> maybe_tensors, int64_t level); inline bool ivalueParticipatesInCurrentLevel(const IValue& ivalue) { if (ivalue.isTensor()) { diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index a9082b7d24f..06111933571 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -156,7 +156,7 @@ namespace at { namespace detail { - static void check_linalg_norm_dtype(optional opt_dtype, ScalarType self_dtype, const char* const name) { + static void check_linalg_norm_dtype(std::optional opt_dtype, ScalarType self_dtype, const char* const name) { if (opt_dtype.has_value()) { auto dtype = opt_dtype.value(); TORCH_CHECK(isFloatingType(dtype) || isComplexType(dtype), name, ": dtype should" diff --git a/aten/src/ATen/native/TensorConversions.cpp b/aten/src/ATen/native/TensorConversions.cpp index 5e37898390a..dc0c1054d16 100644 --- a/aten/src/ATen/native/TensorConversions.cpp +++ b/aten/src/ATen/native/TensorConversions.cpp @@ -220,7 +220,7 @@ static inline Device ensure_has_index(Device device) { return impl->getDevice(); } -static inline std::optional ensure_has_index(optional device) { +static inline std::optional ensure_has_index(std::optional device) { if (!device.has_value()) { return std::nullopt; } diff --git a/aten/src/ATen/native/transformers/cuda/attention.cu b/aten/src/ATen/native/transformers/cuda/attention.cu index cf2741ccf3c..2efe7962f9b 100644 --- a/aten/src/ATen/native/transformers/cuda/attention.cu +++ b/aten/src/ATen/native/transformers/cuda/attention.cu @@ -904,7 +904,7 @@ _flash_attention_forward( std::optional out = std::nullopt; std::optional seqused_k = _seqused_k; - c10::optional block_table = std::nullopt; // we are not using the block table yet + std::optional block_table = std::nullopt; // we are not using the block table yet std::optional alibi_slopes = _alibi_slopes; const int non_null_window_left = window_size_left.has_value() ? window_size_left.value() : -1; diff --git a/aten/src/ATen/native/transformers/cuda/flash_attn/flash_api.cpp b/aten/src/ATen/native/transformers/cuda/flash_attn/flash_api.cpp index a1ccc08b690..4274ce38fb3 100644 --- a/aten/src/ATen/native/transformers/cuda/flash_attn/flash_api.cpp +++ b/aten/src/ATen/native/transformers/cuda/flash_attn/flash_api.cpp @@ -547,7 +547,7 @@ mha_varlen_fwd(const at::Tensor &q, // total_q x num_heads x head_size, total_q const at::Tensor &cu_seqlens_q, // b+1 const at::Tensor &cu_seqlens_k, // b+1 std::optional &seqused_k, // b. If given, only this many elements of each batch element's keys are used. - c10::optional &block_table_, // batch_size x max_num_blocks_per_seq + std::optional &block_table_, // batch_size x max_num_blocks_per_seq std::optional &alibi_slopes_, // num_heads or b x num_heads int max_seqlen_q, const int max_seqlen_k, diff --git a/aten/src/ATen/native/transformers/cuda/flash_attn/flash_api.h b/aten/src/ATen/native/transformers/cuda/flash_attn/flash_api.h index a6104407671..ea5f577d5a2 100644 --- a/aten/src/ATen/native/transformers/cuda/flash_attn/flash_api.h +++ b/aten/src/ATen/native/transformers/cuda/flash_attn/flash_api.h @@ -29,7 +29,7 @@ mha_varlen_fwd(const at::Tensor &q, // total_q x num_heads x head_size, total_q const at::Tensor &cu_seqlens_q, // b+1 const at::Tensor &cu_seqlens_k, // b+1 std::optional &seqused_k, // b. 
If given, only this many elements of each batch element's keys are used. - c10::optional &block_table_, // batch_size x max_num_blocks_per_seq + std::optional &block_table_, // batch_size x max_num_blocks_per_seq std::optional &alibi_slopes_, // num_heads or b x num_heads int max_seqlen_q, const int max_seqlen_k, diff --git a/c10/core/DeviceGuard.h b/c10/core/DeviceGuard.h index 8f8b7cad523..94b89bc31b7 100644 --- a/c10/core/DeviceGuard.h +++ b/c10/core/DeviceGuard.h @@ -135,7 +135,7 @@ class OptionalDeviceGuard { /// Initialize the guard if a Device is passed; otherwise leave the /// guard uninitialized. - explicit OptionalDeviceGuard(optional device) : guard_(device) {} + explicit OptionalDeviceGuard(std::optional device) : guard_(device) {} /// Constructor for testing only. explicit OptionalDeviceGuard( diff --git a/c10/core/StreamGuard.h b/c10/core/StreamGuard.h index 2f865d03161..db6dbd88cbd 100644 --- a/c10/core/StreamGuard.h +++ b/c10/core/StreamGuard.h @@ -99,7 +99,7 @@ struct OptionalStreamGuard { /// Set the current device to the device associated with the passed stream, /// and set the current stream on that device to the passed stream, /// if the passed stream is not nullopt. - explicit OptionalStreamGuard(optional stream_opt) + explicit OptionalStreamGuard(std::optional stream_opt) : guard_(stream_opt) {} /// Copy is disallowed diff --git a/c10/core/impl/InlineDeviceGuard.h b/c10/core/impl/InlineDeviceGuard.h index d27c45c0f60..e0c6d4f1ca8 100644 --- a/c10/core/impl/InlineDeviceGuard.h +++ b/c10/core/impl/InlineDeviceGuard.h @@ -223,7 +223,7 @@ class InlineOptionalDeviceGuard { {} /// Set the current device to the passed Device, if it is not nullopt. - explicit InlineOptionalDeviceGuard(optional device_opt) + explicit InlineOptionalDeviceGuard(std::optional device_opt) : guard_() { // See Note [Explicit initialization of optional fields] if (device_opt.has_value()) { guard_.emplace(device_opt.value()); @@ -235,7 +235,8 @@ class InlineOptionalDeviceGuard { typename U = T, typename = typename std::enable_if_t>> - explicit InlineOptionalDeviceGuard(optional device_index_opt) + explicit InlineOptionalDeviceGuard( + std::optional device_index_opt) : guard_() { // See Note [Explicit initialization of optional fields] if (device_index_opt.has_value()) { guard_.emplace(device_index_opt.value()); diff --git a/c10/core/impl/InlineStreamGuard.h b/c10/core/impl/InlineStreamGuard.h index 6b4596ae742..6d2b3c70678 100644 --- a/c10/core/impl/InlineStreamGuard.h +++ b/c10/core/impl/InlineStreamGuard.h @@ -139,7 +139,8 @@ class InlineOptionalStreamGuard { /// Set the current device to the device associated with the passed stream, /// and set the current stream on that device to the passed stream, /// if the passed stream is not nullopt. - explicit InlineOptionalStreamGuard(optional stream_opt) : guard_() { + explicit InlineOptionalStreamGuard(std::optional stream_opt) + : guard_() { if (stream_opt.has_value()) { guard_.emplace(stream_opt.value()); } diff --git a/c10/cuda/CUDAGuard.h b/c10/cuda/CUDAGuard.h index b3110d011aa..08b7bb71137 100644 --- a/c10/cuda/CUDAGuard.h +++ b/c10/cuda/CUDAGuard.h @@ -76,12 +76,12 @@ struct OptionalCUDAGuard { explicit OptionalCUDAGuard() : guard_() {} /// Set the current CUDA device to the passed Device, if it is not nullopt. 
- explicit OptionalCUDAGuard(optional device_opt) + explicit OptionalCUDAGuard(std::optional device_opt) : guard_(device_opt) {} /// Set the current CUDA device to the passed device index, if it is not /// nullopt - explicit OptionalCUDAGuard(optional device_index_opt) + explicit OptionalCUDAGuard(std::optional device_index_opt) : guard_(device_index_opt) {} // Copy is not allowed @@ -215,7 +215,7 @@ struct OptionalCUDAStreamGuard { /// Set the current device to the device associated with the passed stream, /// and set the current stream on that device to the passed stream, /// if the passed stream is not nullopt. - explicit OptionalCUDAStreamGuard(optional stream_opt) + explicit OptionalCUDAStreamGuard(std::optional stream_opt) : guard_(stream_opt) {} /// Copy is disallowed diff --git a/c10/util/Optional.h b/c10/util/Optional.h index 8acf294da22..1c62bc480e5 100644 --- a/c10/util/Optional.h +++ b/c10/util/Optional.h @@ -37,7 +37,7 @@ constexpr T value_or_else(const std::optional& v, F&& func) { } template -constexpr T value_or_else(optional&& v, F&& func) { +constexpr T value_or_else(std::optional&& v, F&& func) { static_assert( std::is_convertible_v, T>, "func parameters must be a callable that returns a type convertible to the value stored in the optional"); diff --git a/torch/csrc/api/include/torch/data/dataloader/base.h b/torch/csrc/api/include/torch/data/dataloader/base.h index 677136dfeb1..cb17843ba0b 100644 --- a/torch/csrc/api/include/torch/data/dataloader/base.h +++ b/torch/csrc/api/include/torch/data/dataloader/base.h @@ -121,7 +121,7 @@ class DataLoaderBase { /// The finished result of a job. struct Result : Sequenced { Result() = default; - Result(optional&& b, size_t sqn) + Result(std::optional&& b, size_t sqn) : Sequenced(sqn), batch(std::move(b)) {} Result(std::exception_ptr exception, size_t sqn) : Sequenced(sqn), exception(std::move(exception)) {} @@ -166,7 +166,7 @@ class DataLoaderBase { /// is still expected. std::optional next() { if (options_.workers > 0) { - while (optional result = this->pop_result()) { + while (std::optional result = this->pop_result()) { if (result->exception) { throw WorkerException(result->exception); } else if (result->batch) { diff --git a/torch/csrc/api/include/torch/data/dataloader_options.h b/torch/csrc/api/include/torch/data/dataloader_options.h index cab3e64a6bb..a0c96aee077 100644 --- a/torch/csrc/api/include/torch/data/dataloader_options.h +++ b/torch/csrc/api/include/torch/data/dataloader_options.h @@ -24,10 +24,10 @@ struct DataLoaderOptions { /// The maximum number of jobs to enqueue for fetching by worker threads. /// Defaults to two times the number of worker threads. - TORCH_ARG(optional, max_jobs); + TORCH_ARG(std::optional, max_jobs); /// An optional limit on the time to wait for the next batch. - TORCH_ARG(optional, timeout); + TORCH_ARG(std::optional, timeout); /// Whether to enforce ordering of batches when multiple are loaded /// asynchronously by worker threads. Set to `false` for better performance if diff --git a/torch/csrc/api/include/torch/data/datasets/base.h b/torch/csrc/api/include/torch/data/datasets/base.h index f3244642524..f17b3fe8af4 100644 --- a/torch/csrc/api/include/torch/data/datasets/base.h +++ b/torch/csrc/api/include/torch/data/datasets/base.h @@ -29,7 +29,7 @@ namespace detail { template struct is_optional : std::false_type {}; template -struct is_optional> : std::true_type {}; +struct is_optional> : std::true_type {}; } // namespace detail /// A dataset that can yield data only in batches. 
@@ -49,7 +49,8 @@ class BatchDataset { /// Returns a batch of data given an index. virtual Batch get_batch(BatchRequest request) = 0; - /// Returns the size of the dataset, or an empty optional if it is unsized. + /// Returns the size of the dataset, or an empty std::optional if it is + /// unsized. virtual std::optional size() const = 0; /// Creates a `MapDataset` that applies the given `transform` to this dataset. diff --git a/torch/csrc/api/include/torch/data/detail/queue.h b/torch/csrc/api/include/torch/data/detail/queue.h index 448bd26c04c..60236ab3f52 100644 --- a/torch/csrc/api/include/torch/data/detail/queue.h +++ b/torch/csrc/api/include/torch/data/detail/queue.h @@ -40,7 +40,7 @@ class Queue { /// the queue. An optional `timeout` in seconds can be used to limit the time /// spent waiting for an element. If the wait times out, an exception is /// raised. - T pop(optional timeout = std::nullopt) { + T pop(std::optional timeout = std::nullopt) { std::unique_lock lock(mutex_); if (timeout) { if (!cv_.wait_for( diff --git a/torch/csrc/api/include/torch/data/detail/sequencers.h b/torch/csrc/api/include/torch/data/detail/sequencers.h index 5e7679cb9b4..c59f4cd7e29 100644 --- a/torch/csrc/api/include/torch/data/detail/sequencers.h +++ b/torch/csrc/api/include/torch/data/detail/sequencers.h @@ -12,7 +12,7 @@ namespace detail { namespace sequencers { namespace detail { template -bool buffer_contains_result(const std::vector>& buffer) { +bool buffer_contains_result(const std::vector>& buffer) { return std::any_of( buffer.begin(), buffer.end(), [](const std::optional& result) { return result.has_value(); @@ -27,7 +27,7 @@ bool buffer_contains_result(const std::vector>& buffer) { /// buffers results internally to return them in order of their sequence number. template struct Sequencer { - using ResultProducer = std::function()>; + using ResultProducer = std::function()>; virtual ~Sequencer() = default; virtual std::optional next(ResultProducer next_result) = 0; }; @@ -105,7 +105,7 @@ struct OrderedSequencer : public Sequencer { size_t next_sequence_number_ = 0; /// A fixed-size buffer (after construction). - std::vector> buffer_; + std::vector> buffer_; }; } // namespace sequencers } // namespace detail diff --git a/torch/csrc/api/include/torch/data/iterator.h b/torch/csrc/api/include/torch/data/iterator.h index 8f5aaa9c8fb..94293c452d5 100644 --- a/torch/csrc/api/include/torch/data/iterator.h +++ b/torch/csrc/api/include/torch/data/iterator.h @@ -41,7 +41,7 @@ struct IteratorImpl { template struct ValidIterator : public IteratorImpl { - using BatchProducer = std::function()>; + using BatchProducer = std::function()>; explicit ValidIterator(BatchProducer next_batch) : next_batch_(std::move(next_batch)) {} diff --git a/torch/csrc/api/include/torch/data/samplers/base.h b/torch/csrc/api/include/torch/data/samplers/base.h index c8e94264e7c..8ab48d9d593 100644 --- a/torch/csrc/api/include/torch/data/samplers/base.h +++ b/torch/csrc/api/include/torch/data/samplers/base.h @@ -29,7 +29,7 @@ class Sampler { /// Resets the `Sampler`'s internal state. /// Typically called before a new epoch. /// Optionally, accepts a new size when reseting the sampler. - virtual void reset(optional new_size) = 0; + virtual void reset(std::optional new_size) = 0; /// Returns the next index if possible, or an empty optional if the /// sampler is exhausted for this epoch. 
diff --git a/torch/csrc/api/include/torch/data/samplers/distributed.h b/torch/csrc/api/include/torch/data/samplers/distributed.h index dd2c2241388..bce36aaa4df 100644 --- a/torch/csrc/api/include/torch/data/samplers/distributed.h +++ b/torch/csrc/api/include/torch/data/samplers/distributed.h @@ -78,7 +78,7 @@ class TORCH_API DistributedRandomSampler : public DistributedSampler<> { bool allow_duplicates = true); /// Resets the `DistributedRandomSampler` to a new set of indices. - void reset(optional new_size = std::nullopt) override; + void reset(std::optional new_size = std::nullopt) override; /// Returns the next batch of indices. std::optional> next(size_t batch_size) override; @@ -111,7 +111,7 @@ class TORCH_API DistributedSequentialSampler : public DistributedSampler<> { bool allow_duplicates = true); /// Resets the `DistributedSequentialSampler` to a new set of indices. - void reset(optional new_size = std::nullopt) override; + void reset(std::optional new_size = std::nullopt) override; /// Returns the next batch of indices. std::optional> next(size_t batch_size) override; diff --git a/torch/csrc/api/include/torch/data/samplers/random.h b/torch/csrc/api/include/torch/data/samplers/random.h index 2a6a473798c..4b023b6c703 100644 --- a/torch/csrc/api/include/torch/data/samplers/random.h +++ b/torch/csrc/api/include/torch/data/samplers/random.h @@ -31,7 +31,7 @@ class TORCH_API RandomSampler : public Sampler<> { ~RandomSampler() override; /// Resets the `RandomSampler` to a new set of indices. - void reset(optional new_size = std::nullopt) override; + void reset(std::optional new_size = std::nullopt) override; /// Returns the next batch of indices. std::optional> next(size_t batch_size) override; diff --git a/torch/csrc/api/include/torch/data/samplers/sequential.h b/torch/csrc/api/include/torch/data/samplers/sequential.h index b27ab268b0b..252ecc3ad3d 100644 --- a/torch/csrc/api/include/torch/data/samplers/sequential.h +++ b/torch/csrc/api/include/torch/data/samplers/sequential.h @@ -26,7 +26,7 @@ class TORCH_API SequentialSampler : public Sampler<> { explicit SequentialSampler(size_t size); /// Resets the `SequentialSampler` to zero. - void reset(optional new_size = std::nullopt) override; + void reset(std::optional new_size = std::nullopt) override; /// Returns the next batch of indices. std::optional> next(size_t batch_size) override; diff --git a/torch/csrc/api/include/torch/data/samplers/stream.h b/torch/csrc/api/include/torch/data/samplers/stream.h index d534681e112..201c914e49e 100644 --- a/torch/csrc/api/include/torch/data/samplers/stream.h +++ b/torch/csrc/api/include/torch/data/samplers/stream.h @@ -39,7 +39,7 @@ class TORCH_API StreamSampler : public Sampler { explicit StreamSampler(size_t epoch_size); /// Resets the internal state of the sampler. - void reset(optional new_size = std::nullopt) override; + void reset(std::optional new_size = std::nullopt) override; /// Returns a `BatchSize` object with the number of elements to fetch in the /// next batch. This number is the minimum of the supplied `batch_size` and diff --git a/torch/csrc/api/include/torch/nn/modules/container/any.h b/torch/csrc/api/include/torch/nn/modules/container/any.h index 4e8fc760ac7..ab4a589aede 100644 --- a/torch/csrc/api/include/torch/nn/modules/container/any.h +++ b/torch/csrc/api/include/torch/nn/modules/container/any.h @@ -136,7 +136,7 @@ class AnyModule { /// Creates a deep copy of an `AnyModule` if it contains a module, else an /// empty `AnyModule` if it is empty. 
- AnyModule clone(optional device = std::nullopt) const; + AnyModule clone(std::optional device = std::nullopt) const; /// Assigns a module to the `AnyModule` (to circumvent the explicit /// constructor). @@ -253,7 +253,7 @@ inline AnyModule& AnyModule::operator=(const AnyModule& other) { return *this; } -inline AnyModule AnyModule::clone(optional device) const { +inline AnyModule AnyModule::clone(std::optional device) const { AnyModule clone; clone.content_ = content_ ? content_->clone_module(device) : nullptr; return clone; diff --git a/torch/csrc/api/src/data/datasets/mnist.cpp b/torch/csrc/api/src/data/datasets/mnist.cpp index f416009c618..ff9f5c351e8 100644 --- a/torch/csrc/api/src/data/datasets/mnist.cpp +++ b/torch/csrc/api/src/data/datasets/mnist.cpp @@ -8,7 +8,6 @@ #include #include #include -#include namespace torch { namespace data { @@ -105,7 +104,7 @@ Example<> MNIST::get(size_t index) { return {images_[index], targets_[index]}; } -optional MNIST::size() const { +std::optional MNIST::size() const { return images_.size(0); } diff --git a/torch/csrc/api/src/data/samplers/distributed.cpp b/torch/csrc/api/src/data/samplers/distributed.cpp index 4bcd938a0c1..eaae80bf069 100644 --- a/torch/csrc/api/src/data/samplers/distributed.cpp +++ b/torch/csrc/api/src/data/samplers/distributed.cpp @@ -22,11 +22,10 @@ DistributedRandomSampler::DistributedRandomSampler( end_index_(0), sample_index_(0) { // shuffle first time. - // NOLINTNEXTLINE(clang-analyzer-optin.cplusplus.VirtualCall) reset(size_); } -optional> DistributedRandomSampler::next( +std::optional> DistributedRandomSampler::next( size_t batch_size) { if (sample_index_ == end_index_) { return nullopt; @@ -43,7 +42,7 @@ optional> DistributedRandomSampler::next( return res; } -void DistributedRandomSampler::reset(optional new_size) { +void DistributedRandomSampler::reset(std::optional new_size) { size_ = new_size.value_or(size_); populate_indices(); @@ -107,7 +106,7 @@ DistributedSequentialSampler::DistributedSequentialSampler( populate_indices(); } -optional> DistributedSequentialSampler::next( +std::optional> DistributedSequentialSampler::next( size_t batch_size) { if (sample_index_ == end_index_) { return nullopt; @@ -129,7 +128,7 @@ optional> DistributedSequentialSampler::next( return res; } -void DistributedSequentialSampler::reset(optional new_size) { +void DistributedSequentialSampler::reset(std::optional new_size) { size_t size = new_size.value_or(size_); if (size != size_) { size_ = size; diff --git a/torch/csrc/api/src/data/samplers/random.cpp b/torch/csrc/api/src/data/samplers/random.cpp index fdb1fc03828..10c478aa38d 100644 --- a/torch/csrc/api/src/data/samplers/random.cpp +++ b/torch/csrc/api/src/data/samplers/random.cpp @@ -14,7 +14,7 @@ RandomSampler::RandomSampler(int64_t size, Dtype index_dtype) RandomSampler::~RandomSampler() = default; -void RandomSampler::reset(optional new_size) { +void RandomSampler::reset(std::optional new_size) { // This allocates a new chunk of memory every time (just FYI). It should be // amortized over the entire epoch hopefully. 
const auto size = new_size.value_or(static_cast(indices_.numel())); diff --git a/torch/csrc/api/src/data/samplers/sequential.cpp b/torch/csrc/api/src/data/samplers/sequential.cpp index 9c294cb106f..64cf0f5e0a6 100644 --- a/torch/csrc/api/src/data/samplers/sequential.cpp +++ b/torch/csrc/api/src/data/samplers/sequential.cpp @@ -11,14 +11,14 @@ namespace data { namespace samplers { SequentialSampler::SequentialSampler(size_t size) : size_(size) {} -void SequentialSampler::reset(optional new_size) { +void SequentialSampler::reset(std::optional new_size) { if (new_size.has_value()) { size_ = *new_size; } index_ = 0; } -optional> SequentialSampler::next(size_t batch_size) { +std::optional> SequentialSampler::next(size_t batch_size) { const auto remaining_indices = size_ - index_; if (remaining_indices == 0) { return nullopt; diff --git a/torch/csrc/api/src/data/samplers/stream.cpp b/torch/csrc/api/src/data/samplers/stream.cpp index 6972846783a..bce63f13eae 100644 --- a/torch/csrc/api/src/data/samplers/stream.cpp +++ b/torch/csrc/api/src/data/samplers/stream.cpp @@ -20,14 +20,14 @@ BatchSize::operator size_t() const noexcept { StreamSampler::StreamSampler(size_t epoch_size) : epoch_size_(epoch_size) {} -void StreamSampler::reset(optional new_size) { +void StreamSampler::reset(std::optional new_size) { if (new_size.has_value()) { epoch_size_ = *new_size; } examples_retrieved_so_far_ = 0; } -optional StreamSampler::next(size_t batch_size) { +std::optional StreamSampler::next(size_t batch_size) { AT_ASSERT(examples_retrieved_so_far_ <= epoch_size_); if (examples_retrieved_so_far_ == epoch_size_) { return nullopt; diff --git a/torch/csrc/autograd/python_engine.cpp b/torch/csrc/autograd/python_engine.cpp index 38646701ebd..5178c4b6109 100644 --- a/torch/csrc/autograd/python_engine.cpp +++ b/torch/csrc/autograd/python_engine.cpp @@ -258,7 +258,7 @@ PyObject* THPEngine_run_backward( for (const auto i : c10::irange(num_tensors)) { PyObject* _tensor = PyTuple_GET_ITEM(tensors, i); Edge gradient_edge; // Temporary variable to hold the gradient edge - c10::optional mb_output; + std::optional mb_output; if (THPVariable_Check(_tensor)) { mb_output = THPVariable_Unpack(_tensor); TORCH_CHECK( diff --git a/torch/csrc/distributed/c10d/Store.hpp b/torch/csrc/distributed/c10d/Store.hpp index 567315f77ef..d18de830ff7 100644 --- a/torch/csrc/distributed/c10d/Store.hpp +++ b/torch/csrc/distributed/c10d/Store.hpp @@ -10,7 +10,7 @@ namespace c10d { -// callback function will be given arguments (optional oldValue, +// callback function will be given arguments (std::optional oldValue, // std::optional newValue) using WatchKeyCallback = std::function, std::optional)>; diff --git a/torch/csrc/inductor/aoti_eager/kernel_holder.cpp b/torch/csrc/inductor/aoti_eager/kernel_holder.cpp index adca78ea7a8..db891c7b5f1 100644 --- a/torch/csrc/inductor/aoti_eager/kernel_holder.cpp +++ b/torch/csrc/inductor/aoti_eager/kernel_holder.cpp @@ -75,7 +75,7 @@ std::vector unpack_tensors( unpack_optional_tensor_list_ivalue(ivalue, device, inputs); } else if ( *ivalue_arg.real_type() == - *c10::getTypePtr>()) { + *c10::getTypePtr>()) { // ivalue is c10::optional unpack_optional_tensor_ivalue(ivalue, device, inputs); } diff --git a/torch/csrc/jit/mobile/train/random.cpp b/torch/csrc/jit/mobile/train/random.cpp index b11d8e9dee9..029db563b71 100644 --- a/torch/csrc/jit/mobile/train/random.cpp +++ b/torch/csrc/jit/mobile/train/random.cpp @@ -14,7 +14,7 @@ RandomSampler::RandomSampler(int64_t size, Dtype index_dtype) 
RandomSampler::~RandomSampler() = default; -void RandomSampler::reset(optional new_size) { +void RandomSampler::reset(std::optional new_size) { // This allocates a new chunk of memory every time (just FYI). It should be // amortized over the entire epoch hopefully. const auto size = new_size.value_or(static_cast(indices_.numel())); @@ -22,7 +22,7 @@ void RandomSampler::reset(optional new_size) { index_ = 0; } -optional> RandomSampler::next(size_t batch_size) { +std::optional> RandomSampler::next(size_t batch_size) { AT_ASSERT(index_ <= indices_.numel()); const size_t remaining_indices = indices_.numel() - index_; if (remaining_indices == 0) { diff --git a/torch/csrc/jit/mobile/train/random.h b/torch/csrc/jit/mobile/train/random.h index de256afdffc..f2cb38ee0bc 100644 --- a/torch/csrc/jit/mobile/train/random.h +++ b/torch/csrc/jit/mobile/train/random.h @@ -32,7 +32,7 @@ class TORCH_API RandomSampler : public torch::data::samplers::Sampler<> { ~RandomSampler() override; /// Resets the `RandomSampler` to a new set of indices. - void reset(optional new_size = std::nullopt) override; + void reset(std::optional new_size = std::nullopt) override; /// Returns the next batch of indices. std::optional> next(size_t batch_size) override; diff --git a/torch/csrc/jit/mobile/train/sequential.cpp b/torch/csrc/jit/mobile/train/sequential.cpp index f1d854e8095..6292d7f138b 100644 --- a/torch/csrc/jit/mobile/train/sequential.cpp +++ b/torch/csrc/jit/mobile/train/sequential.cpp @@ -10,7 +10,7 @@ namespace jit { namespace mobile { SequentialSampler::SequentialSampler(size_t size) : size_(size) {} -void SequentialSampler::reset(optional new_size) { +void SequentialSampler::reset(std::optional new_size) { if (new_size.has_value()) { size_ = *new_size; } diff --git a/torch/csrc/jit/mobile/train/sequential.h b/torch/csrc/jit/mobile/train/sequential.h index 9264dd4c038..59450c9e35f 100644 --- a/torch/csrc/jit/mobile/train/sequential.h +++ b/torch/csrc/jit/mobile/train/sequential.h @@ -27,7 +27,7 @@ class TORCH_API SequentialSampler : public torch::data::samplers::Sampler<> { explicit SequentialSampler(size_t size); /// Resets the `SequentialSampler` to zero. - void reset(optional new_size = std::nullopt) override; + void reset(std::optional new_size = std::nullopt) override; /// Returns the next batch of indices. 
std::optional> next(size_t batch_size) override; diff --git a/torch/csrc/profiler/unwind/debug_info.h b/torch/csrc/profiler/unwind/debug_info.h index 38c613f4f8e..067d7dc2e83 100644 --- a/torch/csrc/profiler/unwind/debug_info.h +++ b/torch/csrc/profiler/unwind/debug_info.h @@ -16,7 +16,7 @@ struct DebugInfo { auto L = parseHeader(offset); parseCompileUnit(L); } - unwind::optional lineNumberProgramOffset() { + std::optional lineNumberProgramOffset() { return line_number_program_offset_; } uint64_t nextOffset() { diff --git a/torch/csrc/profiler/unwind/dwarf_symbolize_enums.h b/torch/csrc/profiler/unwind/dwarf_symbolize_enums.h index 2c229823027..3fa56b26d55 100644 --- a/torch/csrc/profiler/unwind/dwarf_symbolize_enums.h +++ b/torch/csrc/profiler/unwind/dwarf_symbolize_enums.h @@ -88,9 +88,7 @@ enum { DW_RLE_start_length = 0x7 }; -static torch::unwind::optional formSize( - uint64_t form, - uint8_t sec_offset_size) { +static std::optional formSize(uint64_t form, uint8_t sec_offset_size) { switch (form) { case DW_FORM_addr: return sizeof(void*); diff --git a/torch/csrc/profiler/unwind/line_number_program.h b/torch/csrc/profiler/unwind/line_number_program.h index 0b16fa0c12c..ebebbf1d041 100644 --- a/torch/csrc/profiler/unwind/line_number_program.h +++ b/torch/csrc/profiler/unwind/line_number_program.h @@ -147,7 +147,7 @@ struct LineNumberProgram { uint32_t file = 1; int64_t line = 1; }; - unwind::optional find(uint64_t address) { + std::optional find(uint64_t address) { auto e = program_index_.find(address); if (!e) { return std::nullopt; diff --git a/torch/csrc/profiler/unwind/range_table.h b/torch/csrc/profiler/unwind/range_table.h index 08cb4f492fb..b8c405ddad6 100644 --- a/torch/csrc/profiler/unwind/range_table.h +++ b/torch/csrc/profiler/unwind/range_table.h @@ -3,7 +3,6 @@ #include #include #include -#include #include namespace torch::unwind { @@ -14,7 +13,7 @@ struct RangeTable { addresses_.push_back(0); payloads_.emplace_back(std::nullopt); } - void add(uint64_t address, unwind::optional payload, bool sorted) { + void add(uint64_t address, std::optional payload, bool sorted) { if (addresses_.back() > address) { UNWIND_CHECK(!sorted, "expected addresses to be sorted"); sorted_ = false; @@ -22,7 +21,7 @@ struct RangeTable { addresses_.push_back(address); payloads_.emplace_back(std::move(payload)); } - unwind::optional find(uint64_t address) { + std::optional find(uint64_t address) { maybeSort(); auto it = std::upper_bound(addresses_.begin(), addresses_.end(), address); return payloads_.at(it - addresses_.begin() - 1); @@ -56,7 +55,7 @@ struct RangeTable { bool(payloads_[a]) < bool(payloads_[b])); }); std::vector addresses; - std::vector> payloads; + std::vector> payloads; addresses.reserve(addresses_.size()); payloads.reserve(addresses_.size()); for (auto i : indices) { @@ -69,6 +68,6 @@ struct RangeTable { } bool sorted_ = true; std::vector addresses_; - std::vector> payloads_; + std::vector> payloads_; }; } // namespace torch::unwind diff --git a/torch/csrc/profiler/unwind/sections.h b/torch/csrc/profiler/unwind/sections.h index 2e292b9000b..d9f16b9164d 100644 --- a/torch/csrc/profiler/unwind/sections.h +++ b/torch/csrc/profiler/unwind/sections.h @@ -77,7 +77,7 @@ struct Sections { return is_64bit ? 
data.read() : data.read(); } - unwind::optional findDebugInfoOffset(uint64_t address) { + std::optional findDebugInfoOffset(uint64_t address) { return debug_info_offsets_.find(address); } size_t compilationUnitCount() { diff --git a/torch/csrc/profiler/unwind/unwind_error.h b/torch/csrc/profiler/unwind/unwind_error.h index cca8f8d1218..64812161648 100644 --- a/torch/csrc/profiler/unwind/unwind_error.h +++ b/torch/csrc/profiler/unwind/unwind_error.h @@ -26,6 +26,4 @@ struct UnwindError : public std::runtime_error { // #define PRINT_LINE_TABLE(...) LOG_INFO(__VA_ARGS__) #define PRINT_LINE_TABLE(...) -using std::optional; // NOLINT - } // namespace torch::unwind diff --git a/torchgen/executorch/api/unboxing.py b/torchgen/executorch/api/unboxing.py index 86e19cd6320..f206980af44 100644 --- a/torchgen/executorch/api/unboxing.py +++ b/torchgen/executorch/api/unboxing.py @@ -126,7 +126,7 @@ class Unboxing: ) return ( f""" - {ctype.cpp_type(strip_ref=True)} {out_name} = {arg_name}.toOptional<{base_type.cpp_type(strip_ref=True)}>(); + auto {out_name} = {arg_name}.toOptional<{base_type.cpp_type(strip_ref=True)}>(); """.split( "\n" ), @@ -146,7 +146,7 @@ class Unboxing: if isinstance(t.elem, BaseType) and t.elem.name == BaseTy.Tensor: code.extend( f""" - {ctype.cpp_type(strip_ref=True)} {out_name} = {arg_name}.toTensorList(); + auto {out_name} = {arg_name}.toTensorList(); """.split( "\n" ) @@ -156,7 +156,7 @@ class Unboxing: ): code.extend( f""" - {ctype.cpp_type(strip_ref=True)} {out_name} = {arg_name}.toIntList(); + auto {out_name} = {arg_name}.toIntList(); """.split( "\n" ) @@ -164,7 +164,7 @@ class Unboxing: elif isinstance(t.elem, BaseType) and t.elem.name == BaseTy.float: code.extend( f""" - {ctype.cpp_type(strip_ref=True)} {out_name} = {arg_name}.toDoubleList(); + auto {out_name} = {arg_name}.toDoubleList(); """.split( "\n" ) @@ -173,7 +173,7 @@ class Unboxing: # handle list type with size, e.g., bool[4] code.extend( f""" - {ctype.cpp_type(strip_ref=True)} {out_name} = {arg_name}.toBoolList(); + auto {out_name} = {arg_name}.toBoolList(); """.split( "\n" ) @@ -194,7 +194,7 @@ for (auto {elem_name}: {in_name}) {{ {out_name}.push_back({elem_name}); }} #else -torch::executor::ArrayRef> {out_name} = {arg_name}.toListOptionalTensor(); +auto {out_name} = {arg_name}.toListOptionalTensor(); #endif """.split( "\n" diff --git a/torchgen/gen_vmap_plumbing.py b/torchgen/gen_vmap_plumbing.py index dcefd2c268a..913be778461 100644 --- a/torchgen/gen_vmap_plumbing.py +++ b/torchgen/gen_vmap_plumbing.py @@ -42,8 +42,8 @@ def unwrap_tensor(name: str, cur_level_var: str) -> list[str]: def unwrap_optional_tensor(name: str, cur_level_var: str) -> list[str]: result = f"""\ - optional {name}_value; - optional {name}_bdim; + std::optional {name}_value; + std::optional {name}_bdim; if ({name}) {{ std::tie({name}_value, {name}_bdim) = unwrapTensorAtLevel({name}.value(), {cur_level_var}); }}"""
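
[Illustration, not part of the patch] As a minimal, self-contained sketch of the std::optional idiom the sampler signatures above converge on, the hypothetical ToySampler below mirrors a reset(std::optional<size_t>) method defaulting to std::nullopt and a next() that returns std::optional<std::vector<size_t>> until the sampler is exhausted. The class, its members, and main() are assumptions made for this example only; none of it is code from the PyTorch files in this patch.

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <optional>
#include <vector>

// Hypothetical stand-in for the sampler classes touched above.
class ToySampler {
 public:
  explicit ToySampler(size_t size) : size_(size) {}

  // Reset the sampler; std::nullopt keeps the current size.
  void reset(std::optional<size_t> new_size = std::nullopt) {
    size_ = new_size.value_or(size_);
    index_ = 0;
  }

  // Return the next batch of indices, or std::nullopt once exhausted.
  std::optional<std::vector<size_t>> next(size_t batch_size) {
    if (index_ >= size_) {
      return std::nullopt;
    }
    const size_t end = std::min(size_, index_ + batch_size);
    std::vector<size_t> batch;
    for (size_t i = index_; i < end; ++i) {
      batch.push_back(i);
    }
    index_ = end;
    return batch;
  }

 private:
  size_t size_;
  size_t index_ = 0;
};

int main() {
  ToySampler sampler(5);
  while (auto batch = sampler.next(2)) {
    std::cout << "batch of " << batch->size() << " indices\n";
  }
  sampler.reset(8);  // explicit new size; plain reset() would keep 5
  return 0;
}

Passing std::nullopt (or nothing) to reset() leaves the size unchanged, which is the same treatment of an absent value, via value_or, that the reset(std::optional<size_t> new_size = std::nullopt) overrides in the patch use.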