Remove -Wno-unused-variable from utils.cmake (take 2) (#75538)
Summary:
[Comment](https://github.com/pytorch/pytorch/pull/62445/files#r680132022) claims it was added for consistency with the top-level CMakeLists.txt, but `-Wno-unused-variable` is not mentioned there. Fix the violations added in the interim across 50+ files by either removing the unused variable or decorating it with `C10_UNUSED` when the local variable is likely kept to extend an object's lifetime until the end of the block. Suppressing the warning caused a preventable revert in https://github.com/pytorch/pytorch/pull/72633#issuecomment-1092300787

Pull Request resolved: https://github.com/pytorch/pytorch/pull/75538

Reviewed By: anjali411

Differential Revision: D35747333

Pulled By: malfet

fbshipit-source-id: 3fc5828e44a4c05ba0e89e92613e6ebbdb260626
(cherry picked from commit c179fba21cfa2a0093fad50ccad5a22dd7cff52c)
This commit is contained in:
parent
29b004be7a
commit
f6c275f55d
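The hunks below all apply one of three fixes once the `-Wno-unused-variable` suppression is gone: delete a local that is never read, keep a variable whose initializer has a needed side effect (registration, lifetime extension) and mark it `C10_UNUSED`, or discard a macro-internal value with a `(void)` cast. A minimal, hypothetical sketch of the pattern, assuming the c10 headers are on the include path; `register_my_ops`, `my_ops_registered`, and `example` are illustrative names, not taken from the PyTorch sources:

#include <c10/macros/Macros.h> // C10_UNUSED
#include <c10/util/irange.h>   // c10::irange

namespace {

// Stand-in for a registration routine whose only purpose is its side effect.
int register_my_ops() {
  return 0;
}

// Keep the variable so registration runs at static-initialization time, and
// mark it C10_UNUSED because it is never read afterwards.
C10_UNUSED static auto my_ops_registered = register_my_ops();

void example(int n) {
  // The induction variable only counts iterations, so it is annotated
  // instead of re-suppressing the warning globally.
  for (C10_UNUSED const auto i : c10::irange(n)) {
  }
  // The macro-style fix: evaluate a value and discard it explicitly so the
  // compiler knows the missing read is intentional.
  const int scratch = n * 2;
  (void)scratch;
}

} // namespace

int main() {
  example(3);
  return 0;
}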
@@ -372,7 +372,7 @@ ivalue::TupleTypeFactory<TupleType>::fallback(const Type& type) {
   for (const auto& elem : dyn.arguments().elems) {
     types.emplace_back(elem.ty);
     if (const auto& name = elem.label) {
-      fields.emplace_back(*elem.label);
+      fields.emplace_back(*name);
     }
   }
   if (const auto& name = dyn.name()) {
@@ -1833,7 +1833,7 @@ DEFINE_QUANTIZED_RNN_CELL_DYNAMIC(quantized_rnn_tanh_cell_dynamic, simple_hx_typ
 
 namespace {
 
-static auto ensure_linear_params_registered = register_linear_params();
+static C10_UNUSED auto ensure_linear_params_registered = register_linear_params();
 
 static auto cell_params_base_registry =
     torch::selective_class_<CellParamsBase>("rnn", TORCH_SELECTIVE_CLASS("CellParamsBase"))

@@ -71,7 +71,7 @@ int register_linear_params() {
 }
 
 namespace {
-static auto linear_params = register_linear_params();
+static C10_UNUSED auto linear_params = register_linear_params();
 } // namespace
 
 }} // namespace ao::sparse
@@ -554,9 +554,9 @@ int register_embedding_params() {
 
 namespace {
 
-static auto conv2d_params = register_conv_params<2>();
-static auto conv3d_params = register_conv_params<3>();
-static auto linear_params = register_linear_params();
-static auto embedding_params = register_embedding_params();
+static C10_UNUSED auto conv2d_params = register_conv_params<2>();
+static C10_UNUSED auto conv3d_params = register_conv_params<3>();
+static C10_UNUSED auto linear_params = register_linear_params();
+static C10_UNUSED auto embedding_params = register_embedding_params();
 
 } // namespace
@@ -187,7 +187,6 @@ Descriptor::Set dispatch_prologue(
     const Shader::Descriptor& shader_descriptor,
     const Shader::WorkGroup& local_work_group_size) {
   Context* const context = api::context();
-  const GPU gpu = context->gpu();
   Descriptor& descriptor = context->descriptor();
   Pipeline& pipeline = context->pipeline();
   Shader& shader = context->shader();

@@ -199,7 +199,6 @@ Tensor cat_height(const TensorList tensors, vTensor& v_output) {
 Tensor cat(
     const at::TensorList tensors,
     const int64_t dim) {
-  const auto norm_dim = normalize_dim(dim, 4);
   TORCH_CHECK(
       tensors.size() > 0,
       "Vulkan cat expects at least one tensor");
@@ -348,6 +348,7 @@ _ScopeGuard<T> ScopeGuard(T f) {
         stats.field.groupName.c_str(), \
         __caffe_event_value_, \
         ##__VA_ARGS__); \
+    (void)__caffe_event_value_; \
   }
 
 #define CAFFE_DURATION(stats, field, ...) \
@@ -286,9 +286,6 @@ NO_GRADIENT(BooleanMaskLengths);
 
 } // namespace
 
-// NOLINTNEXTLINE(clang-diagnostic-unused-const-variable)
-const float minf = -1.0f * std::numeric_limits<float>::infinity();
-
 // Template this on a functor object so we can generate different
 // implementations at compile time and have a better chance of inlining
 template <typename Functor>
@@ -155,7 +155,7 @@ bool DeformConvOp<T, Context>::RunOnDeviceWithOrderNCHW() {
   col_buffer->Resize(buffer_shape);
   T* col_buffer_data = col_buffer->template mutable_data<T>();
   // Im2col, followed by gemm.
-  for (const auto image_id : c10::irange(N)) {
+  for (C10_UNUSED const auto image_id : c10::irange(N)) {
     for (const auto group_id : c10::irange(group_)) {
       DeformableIm2col(
           Xdata + group_id * input_offset,

@@ -342,7 +342,7 @@ bool DeformConvGradientOp<T, Context>::RunOnDeviceWithOrderNCHW() {
     math::Set<T, Context>(dX->numel(), 0, dXdata, &context_);
   }
 
-  for (const auto image_id : c10::irange(N)) {
+  for (C10_UNUSED const auto image_id : c10::irange(N)) {
     for (const auto group_id : c10::irange(group_)) {
       math::Gemm<T, Context>(
          CblasTrans,
@@ -62,7 +62,7 @@ class PiecewiseLinearTransformOp final : public Operator<Context> {
       const int64_t num_bounds_per_group,
       const int64_t num_group) {
     const T* start = bounds;
-    for (const auto i : c10::irange(num_group)) {
+    for (C10_UNUSED const auto i : c10::irange(num_group)) {
       if (!std::is_sorted(start, start + num_bounds_per_group)) {
         return false;
       }

@@ -36,7 +36,7 @@ void Decode(
   }
 
   int sz = output->numel();
-  for (const auto i : c10::irange(sz)) {
+  for (C10_UNUSED const auto i : c10::irange(sz)) {
     DCHECK_LE(*code_ptr, cb_size);
     *out_ptr++ = cb_ptr[*code_ptr++];
   }
@@ -229,8 +229,8 @@ void ROIAlignForward(
       for (const auto pw : c10::irange(pooled_width)) {
         vector<int32_t> acc_buffer(channels, 0);
 
-        for (const auto iy : c10::irange(roi_bin_grid_h)) {
-          for (const auto ix : c10::irange(roi_bin_grid_w)) {
+        for (C10_UNUSED const auto iy : c10::irange(roi_bin_grid_h)) {
+          for (C10_UNUSED const auto ix : c10::irange(roi_bin_grid_w)) {
             PreCalc pc = pre_calc[pre_calc_index];
 
             const uint8_t* data_1 = offset_bottom_data + channels * pc.pos1;

@@ -216,7 +216,7 @@ static void Im2ColNHWC(
     T* data_col_temp =
         data_col + h * width_col * kernel_h * kernel_w * channels;
     int w_pad = -pad_l;
-    for (const auto w : c10::irange(width_col)) {
+    for (C10_UNUSED const auto w : c10::irange(width_col)) {
       int r = 0;
       for (int ih = h_pad; ih < h_pad + dkernel_h; ih += dilation_h, ++r) {
         int s = 0;
@@ -18,16 +18,11 @@
 namespace caffe2 {
 
 // Constants for user tracepoints
-// NOLINTNEXTLINE(clang-diagnostic-unused-const-variable)
-static constexpr int SDT_NONBLOCKING_OP = 0;
-// NOLINTNEXTLINE(clang-diagnostic-unused-const-variable)
-static constexpr int SDT_BLOCKING_OP = 1;
-// NOLINTNEXTLINE(clang-diagnostic-unused-const-variable)
-static constexpr uint64_t SDT_TIMEOUT = (uint64_t)-1;
-// NOLINTNEXTLINE(clang-diagnostic-unused-const-variable)
-static constexpr uint64_t SDT_ABORT = (uint64_t)-2;
-// NOLINTNEXTLINE(clang-diagnostic-unused-const-variable)
-static constexpr uint64_t SDT_CANCEL = (uint64_t)-3;
+C10_UNUSED static constexpr int SDT_NONBLOCKING_OP = 0;
+C10_UNUSED static constexpr int SDT_BLOCKING_OP = 1;
+C10_UNUSED static constexpr uint64_t SDT_TIMEOUT = (uint64_t)-1;
+C10_UNUSED static constexpr uint64_t SDT_ABORT = (uint64_t)-2;
+C10_UNUSED static constexpr uint64_t SDT_CANCEL = (uint64_t)-3;
 
 BlobsQueue::BlobsQueue(
     Workspace* ws,
@@ -66,8 +61,7 @@ bool BlobsQueue::blockingRead(
     float timeout_secs) {
   Timer readTimer;
   auto keeper = this->shared_from_this();
-  // NOLINTNEXTLINE(clang-diagnostic-unused-variable)
-  const auto& name = name_.c_str();
+  C10_UNUSED const auto& name = name_.c_str();
   CAFFE_SDT(queue_read_start, name, (void*)this, SDT_BLOCKING_OP);
   std::unique_lock<std::mutex> g(mutex_);
   auto canRead = [this]() {

@@ -76,7 +70,6 @@ bool BlobsQueue::blockingRead(
   };
   // Decrease queue balance before reading to indicate queue read pressure
   // is being increased (-ve queue balance indicates more reads than writes)
-  // NOLINTNEXTLINE(clang-diagnostic-unused-variable)
   CAFFE_EVENT(stats_, queue_balance, -1);
   if (timeout_secs > 0) {
     std::chrono::milliseconds timeout_ms(int(timeout_secs * 1000));

@@ -99,17 +92,14 @@ bool BlobsQueue::blockingRead(
   CAFFE_ENFORCE(inputs.size() >= result.size());
   for (const auto i : c10::irange(result.size())) {
     auto bytes = BlobStat::sizeBytes(*result[i]);
-    // NOLINTNEXTLINE(clang-diagnostic-unused-variable)
     CAFFE_EVENT(stats_, queue_dequeued_bytes, bytes, i);
     using std::swap;
     swap(*(inputs[i]), *(result[i]));
   }
   CAFFE_SDT(queue_read_end, name, (void*)this, writer_ - reader_);
-  // NOLINTNEXTLINE(clang-diagnostic-unused-variable)
   CAFFE_EVENT(stats_, queue_dequeued_records);
   ++reader_;
   cv_.notify_all();
-  // NOLINTNEXTLINE(clang-diagnostic-unused-variable)
   CAFFE_EVENT(stats_, read_time_ns, readTimer.NanoSeconds());
   return true;
 }
@@ -117,8 +107,7 @@ bool BlobsQueue::blockingRead(
 bool BlobsQueue::tryWrite(const std::vector<Blob*>& inputs) {
   Timer writeTimer;
   auto keeper = this->shared_from_this();
-  // NOLINTNEXTLINE(clang-diagnostic-unused-variable)
-  const auto& name = name_.c_str();
+  C10_UNUSED const auto& name = name_.c_str();
   CAFFE_SDT(queue_write_start, name, (void*)this, SDT_NONBLOCKING_OP);
   std::unique_lock<std::mutex> g(mutex_);
   if (!canWrite()) {

@@ -127,11 +116,9 @@ bool BlobsQueue::tryWrite(const std::vector<Blob*>& inputs) {
   }
   // Increase queue balance before writing to indicate queue write pressure is
   // being increased (+ve queue balance indicates more writes than reads)
-  // NOLINTNEXTLINE(clang-diagnostic-unused-variable)
   CAFFE_EVENT(stats_, queue_balance, 1);
   DCHECK(canWrite());
   doWrite(inputs);
-  // NOLINTNEXTLINE(clang-diagnostic-unused-variable)
   CAFFE_EVENT(stats_, write_time_ns, writeTimer.NanoSeconds());
   return true;
 }
@@ -139,13 +126,11 @@ bool BlobsQueue::tryWrite(const std::vector<Blob*>& inputs) {
 bool BlobsQueue::blockingWrite(const std::vector<Blob*>& inputs) {
   Timer writeTimer;
   auto keeper = this->shared_from_this();
-  // NOLINTNEXTLINE(clang-diagnostic-unused-variable)
-  const auto& name = name_.c_str();
+  C10_UNUSED const auto& name = name_.c_str();
   CAFFE_SDT(queue_write_start, name, (void*)this, SDT_BLOCKING_OP);
   std::unique_lock<std::mutex> g(mutex_);
   // Increase queue balance before writing to indicate queue write pressure is
   // being increased (+ve queue balance indicates more writes than reads)
-  // NOLINTNEXTLINE(clang-diagnostic-unused-variable)
   CAFFE_EVENT(stats_, queue_balance, 1);
   cv_.wait(g, [this]() { return closing_ || canWrite(); });
   if (!canWrite()) {

@@ -154,7 +139,6 @@ bool BlobsQueue::blockingWrite(const std::vector<Blob*>& inputs) {
   }
   DCHECK(canWrite());
   doWrite(inputs);
-  // NOLINTNEXTLINE(clang-diagnostic-unused-variable)
   CAFFE_EVENT(stats_, write_time_ns, writeTimer.NanoSeconds());
   return true;
 }
@@ -178,8 +162,7 @@ bool BlobsQueue::canWrite() {
 void BlobsQueue::doWrite(const std::vector<Blob*>& inputs) {
   auto& result = queue_[writer_ % queue_.size()];
   CAFFE_ENFORCE(inputs.size() >= result.size());
-  // NOLINTNEXTLINE(clang-diagnostic-unused-variable)
-  const auto& name = name_.c_str();
+  C10_UNUSED const auto& name = name_.c_str();
   for (const auto i : c10::irange(result.size())) {
     using std::swap;
     swap(*(inputs[i]), *(result[i]));
@@ -449,7 +449,6 @@ function(torch_compile_options libname)
       -Wall
       -Wextra
       -Wno-unused-parameter
-      -Wno-unused-variable
       -Wno-unused-function
       -Wno-unused-result
       -Wno-unused-local-typedefs
@@ -5051,7 +5051,6 @@ Tensor group_norm_jvp(
 Tensor group_norm_mean_jvp(
     const Tensor& input_t, const Tensor& mean_p, int64_t groups) {
   int64_t N = input_t.size(0);
-  int64_t C = input_t.size(1);
   std::array<int64_t, 3> view_shape = {1, N * groups, N ? -1 : 1};
   auto input_t_reshaped = input_t.view(view_shape);
   return input_t_reshaped.mean({2}, false).view_as(mean_p);

@@ -5062,7 +5061,6 @@ Tensor group_norm_invstd_jvp(
     const Tensor& mean_p, const Tensor& invstd_p,
     int64_t groups) {
   int64_t N = input_p.size(0);
-  int64_t C = input_p.size(1);
 
   std::vector<int64_t> view_shape = {1, N * groups, N ? -1 : 1};
 
@@ -328,7 +328,6 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalStateBase {
     // one uint64_t variable as key.
     std::unordered_map<uint64_t, libkineto::GenericTraceActivity*>
         tidSeq2activity;
-    uint64_t fwd_bwd_link_id = 1;
 
     for (const auto idx : c10::irange(cpu_trace->activities.size())) {
       auto& kineto_event = kineto_events_[idx];
@@ -603,7 +602,6 @@ void pushProfilingCallbacks(const std::unordered_set<at::RecordScope>& scopes) {
         if (!state_ptr) {
           return nullptr;
         }
-        const auto& config = state_ptr->config();
         auto corr_id = next_correlation_id();
         torch::profiler::impl::kineto::pushCorrelationId(corr_id);
         return state_ptr->record_queue_.getSubqueue()->begin_op(fn, corr_id);
@@ -2836,8 +2836,9 @@ void ProcessGroupGloo::monitoredBarrier(
 
   waitLoop(sendWorkMap);
 
-  auto elapsedTime = std::chrono::duration_cast<std::chrono::milliseconds>(
-      std::chrono::steady_clock::now() - startTime);
+  using namespace std::chrono;
+  C10_UNUSED auto elapsedTime = duration_cast<milliseconds>(
+      steady_clock::now() - startTime);
 }
 
 void ProcessGroupGloo::setSequenceNumberForGroup() {
@@ -31,8 +31,6 @@ constexpr const char* const kNCCLAbortedCommStoreKey = "NCCLABORTEDCOMM";
 
 namespace {
 
-constexpr int kBytes = 8;
-
 // RAII helper class to manage NCCL group API and CUDA free mutex.
 // The destructor is allowed to throw since this helper class only
 // manages group and lock lifetimes.

@@ -440,10 +438,6 @@ void ProcessGroupNCCL::WorkNCCL::synchronizeInternal(
 
   // In case of blocking, wait for the operation to complete.
   if (blockingWait_) {
-    // Use the passed in timeout if provided, otherwise use the default
-    // opTimeout for each WorkNCCL object.
-    std::chrono::milliseconds workTimeout =
-        timeout == kNoTimeout ? opTimeout_ : timeout;
     // Wait for the operation to complete.
     while (!isCompleted()) {
       if (timedOut()) {
@@ -1492,7 +1492,6 @@ TensorView* gather(
       ". Padding right: ",
       pad_right);
   const auto out_stop_offset = inp_stop_offset.value() + extent_adjustment;
-  Val* out_axis_dim = nullptr;
   out_root_domains.push_back(IrBuilder::create<IterDomain>(
       FusionGuard::getCurFusion()->zeroVal(),
       inp_axis->extent(),
@@ -938,14 +938,13 @@ class CudaKernelGenerator : private OptOutConstDispatch {
 
     indent() << genMmaOp(mma, true) << "(reinterpret_cast<Array<"
              << mma->out()->getDataType().value() << ","
-             << getOutputRegisterSize(mma->options().macro) << ","
-             << getOutputRegisterSize(mma->options().macro) << ">*>"
+             << getOutputRegisterSize(options.macro) << ","
+             << getOutputRegisterSize(options.macro) << ">*>"
              << "(&" << gen(uop->out()) << "));\n";
   }
 
   void handle(const MmaOp* mma) final {
     auto options = mma->options();
-    auto in_a = mma->inA()->as<kir::TensorIndex>();
     auto out = mma->out()->as<kir::TensorIndex>();
     indent() << genMmaOp(mma) << "(\n";
     indent() << kTab << "reinterpret_cast<Array<"
@@ -967,7 +966,6 @@ class CudaKernelGenerator : private OptOutConstDispatch {
 
   void handle(const BroadcastOp* stmt) final {
     TORCH_INTERNAL_ASSERT(stmt->out()->isA<kir::TensorIndex>());
-    const auto tensor_index = stmt->out()->as<kir::TensorIndex>();
 
     const ParallelTypeBitmap parallel_types =
         kernel_->summary().broadcast_parallel_types.at(stmt);

@@ -1313,7 +1311,6 @@ class CudaKernelGenerator : private OptOutConstDispatch {
     TORCH_INTERNAL_ASSERT(rop->isFused());
 
     const auto out = rop->out()->as<kir::TensorIndex>();
-    const auto domain = out->view()->domain();
 
     const auto data_type = rop->out()->dtype();
     const auto op_type = rop->getReductionOpType();

@@ -1384,11 +1381,6 @@ class CudaKernelGenerator : private OptOutConstDispatch {
         parallel_types.hasBID(),
         "GridBroadcast needs to be used with a broadcast op that is parallelized with the BID parallel types");
 
-    const auto out = bop->out()->as<kir::TensorIndex>();
-    const auto domain = out->view()->domain();
-
-    const auto data_type = bop->out()->dtype();
-
     TORCH_INTERNAL_ASSERT(
         grop->broadcast_buffer()->buffer()->isA<TensorView>());
     TORCH_INTERNAL_ASSERT(grop->sync_buffer()->buffer()->isA<TensorView>());

@@ -1499,7 +1491,6 @@ class CudaKernelGenerator : private OptOutConstDispatch {
     TORCH_INTERNAL_ASSERT(wop->isFused());
 
     const auto out = wop->out()->as<kir::TensorIndex>();
-    const auto domain = out->view()->domain();
 
     const auto data_type = wop->outAvg()->dtype();
     const auto index_type = wop->outN()->dtype();
@@ -548,7 +548,7 @@ FusionExecutor::GlobalBuffers FusionExecutor::allocGlobalVals(
   FUSER_PERF_SCOPE("FusionExecutor::AllocGlobalVals");
   GlobalBuffers global_buffers;
   const auto kernel = lowered_->kernel();
-  const auto& kernel_summary = lowered_->kernel()->summary();
+  const auto& kernel_summary = kernel->summary();
   for (auto alloc : kernel_summary.global_allocations) {
     TORCH_INTERNAL_ASSERT(
         alloc->buffer()->isA<TensorView>(),

@@ -611,9 +611,6 @@ void validateAlignedVectorizedTensors(
 
   // Verify extents of aligned vectorized tensors
   for (const auto& vec_info : kernel->summary().vectorized_set_info) {
-    auto in_tv = vec_info.producer_tv;
-    auto out_tv = vec_info.consumer_tv;
-
     if (vec_info.vectorized_leaf_id->getParallelType() ==
         ParallelType::Vectorize) {
       validateAlignedVectorizeExtents(vec_info, expr_eval);
@@ -2303,7 +2303,7 @@ void separateNestedViews(Node* cuda_fusion_group) {
     auto parent = parent_value->node();
 
     auto grandparent_value = parent->input(0);
-    auto grandparent = grandparent_value->node();
+    C10_UNUSED auto grandparent = grandparent_value->node();
 
     // Before: gp -> x -> n
     // After: gp -> x / gp -> n

@@ -814,8 +814,6 @@ indexMapFromTV(
     kir::ForLoop* alloc_loop,
     bool as_consumer,
     kir::ForLoop* double_buffer_loop = nullptr) {
-  const auto gpu_lower = GpuLower::current();
-
   bool within_alloc = false;
   if (alloc_loop == nullptr) {
     within_alloc = true;
@@ -142,7 +142,7 @@ void Scope::insert(size_t pos, Expr* expr) {
 
 void Scope::erase(std::vector<Expr*>::const_iterator pos) {
   // Remove the scope of the expr if this is the scope
-  auto expr = *pos;
+  C10_UNUSED auto expr = *pos;
   exprs_.erase(pos);
 }
 

@@ -58,9 +58,6 @@ class AllocationInserter : public kir::ExprMutator {
   // Fills info.buffer, info.alloc_pos, info.init_for_loop,
   // info.init_place_before, info.alloc_for_loop, info.alloc_place_before
   void fillAllocationInformation(AllocationInformation& info, Expr* expr) {
-    size_t alloc_pos = 0;
-    kir::ForLoop* init_for_loop = nullptr;
-    size_t fl_idx_next = 0;
     auto loop_alloc_info =
         loop_utils::getAllocInformation(info.buffer, for_loops_);
 
@@ -215,8 +215,6 @@ class DoubleBufferLoopCloner : public kir::IrVisitor {
   }
 
   void handle(kir::ForLoop* fl) final {
-    const auto gpu_lower = GpuLower::current();
-
     kir::ForLoop* cloned_loop = fl == double_buffer_loop_
         ? cloned_top_level_loop_
         : IrBuilder::create<kir::ForLoop>(fl);

@@ -326,8 +326,6 @@ void HaloInfo::insertToInheritanceMap(
 void HaloInfo::initializeFromRootAxisInfo(IterDomain* id) {
   TORCH_INTERNAL_ASSERT(hasRootAxisInfo(id));
 
-  auto gpu_lower = GpuLower::current();
-
   const auto& halo_info = getRootAxisInfo(id);
   auto halo_width = halo_info.width();
 

@@ -350,8 +348,6 @@ void HaloInfo::setHaloWidth(IterDomain* id, int halo_width) {
 
 // Propagate extent information from root axes to descendants
 void HaloInfo::build(TensorDomain* td) {
-  auto gpu_lower = GpuLower::current();
-
   auto exprs = DependencyCheck::getAllExprsBetween(
       {td->getMaybeRFactorDomain().begin(), td->getMaybeRFactorDomain().end()},
       {td->domain().begin(), td->domain().end()});
@@ -251,7 +251,7 @@ void OptOutMutator::mutate(MmaOp* mma) {
   auto container = mma->container();
   auto options = mma->options();
   container->removeExpr(mma);
-  auto new_mma =
+  C10_UNUSED auto new_mma =
       IrBuilder::create<MmaOp>(container, out, in_a, in_b, init, options);
 }
 

@@ -357,7 +357,7 @@ void OptOutMutator::mutate(Split* s) {
   auto container = s->container();
   auto inner_split = s->innerSplit();
   container->removeExpr(s);
-  auto new_node = IrBuilder::create<Split>(
+  C10_UNUSED auto new_node = IrBuilder::create<Split>(
      container, ot, inr, in, fact, inner_split, start_offset, stop_offset);
 }
 

@@ -373,7 +373,7 @@ void OptOutMutator::mutate(Merge* m) {
 
   auto container = m->container();
   container->removeExpr(m);
-  auto new_node = IrBuilder::create<Merge>(container, ot, otr, in);
+  C10_UNUSED auto new_node = IrBuilder::create<Merge>(container, ot, otr, in);
 }
 
 void OptOutMutator::mutate(kir::Allocate*) {
@@ -93,8 +93,6 @@ void ParallelDimensionMap::populateDimensionMapWithSingleCASet(
     const std::unordered_set<IterDomain*>& dom_set) {
   TORCH_INTERNAL_ASSERT(dom_set.size() == 1);
 
-  const auto gpu_lower = GpuLower::current();
-
   // pt is used by only one concrete domain
   auto id = *dom_set.begin();
   auto it = constant_extent_map_.find(id);

@@ -119,8 +117,6 @@ void ParallelDimensionMap::populateDimensionMapWithMultipleCASet(
     const std::unordered_set<IterDomain*>& dom_set) {
   TORCH_INTERNAL_ASSERT(dom_set.size() > 1);
 
-  const auto gpu_lower = GpuLower::current();
-
   bool all_equal = true;
   // Use nullptr to signal it's not initialied yet
   Val* known_dimension = nullptr;
@@ -1369,8 +1369,6 @@ class IrParser {
       REGISTER_PARSE_RULE(
           ptr_op,
           {
-            auto fusion = FusionGuard::getCurFusion();
-
             // TODO: handle channels last
             MemoryFormat format;
             std::list<Val*> list_val;

@@ -8,7 +8,6 @@ namespace fuser {
 namespace cuda {
 
 void PartialSplitMap::build(Fusion* fusion) {
-  const auto gpu_lower = GpuLower::current();
   auto used_vals = ir_utils::allTvs(fusion);
 
   for (auto tv : ir_utils::filterByType<TensorView>(used_vals)) {
@@ -444,7 +444,6 @@ void UnswitchPredicate::predicateOn(Expr* tv_expr) {
 
   auto ref_pred_info = Index::getReferenceRootPredicates(
       out_tv, for_loops_, unrolled_loop_, false);
-  const ReferenceTensor& reference = ref_pred_info.second;
 
   // If RootPredicateInfo has a static predicate that is more
   // restrictive than the current one, replace the current with the

@@ -109,7 +109,6 @@ bool canValidateIsInnerDim(
     } else if (auto merge = dynamic_cast<Merge*>(expr)) {
       // Might consider just rejecting merge.
       auto outer = merge->outer();
-      auto inner = merge->inner();
       if (outer->isBroadcast()) {
         return false;
       }
@@ -808,7 +808,6 @@ PersistentBufferSizeReturn persistentBufferSize(
   std::vector<bool> persistent_mask(all_buffers.size(), false);
 
   for (auto buffer_i : c10::irange(persistent_buffers.size())) {
-    auto buffer = all_buffers[buffer_i];
     persistent_mask[buffer_i] = true;
   }
 

@@ -855,7 +854,6 @@ PersistentBufferSizeReturn persistentBufferSize(
   int64_t max_persistence_size = 0;
   int64_t max_proj_persistence_size = 0;
   for (const auto& entry : scoped_persistence_factor) {
-    auto val = entry.first;
     auto active_buffers = entry.second;
     auto persistent_buffer_size = masked_dot_product(
         persistent_mask, active_buffers, persistent_buffer_sizes);
@@ -254,7 +254,6 @@ class NaiveTypePropagator {
       }
       case aten::_batch_norm_impl_index_backward:
       case aten::native_batch_norm_backward: {
-        int grad_input_index = 1;
         int weight_index = -1;
         int mask_index = -1;
         if (node->kind() ==

@@ -486,7 +485,6 @@ class NaiveTypePropagator {
         TORCH_CHECK(
             hasTypeAndDevice(in_type),
             "Type and device propagation has failed, or was not provided enough information.");
-        const auto in_scalar_type = in_type->scalarType();
         const auto in_device = in_type->device();
         const auto cuda_enabled = constant_as<bool>(node->input(1));
         const auto cpu_enabled = constant_as<bool>(node->input(2));
@@ -5543,7 +5543,6 @@ std::vector<Function*> CompilationUnit::define(
 
 void eraseListLiterals(std::shared_ptr<Graph>& graph) {
   DepthFirstGraphNodeIterator it(graph);
-  Node* n = nullptr;
 
   for (auto next_node = it.next(); next_node != nullptr;) {
     Node* node = next_node;

@@ -543,6 +543,7 @@ void IRParser::parse() {
       TORCH_INTERNAL_ASSERT(dtype);
       auto options = at::TensorOptions(*device).dtype(*dtype);
       auto t = n->t_(attr::value, at::empty_strided(*sizes, *strides, options));
+      (void)t;
     }
   }
 
@@ -264,8 +264,7 @@ c10::impl::GenericList Function::run(
     const c10::IValue& input = inputs[i];
     const auto& spec = input_specs_[i];
     const auto& input_tensor = input.toTensor();
-    TORCH_CHECK(
-        input_specs_[i].validate(input_tensor), "Invalid input at pos: ", i);
+    TORCH_CHECK(spec.validate(input_tensor), "Invalid input at pos: ", i);
     args[i] = input_tensor.data_ptr();
   }
   offset += inputs.size();

@@ -42,8 +42,7 @@ struct OldOpsReplacerWithUpgraders {
           get_operator_version_map().find(schema_name.value());
       if (version_entry != get_operator_version_map().end()) {
         const auto& entry = version_entry->second;
-        auto upgrader_entry =
-            findUpgrader(version_entry->second, current_version);
+        auto upgrader_entry = findUpgrader(entry, current_version);
         if (!upgrader_entry.has_value()) {
           if (!isOpSymbolCurrent(schema_name.value(), current_version)) {
             TORCH_INTERNAL_ASSERT(
@@ -619,7 +619,7 @@ std::vector<SSArgument> getNodeInputShapes(Node* n, const AliasDb& db) {
   for (size_t node_index = 0; node_index < n->inputs().size(); ++node_index) {
     auto type = n->input(node_index)->type();
 
-    if (auto tt = type->castRaw<TensorType>()) {
+    if (type->castRaw<TensorType>()) {
      input_shapes.push_back(tensorShapeArg(n->input(node_index)));
      continue;
    }

@@ -92,7 +92,7 @@ void RunDecompositions(Block* block) {
 
 void RunDecompositions(std::shared_ptr<Graph> g) {
   RunDecompositions(g->block());
-  for (const auto _ : c10::irange(2)) {
+  for (C10_UNUSED const auto _ : c10::irange(2)) {
     PeepholeOptimize(g, /*disable_shape_peephole*/ true);
     ConstantPropagation(g);
   }
@@ -120,7 +120,6 @@ FusionStrategy setFusionStrategy(FusionStrategy& strategy) {
 }
 
 static std::atomic<size_t> num_profiled_runs{kDefaultNumProfiledRuns};
-static std::atomic<size_t> bailout_depth{kDefaultBailoutDepth};
 
 std::atomic<bool>& getProfilingMode() {
   return profiling_mode;

@@ -92,7 +92,6 @@ bool isUnsupportedOp(Node* node) {
 bool canEnableStaticRuntime(const std::shared_ptr<torch::jit::Graph>& graph) {
   // check for sub-blocks
   bool can_support = true;
-  bool has_blocks = false;
   for (auto* node : graph->block()->nodes()) {
     const auto kind = node->kind();
     if (kind == prim::Constant) {
@@ -407,7 +407,6 @@ void StandardMemoryPlanner::deallocateManagedTensors() {
   for (auto& ms : managed_tensors_) {
     const auto& tensors = ms.group();
     size_t max = ms.maxTensorSize();
-    auto tensor_idx = 0;
     for (auto& tensor : tensors) {
       const auto& storage = tensor->storage();
       size_t current_size = compute_aligned_tensor_size(storage.nbytes());
@@ -149,7 +149,6 @@ std::vector<std::pair<BufPtr, BufPtr>> AllocBufsWithMemReuse(
     }
 
     auto start = std::get<0>(buf_ranges.at(buf));
-    auto end = std::get<1>(buf_ranges.at(buf));
 
     // Release memory for buffers whose liveness range ends before the creation
     // time of this buf.
@@ -1281,7 +1281,6 @@ void TensorExprKernel::bindConstant(const torch::jit::Value* v) {
   }
   auto const_tensor = toIValue(v)->toTensor();
   auto scalar_type = c10::typeMetaToScalarType(const_tensor.options().dtype());
-  const auto& tt = v->type()->expect<TensorType>();
   auto sizes = const_tensor.sizes();
   std::vector<ExprHandle> te_sizes;
   te_sizes.reserve(sizes.size());
@@ -1743,8 +1743,7 @@ int nnc_lowerings_lazy_registration() {
 } // namespace
 
 NNCLoweringFunction getStandardLoweringFor(const std::string& schema_str) {
-  // NOLINTNEXTLINE
-  static const int once = nnc_lowerings_lazy_registration();
+  C10_UNUSED static const int once = nnc_lowerings_lazy_registration();
   const auto& lowerings = getNNCLoweringRegistry();
   if (auto l = lowerings.find(parseSchema(schema_str))) {
     return *l;
@@ -259,8 +259,6 @@ Tensor computeDequantizeExternalCall(
   }
 
   const BufHandle& qx = c10::get<BufHandle>(inputs[0]);
-  const double qscale = immQScale(qx);
-  const int64_t qzero = immQZero(qx);
   const int64_t qdtype = (int64_t)immQDType(qx);
 
   BufHandle ResultBuf("dequantize", outputShape, dtype);
@@ -168,7 +168,7 @@ torch::lazy::BackendDataPtr TSBackendImpl::CreateDataPlaceholder(
 std::vector<torch::lazy::ComputationPtr> TSBackendImpl::Compile(
     std::vector<torch::lazy::ComputationPtr> instances) const {
   for (const auto& instance : instances) {
-    auto ts_computation =
+    C10_UNUSED auto ts_computation =
         static_cast<torch::lazy::TSComputation*>(instance.get());
   }
   return instances;
@@ -72,7 +72,6 @@ std::list<std::pair<at::RecordFunctionHandle, int>> flattenOpIdList(c10::List<c1
 }
 
 std::list<std::pair<at::RecordFunctionHandle, int>> getInputTensorOpIds(const at::RecordFunction& fn) {
-  int num_inputs = fn.inputs().size();
   std::pair<at::RecordFunctionHandle, int> undefined_op_pair(0,-1);
   std::list<std::pair<at::RecordFunctionHandle, int>> input_producer_ops_;
   auto state_ptr = NVTXThreadLocalState::getTLS();
@@ -35,7 +35,7 @@ ApproximateClockToUnixTimeConverter::measurePair() {
 ApproximateClockToUnixTimeConverter::time_pairs
 ApproximateClockToUnixTimeConverter::measurePairs() {
   static constexpr auto n_warmup = 5;
-  for (const auto _ : c10::irange(n_warmup)) {
+  for (C10_UNUSED const auto _ : c10::irange(n_warmup)) {
     getApproximateTime();
     steady_clock_t::now();
   }