Revert D31326599: [pytorch][PR] Compile without -Wno-unused-variable

Test Plan: revert-hammer

Differential Revision: D31326599 (a6280ab653)

Original commit changeset: 924155f1257a

fbshipit-source-id: b8ee5bc0298637443232f5ee9ec79e51ed256faf

This commit is contained in:
parent 5ef350d7cc
commit e4ee5ca698
@@ -744,6 +744,7 @@ if(NOT MSVC)
string(APPEND CMAKE_CXX_FLAGS " -Wno-unknown-pragmas")
string(APPEND CMAKE_CXX_FLAGS " -Wno-sign-compare")
string(APPEND CMAKE_CXX_FLAGS " -Wno-unused-parameter")
string(APPEND CMAKE_CXX_FLAGS " -Wno-unused-variable")
string(APPEND CMAKE_CXX_FLAGS " -Wno-unused-function")
string(APPEND CMAKE_CXX_FLAGS " -Wno-unused-result")
string(APPEND CMAKE_CXX_FLAGS " -Wno-unused-local-typedefs")
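The hunk above restores the blanket suppression to the global C++ flags; the per-file hunks below then back out the targeted fixes the original commit had made instead. As a minimal sketch (hypothetical code, not from the tree), these are the idioms the diff keeps switching between when a variable is deliberately left unused:

// Hypothetical example, not from the PyTorch sources.
int produce() { return 42; }

void demo() {
  int a = produce();
  (void)a;  // a void cast counts as a use, so -Wunused-variable stays quiet

  // NOLINTNEXTLINE(clang-diagnostic-unused-variable)
  int b = produce();  // silences the diagnostic when it is reported through clang-tidy

  [[maybe_unused]] int c = produce();  // C++17 attribute; the C10_UNUSED marker in later hunks serves the same purpose
}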
@@ -6,6 +6,6 @@ namespace at { namespace native {

// Since size of MKL_LONG varies on different platforms (linux 64 bit, windows
// 32 bit), we need to programmatically calculate the max.
constexpr int64_t MKL_LONG_MAX = ((1LL << (sizeof(MKL_LONG) * 8 - 2)) - 1) * 2 + 1;
static int64_t MKL_LONG_MAX = ((1LL << (sizeof(MKL_LONG) * 8 - 2)) - 1) * 2 + 1;

}} // namespace
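Both variants above evaluate to the same number; the expression is a way to get the largest signed value that fits in sizeof(MKL_LONG) bytes without shifting into the sign bit: (2^(8N-2) - 1) * 2 + 1 == 2^(8N-1) - 1. A small self-contained check of that identity (illustrative only, using fixed-width types rather than MKL_LONG):

#include <cstdint>

// (2^(8N-2) - 1) * 2 + 1 == 2^(8N-1) - 1, the maximum of an N-byte signed
// two's-complement integer, computed without overflowing the shift.
constexpr int64_t kMax64 = ((1LL << (sizeof(int64_t) * 8 - 2)) - 1) * 2 + 1;
static_assert(kMax64 == INT64_MAX, "64-bit case");

constexpr int64_t kMax32 = ((1LL << (sizeof(int32_t) * 8 - 2)) - 1) * 2 + 1;
static_assert(kMax32 == INT32_MAX, "32-bit case");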
@@ -173,7 +173,6 @@ TEST(TestStream, StreamPoolTest) {
if (!at::cuda::is_available()) return;
std::vector<at::cuda::CUDAStream> streams{};
for (const auto i : c10::irange(200)) {
(void)i;
streams.emplace_back(at::cuda::getStreamFromPool());
}

@@ -84,8 +84,6 @@ TEST(CUDAPytorchToCaffe2, Op) {

auto* c2_tensor_a = BlobSetTensor(workspace.CreateBlob("a"), caffe2::Tensor(at_tensor_a));
auto* c2_tensor_b = BlobSetTensor(workspace.CreateBlob("b"), caffe2::Tensor(at_tensor_b));
(void)c2_tensor_a;
(void)c2_tensor_b;

// Test Alias
{
@@ -54,6 +54,8 @@ TEST(MathKernelTest, NativeGroupNorm) {
TEST(MathKernelTest, NativeLayerNorm) {
const auto input = rand({20, 10, 10, 10});
const auto input_shape = input.sizes();
// NOLINTNEXTLINE(clang-analyzer-deadcode.DeadStores)
const auto input_ndim = input.dim();

double eps = 1e-05;
for (bool undef_weight: {true, false}) {
@@ -15,8 +15,4 @@ if(USE_CUDA)
main.cpp)

target_link_libraries(nvfuser_bench PRIVATE torch_library benchmark)
if(NOT MSVC)
target_compile_options(nvfuser_bench PRIVATE -Wno-unused-variable)
endif()

endif()
@@ -6,9 +6,6 @@ if(BUILD_TEST)
get_filename_component(test_file_name ${test_src} NAME_WE)
set(test_name "c10_${test_file_name}")
add_executable(${test_name} "${test_src}")
if(NOT MSVC)
target_compile_options(${test_name} PRIVATE -Wno-unused-variable)
endif()
target_link_libraries(${test_name} c10 gmock gtest gtest_main)
add_test(NAME ${test_name} COMMAND $<TARGET_FILE:${test_name}>)
if(INSTALL_TEST)
@@ -1762,9 +1762,6 @@ if(BUILD_TEST)
target_include_directories(${test_name} PRIVATE $<INSTALL_INTERFACE:include>)
target_include_directories(${test_name} PRIVATE $<BUILD_INTERFACE:${CMAKE_BINARY_DIR}/include>)
target_include_directories(${test_name} PRIVATE ${Caffe2_CPU_INCLUDE})
if(NOT MSVC)
target_compile_options(${test_name} PRIVATE -Wno-unused-variable)
endif()
add_test(NAME ${test_name} COMMAND $<TARGET_FILE:${test_name}>)
if(INSTALL_TEST)
install(TARGETS ${test_name} DESTINATION test)
@@ -399,8 +399,8 @@ void TensorSerializer::SerializeWithOptions(
std::vector<std::future<void>> futures;
if (tensor.numel() > chunk_size) {
futures.reserve(FLAGS_caffe2_max_tensor_serializer_threads);
// NOLINTNEXTLINE(clang-analyzer-deadcode.DeadStores,clang-diagnostic-unused-variable)
for (const auto i : c10::irange(FLAGS_caffe2_max_tensor_serializer_threads)) {
(void)i;
futures.emplace_back(std::async(std::launch::async, task));
}
}
@@ -19,6 +19,7 @@ class CastOp : public Operator<Context> {
: Operator<Context>(operator_def, ws) {
const ArgumentHelper helper(operator_def);
TensorProto_DataType to = cast::GetCastDataType(helper, "to");
TensorProto_DataType from = cast::GetCastDataType(helper, "from_type");

SetBody(to);
}
@@ -576,9 +576,7 @@ bool CudnnConvOp::DoRunWithType() {
return true;
}

#if !CUDNN_VERSION_MIN(7, 0, 0)
int group_offset_filter = filter.numel() / group_;
#endif

// Set up the cudnn algorithms & workspace if necessary
bool input_changed = (X.sizes() != cudnn_input_dims_);

@@ -953,9 +951,7 @@ bool CudnnConvGradientOp::DoRunWithType() {
"If you set group, the number of output channels should be divisible "
"by group.");

#if !CUDNN_VERSION_MIN(7, 0, 0)
int group_offset_filter = filter.numel() / group_;
#endif
if (kernel_.size() == 1) {
ConvPoolOpBase<CUDAContext>::ComputePads({H});
} else if (kernel_.size() == 2) {
@@ -25,10 +25,11 @@ float compress_uniform_simplified_(
float inverse_scale = 1.0f / scale;

float norm = 0.0f;
// NOLINTNEXTLINE(clang-diagnostic-unused-variable)
constexpr int VLEN = 8;
int i = 0;

#ifdef __AVX__
constexpr int VLEN = 8;
// vectorized loop
__m256 norm_v = _mm256_setzero_ps();
for (; i < N / VLEN * VLEN; i += VLEN) {
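The loop bound N / VLEN * VLEN above rounds N down to a multiple of VLEN so the vectorized body never reads past the input; whatever is left over is handled by a scalar tail after the #ifdef block. A rough sketch of that structure without intrinsics (illustrative only; the real loop works on __m256 registers with _mm256_* operations):

float sum_squares(const float* x, int N) {
  constexpr int VLEN = 8;  // one AVX register holds 8 floats
  float norm = 0.0f;
  int i = 0;
  // vectorized body: covers the largest multiple of VLEN elements
  for (; i < N / VLEN * VLEN; i += VLEN) {
    for (int j = 0; j < VLEN; ++j) {  // stand-in for the _mm256_* arithmetic
      norm += x[i + j] * x[i + j];
    }
  }
  // scalar tail for the remaining N % VLEN elements
  for (; i < N; ++i) {
    norm += x[i] * x[i];
  }
  return norm;
}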
@@ -72,6 +72,7 @@ class FloatToFusedNBitFakeRowwiseQuantizedOp final
CAFFE_THROW("Unsupported data type");
}

bool use_openmp = GREEDY;
#ifdef _OPENMP
vector<float> tmp_vec(input_columns * (GREEDY ? omp_get_max_threads() : 1));
#else
@@ -30,6 +30,7 @@ class GatherFused8BitRowwiseOp : public Operator<Context> {
const std::vector<int64_t> shape = {indices.size(0), data.size(1) - 8};
auto* output = Output(0, shape, at::dtype<float>());

int block_size = shape[1];
auto block_bytesize = data.size_from_dim(1) * data.dtype().itemsize();
int N = indices.numel();

@@ -133,7 +133,8 @@ std::vector<int> soft_nms_cpu_upright(

// Find proposal with max score among remaining proposals
int max_pos;
GetSubArray(*out_scores, pending).maxCoeff(&max_pos);
// NOLINTNEXTLINE(clang-diagnostic-unused-variable)
auto max_score = GetSubArray(*out_scores, pending).maxCoeff(&max_pos);
int i = pending[max_pos];
keep.push_back(i);

@@ -634,7 +635,8 @@ std::vector<int> soft_nms_cpu_rotated(

// Find proposal with max score among remaining proposals
int max_pos;
GetSubArray(*out_scores, pending).maxCoeff(&max_pos);
// NOLINTNEXTLINE(clang-diagnostic-unused-variable)
auto max_score = GetSubArray(*out_scores, pending).maxCoeff(&max_pos);
int i = pending[max_pos];
keep.push_back(i);

@@ -458,6 +458,8 @@ bool HuffmanTreeHierarchyOp<T, Context>::RunOnDevice() {
std::vector<int> labelIndices;
labelIndices.resize(num_classes_);

// NOLINTNEXTLINE(clang-diagnostic-unused-variable)
int current_node_index = 0;
for (int i = 0; i < num_classes_; ++i) {
Node node(i, labelCounts[i]);
nodes.push(node);
@@ -132,6 +132,8 @@ class LayerNormGradientOp final : public Operator<Context> {
template <typename T>
bool DoRunWithType() {
const auto& dY = Input(0);
// NOLINTNEXTLINE(clang-diagnostic-unused-variable)
const auto& Y = Input(1);
const auto& mean = Input(2);
const auto& sigma = Input(3);
const auto& X = Input(4);
@@ -53,6 +53,7 @@ class SparseLengths8BitsRowwiseOp : public Operator<Context> {
"the second dim of scale_bias has to be equal to 2");
CAFFE_ENFORCE_EQ(1, indicesInput.dim(), "INDICES must be a vector");
const IndexType* indices = indicesInput.template data<IndexType>();
int64_t dataToReduceSize = indicesInput.size(0);

const int* lengths = lengthsInput.template data<int>();
vector<int64_t> shape = dataInput.sizes().vec();
@@ -193,6 +193,8 @@ bool CuDNNLRNGradientOp::DoRunWithType() {

bool CuDNNLRNGradientOp::RunOnDevice() {
// dispatch based on contents of tensor(s)
const auto& X = Input(0);
const auto& Y = Input(1);
const auto& dY = Input(2);
auto* dX = Output(0);

@@ -55,10 +55,8 @@ class Int8AddOp final : public Operator<CPUContext> {

initQNNPACK();

#if !defined(FBCODE_CAFFE2) && defined(USE_INTERNAL_PTHREADPOOL_IMPL)
pthreadpool_t threadpool =
reinterpret_cast<pthreadpool_t>(ws_->GetThreadPool());
#endif

if (this->qnnpackOperator_ == nullptr) {
const qnnp_status createStatus = qnnp_create_add_nc_q8(
@@ -47,6 +47,7 @@ class Int8ChannelShuffleOp final : public ConvPoolOpBase<CPUContext> {
const auto C = X.t.dim32(3);
const auto G = this->group_;
CAFFE_ENFORCE(C % G == 0, "");
const auto B = X.t.numel() / C;

initQNNPACK();

@@ -60,10 +60,8 @@ class Int8ConvOp final : public ConvPoolOpBase<CPUContext> {
runWithSharedBuffer<CPUContext>(ws_, [&](Tensor* buffer) {
initQNNPACK();

#if !defined(FBCODE_CAFFE2) && defined(USE_INTERNAL_PTHREADPOOL_IMPL)
pthreadpool_t threadpool =
reinterpret_cast<pthreadpool_t>(ws_->GetThreadPool());
#endif

if (this->qnnpackObject_ == nullptr) {
CAFFE_ENFORCE(
@@ -39,12 +39,17 @@ class Int8ConvTransposeOp final : public ConvTransposeUnpoolBase<CPUContext> {
const auto& W = Inputs()[1]->template Get<Int8TensorCPU>();
const auto& B = Inputs()[2]->template Get<Int8TensorCPU>();
auto* Y = Outputs()[0]->template GetMutable<Int8TensorCPU>();
const auto X_offset = -X.zero_point;
const auto W_offset = -W.zero_point;
const int32_t Y_offset =
this->template GetSingleArgument<int>("Y_zero_point", 0);
double Y_scale = this->template GetSingleArgument<float>("Y_scale", 1);
Y->scale = Y_scale;
Y->zero_point = Y_offset;

const auto N = X.t.size(0);
const auto IH = X.t.size(1);
const auto IW = X.t.size(2);
const auto IC = X.t.size(3);

CHECK_EQ(IC, W.t.size(0));

@@ -59,10 +64,8 @@ class Int8ConvTransposeOp final : public ConvTransposeUnpoolBase<CPUContext> {
runWithSharedBuffer<CPUContext>(ws_, [&](Tensor* buffer) {
initQNNPACK();

#if !defined(FBCODE_CAFFE2) && defined(USE_INTERNAL_PTHREADPOOL_IMPL)
pthreadpool_t threadpool =
reinterpret_cast<pthreadpool_t>(ws_->GetThreadPool());
#endif

if (this->qnnpackObject_ == nullptr) {
const qnnp_status createStatus = qnnp_create_deconvolution2d_nhwc_q8(
@@ -47,10 +47,8 @@ class Int8FCOp final : public Operator<CPUContext> {
runWithSharedBuffer<CPUContext>(ws_, [&](Tensor* buffer) {
initQNNPACK();

#if !defined(FBCODE_CAFFE2) && defined(USE_INTERNAL_PTHREADPOOL_IMPL)
pthreadpool_t threadpool =
reinterpret_cast<pthreadpool_t>(ws_->GetThreadPool());
#endif

if (this->qnnpackObject_ == nullptr) {
const qnnp_status createStatus = qnnp_create_fully_connected_nc_q8(
@@ -19,10 +19,10 @@ void Int8Quantize(
const int64_t N,
const float Y_scale,
const int32_t Y_offset) {
const float inv_scale = 1.0f / Y_scale;
uint32_t i = 0;

#ifdef INT8_NEON_SIMD
const float inv_scale = 1.0f / Y_scale;
const float32x4_t vinv_scale = vdupq_n_f32(inv_scale);
// magic float and magic int to take care of rounding
// int magic_round(float f): interpret_int32(f + 12582912.0f) - 0x4B400000
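The "magic float" comment above refers to a rounding trick: adding 1.5 * 2^23 (= 12582912.0f, whose bit pattern is 0x4B400000) leaves the rounded integer in the low mantissa bits, so subtracting the magic bit pattern recovers round-to-nearest. A scalar illustration of that comment (an assumption about what the NEON lanes compute, not code from this file):

#include <cstdint>
#include <cstring>
#include <cassert>

// Valid roughly for |f| < 2^22, which covers values already scaled to int8 range.
int32_t magic_round(float f) {
  float biased = f + 12582912.0f;             // 1.5 * 2^23, bits 0x4B400000
  int32_t bits;
  std::memcpy(&bits, &biased, sizeof(bits));  // reinterpret the float's bits
  return bits - 0x4B400000;
}

int main() {
  assert(magic_round(2.3f) == 2);
  assert(magic_round(-7.6f) == -8);
  assert(magic_round(0.5f) == 0);  // ties round to even
  return 0;
}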
@@ -38,6 +38,7 @@ class Int8SoftmaxOp final : public Operator<CPUContext> {
* in-place, we may overwrite these parameters later, when we set
* quantization parameters for output tensor.
*/
const uint8_t X_zero_point = X.zero_point;
const float X_scale = X.scale;

Y->scale = Y_scale;
@@ -141,6 +141,8 @@ class TextFileReaderReadOp : public Operator<CPUContext> {
(field > 0 && token.startDelimId == 1),
"Invalid number of columns at row ",
instance->rowsRead + rowsRead + 1);
// NOLINTNEXTLINE(clang-diagnostic-unused-variable)
const auto& meta = instance->fieldMetas[field];
char*& data = datas[field];
convert(
(TensorProto_DataType)instance->fieldTypes[field],
@@ -686,6 +686,8 @@ class ScatterAssignOp : public Operator<Context> {
const auto dataType = TypeMetaToDataType(data.dtype());
const auto slicesType = TypeMetaToDataType(slices.dtype());
const auto indicesType = TypeMetaToDataType(indices.dtype());
// NOLINTNEXTLINE(clang-diagnostic-unused-variable)
auto* output = Output(0);

auto runner = GetRunner(dataType, slicesType, indicesType);
(this->*runner)();
@@ -57,6 +57,8 @@ bool WeightedSampleOp<float, CPUContext>::RunOnDevice() {
}
}
} else {
// NOLINTNEXTLINE(clang-diagnostic-unused-variable,clang-analyzer-deadcode.DeadStores)
auto* out_idx = Output(0, {0}, at::dtype<int>());
if (OutputSize() == 2) {
auto* out_value = Output(1, {0}, at::dtype<float>());
out_value->template mutable_data<float>();
@@ -436,6 +436,13 @@ void BoundShapeInferencer::InferSparseLengthsSum(const OperatorDef& op) {
op.type() == "SparseLengthsWeightedSum4BitRowwiseSparse" ||
op.type() == "SparseLengthsSum4BitRowwiseSparse");

// NOLINTNEXTLINE(clang-analyzer-deadcode.DeadStores,clang-diagnostic-unused-variable)
const bool isSparse =
(op.type() == "SparseLengthsSum4BitRowwiseSparse" ||
op.type() == "SparseLengthsWeightedSum4BitRowwiseSparse" ||
op.type() == "SparseLengthsSum8BitRowwiseSparse" ||
op.type() == "SparseLengthsWeightedSum8BitRowwiseSparse");

if (weight) {
CAFFE_ENFORCE_GE(
op.input_size(),
@@ -533,6 +533,8 @@ bool fuseActivation(repr::NNModule* nn, caffe2::Workspace* ws) {
continue;
}
auto relu_node = consumers.front();
// NOLINTNEXTLINE(clang-diagnostic-unused-variable,clang-analyzer-deadcode.DeadStores)
auto relu = repr::nn::get<repr::Relu>(relu_node);

auto relu_outputs = repr::nn::getOutputs(relu_node);
if (relu_outputs.size() != 1) {

@@ -891,6 +893,10 @@ void preConvertFiltersFormat(repr::NNModule* nn, caffe2::Workspace* ws) {
initValue(strides, {1, 1});
auto pads = convTranspose->getPads();
initValue(pads, {0, 0, 0, 0});
// NOLINTNEXTLINE(clang-diagnostic-unused-variable,clang-analyzer-deadcode.DeadStores)
auto* op = getMutableOpDef(*convTranspose);
// NOLINTNEXTLINE(clang-diagnostic-unused-variable)
auto aalgorithm = ialgo::deconvolution_direct;
auto dataType = filter->get_data_type();
ideep::tensor::dims filter_dims_mkldnn{filter->get_dim(1),
filter->get_dim(0),
@@ -64,12 +64,12 @@ void RemapHistograms(Histogram& src_hist, Histogram& dst_hist) {
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
float dst_bin_begin = dst_hist.Min() + dst_bin_width * dst_bin;
float dst_bin_end = dst_bin_begin + dst_bin_width;
// NOLINTNEXTLINE(clang-diagnostic-unused-variable,clang-analyzer-deadcode.DeadStores)
int dst_bin2 =
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
dst_bin_width == 0 ? 0 : (src_bin_end - dst_hist.Min()) / dst_bin_width;
// 1 src_bin is mapped to at most 2 dst bin
assert(dst_bin2 <= dst_bin + 2);
(void)dst_bin2;

// dst_bin_cnt is the count from src_bin that should go to dst_bin
// The remainder should go to dst_bin2
@@ -698,8 +698,9 @@ TypeIdentifier Int8ConvDNNLowpPackedWeightBlobShapeFunctions::GetTypeMetaId() {

TypeMeta Int8FCDNNLowpPackedWeightBlobShapeFunctions::GetExternalTensorType(
const void* c) {
// const Int8FCDNNLowPPackedWeightBlob* int8_tensor =
// reinterpret_cast<const Int8FCDNNLowPPackedWeightBlob*>(c);
// NOLINTNEXTLINE(clang-diagnostic-unused-variable)
const Int8FCDNNLowPPackedWeightBlob* int8_tensor =
reinterpret_cast<const Int8FCDNNLowPPackedWeightBlob*>(c);
// We forced the output type to be uint8_t since we know it always is.
// If it is going to be implemented elsewhere, we might need to change here.
// return (int8_tensor->original_tensor).dtype();

@@ -708,8 +709,9 @@ TypeMeta Int8FCDNNLowpPackedWeightBlobShapeFunctions::GetExternalTensorType(

TypeMeta Int8ConvDNNLowpPackedWeightBlobShapeFunctions::GetExternalTensorType(
const void* c) {
// const Int8ConvDNNLowPPackedWeightBlob* int8_tensor =
// reinterpret_cast<const Int8ConvDNNLowPPackedWeightBlob*>(c);
// NOLINTNEXTLINE(clang-diagnostic-unused-variable)
const Int8ConvDNNLowPPackedWeightBlob* int8_tensor =
reinterpret_cast<const Int8ConvDNNLowPPackedWeightBlob*>(c);
// return (int8_tensor->original_tensor).dtype();
return TypeMeta::Make<uint8_t>();
}
@@ -21,7 +21,11 @@ TensorQuantizationParams P99::ChooseQuantizationParams(
float org_min = min;
// NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
float bin_width = (max - min) / nbins;
// NOLINTNEXTLINE(clang-diagnostic-unused-variable,bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions,clang-analyzer-deadcode.DeadStores)
int zero_bin = round(-min / bin_width);

// NOLINTNEXTLINE(clang-diagnostic-unused-variable)
int best_width = 0;
double total_sum = 0;
for (int i = 0; i < nbins; ++i) {
total_sum += bins_f[i];
@@ -152,7 +152,8 @@ bool NNPACKConvOp::RunOnDeviceWithOrderNCHW() {
auto& filter = Input(1);
auto* Y = Output(0);
CAFFE_ENFORCE(X.ndim() == 4, "Input dim should be 4");
const int C = X.dim32(1), H = X.dim32(2), W = X.dim32(3);
// NOLINTNEXTLINE(clang-analyzer-deadcode.DeadStores,clang-diagnostic-unused-variable)
const int N = X.dim32(0), C = X.dim32(1), H = X.dim32(2), W = X.dim32(3);
CAFFE_ENFORCE(filter.ndim() == 4, "");
const int M = filter.dim32(0);
CAFFE_ENFORCE(C % this->group_ == 0, "");

@@ -180,6 +181,12 @@ bool NNPACKConvOp::RunOnDeviceWithOrderNCHW() {
biasData = dummyBias_.data();
}

// NOLINTNEXTLINE(clang-analyzer-deadcode.DeadStores,clang-diagnostic-unused-variable)
const size_t batch_size = X.dim32(0);
// NOLINTNEXTLINE(clang-analyzer-deadcode.DeadStores,clang-diagnostic-unused-variable)
const size_t input_channels = X.dim32(1);
// NOLINTNEXTLINE(clang-analyzer-deadcode.DeadStores,clang-diagnostic-unused-variable)
const size_t output_channels = Y->dim32(1);
const nnp_size input_size = {.width = static_cast<size_t>(X.dim32(3)),
.height = static_cast<size_t>(X.dim32(2))};
// filter is MCHW
@@ -53,9 +53,6 @@ endif()
add_executable(test_api ${TORCH_API_TEST_SOURCES})
target_include_directories(test_api PRIVATE ${ATen_CPU_INCLUDE})
target_link_libraries(test_api PRIVATE torch gtest)
if(NOT MSVC)
target_compile_options(test_api PRIVATE -Wno-unused-variable)
endif()

if(USE_CUDA)
target_link_libraries(test_api PRIVATE
@@ -97,9 +97,6 @@ endif(MSVC)

target_link_libraries(test_jit PRIVATE ${JIT_TEST_DEPENDENCIES})
target_include_directories(test_jit PRIVATE ${ATen_CPU_INCLUDE})
if(NOT MSVC)
target_compile_options(test_jit PRIVATE -Wno-unused-variable)
endif()

if(LINUX)
#Update to target_link_options when CMake version can be upgraded
@@ -89,6 +89,7 @@ class BackendWithCompiler : public PyTorchBackendInterface {
at::Tensor h = val1.toTensor();

c10::List<at::Tensor> output_list;
double scalar_val = 1.0;
for (const auto& token : handle.toList()) {
IValue val = token;
auto instruction = val.toTuple()->elements()[0].toStringRef();
@@ -38,9 +38,6 @@ add_executable(test_tensorexpr
target_link_libraries(test_tensorexpr PRIVATE torch gtest)
target_include_directories(test_tensorexpr PRIVATE ${ATen_CPU_INCLUDE})
target_compile_definitions(test_tensorexpr PRIVATE USE_GTEST)
if(NOT MSVC)
target_compile_options(test_tensorexpr PRIVATE -Wno-unused-variable)
endif()

add_executable(tutorial_tensorexpr ${TENSOREXPR_TEST_ROOT}/tutorial.cpp)
target_link_libraries(tutorial_tensorexpr PRIVATE torch)
@@ -354,10 +354,6 @@ if(USE_PRECOMPILED_HEADERS)
"$<$<COMPILE_LANGUAGE:CXX>:ATen/ATen.h>")
endif()

if(NOT MSVC)
target_compile_options(torch_python PRIVATE -Wno-unused-variable)
endif()

# Required workaround for generated sources
# See https://samthursfield.wordpress.com/2015/11/21/cmake-dependencies-between-targets-and-files-and-custom-commands/#custom-commands-in-different-directories
add_dependencies(torch_python generate-torch-sources)
@@ -270,7 +270,6 @@ c10::intrusive_ptr<ProcessGroupMPI> ProcessGroupMPI::createProcessGroupMPI(
bool groupComm_updated = false;
MPI_Barrier(MPI_COMM_WORLD);
for (const auto i : c10::irange(kMaxNumRetries)) {
(void)i;
if (MPI_Comm_create(MPI_COMM_WORLD, ranksGroup, &groupComm)) {
groupComm_updated = true;
break;
@@ -939,7 +939,6 @@ std::vector<std::shared_ptr<NCCLComm>>& ProcessGroupNCCL::getNCCLComm(
// created before encountering any communication calls. This is why we need
// the following for loop.
for (const auto i : c10::irange(ncclActiveGroupCounter_)) {
(void)i;
C10D_NCCL_CHECK(ncclGroupEnd(), c10::nullopt);
}

@@ -979,7 +978,6 @@ std::vector<std::shared_ptr<NCCLComm>>& ProcessGroupNCCL::getNCCLComm(

// See [Group Start/End Note]
for (const auto i : c10::irange(ncclActiveGroupCounter_)) {
(void)i;
C10D_NCCL_CHECK(ncclGroupStart(), c10::nullopt);
}

@@ -1383,6 +1383,8 @@ void Reducer::finalize_bucket_dense(Bucket& bucket) {
auto& replica = bucket.replicas[replica_index];
for (const auto intra_bucket_index : c10::irange(replica.variables.size())) {
auto& variable = replica.variables[intra_bucket_index];
const auto offset = replica.offsets[intra_bucket_index];
const auto length = replica.lengths[intra_bucket_index];

bool global_unused = false;
// See Note [Skip allreducing local_used_map_dev]

@@ -1632,7 +1634,6 @@ void Reducer::sync_bucket_indices(
std::vector<size_t> bucket;
bucket.reserve(bucket_size);
for (const auto j : c10::irange(bucket_size)) {
(void)j;
bucket.push_back(indices_accessor[indices_accessor_Index++]);
}
bucket_indices.emplace_back(std::move(bucket));
@@ -280,7 +280,7 @@ void ThreadPredicateMap::insert(
kir::Bool* ThreadPredicateMap::getPredicate(const TensorView* tv) const {
// No thread predicate is needed when tv is an output of a
// parallel broadcast expression.
if (dynamic_cast<BroadcastOp*>(tv->definition())) {
if (auto bop = dynamic_cast<BroadcastOp*>(tv->definition())) {
if (getParallelBroadcastDomains(tv).any()) {
return kir::IrBuilder(GpuLower::current()->kernel()).trueVal();
}
@@ -5163,7 +5163,7 @@ std::unique_ptr<Function> CompilationUnit::define(
if (shouldMangle) {
// If `shouldMangle` is set, we should generate a unique name for this
// function if there is already an existing one.
if (find_function(name)) {
if (auto fn = find_function(name)) {
name = mangle(name);
}
}
@@ -153,7 +153,7 @@ std::shared_ptr<SugaredValue> SimpleValue::attr(
}
} else if (auto classType = value_->type()->cast<ClassType>()) {
// This is a class, emit the proper attribute lookup
if (classType->findMethod(field)) {
if (auto method = classType->findMethod(field)) {
return std::make_shared<MethodValue>(getValue(), field);
}
if (classType->hasAttribute(field)) {

@@ -169,7 +169,7 @@ std::shared_ptr<SugaredValue> SimpleValue::attr(
}
} else if (auto iface = value_->type()->cast<InterfaceType>()) {
// accessing methods of interfaces
if (iface->getMethod(field)) {
if (auto schema = iface->getMethod(field)) {
return std::make_shared<MethodValue>(getValue(), field);
}
} else if (auto enum_type = value_->type()->cast<EnumType>()) {
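These two hunks (like the CompilationUnit::define and ThreadPredicateMap hunks earlier) toggle between a bare condition and an if-with-initializer. The bound form names a variable that the body never reads, which is evidently what the original commit was removing; the revert puts the bindings back. A tiny illustration with made-up names:

#include <memory>

struct Method {};

std::shared_ptr<Method> find_method(int key) {
  return key == 0 ? std::make_shared<Method>() : nullptr;
}

bool has_method_plain(int key) {
  if (find_method(key)) {  // no binding, nothing that could be reported as unused
    return true;
  }
  return false;
}

bool has_method_bound(int key) {
  if (auto m = find_method(key)) {  // `m` exists only to be tested; the body never
    return true;                    // reads it, which is the unused-variable complaint
  }
  return false;
}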
@@ -166,7 +166,7 @@ void numToTensorBool(Stack& stack) {
push(stack, at::scalar_to_tensor(b));
}

static const C10_UNUSED std::array<mobile::prim_op_fn_register, 14> op_reg = {
static const std::array<mobile::prim_op_fn_register, 14> op_reg = {
mobile::prim_op_fn_register("prim::TupleIndex", tupleIndex),
mobile::prim_op_fn_register("aten::Bool.Tensor", boolTensor),
mobile::prim_op_fn_register("aten::format", aten_format),
@@ -3,9 +3,6 @@
namespace torch {
namespace jit {

// Start UUID at 1
static GraphPassNameType graphPassID = 1;

std::vector<GraphPassEntry>& getCustomPostPasses() {
static std::vector<GraphPassEntry> passes;
return passes;
@@ -26,6 +26,8 @@ using GraphPass = std::function<void(std::shared_ptr<Graph>&)>;
// Since Passes are std::functions, we associate a UUID to each pass, this way
// if we want to deregister a pass, we have something to reference it by.
using GraphPassNameType = unsigned int;
// Start UUID at 1
static GraphPassNameType graphPassID = 1;

// Graph pass entries have a name associated with them
using GraphPassEntry = std::pair<GraphPass, GraphPassNameType>;
@@ -158,8 +158,8 @@ struct CaptureList {
case CAPTURE_LIST: {
c10::List<at::Tensor> lst;
auto size = *size_it++;
// NOLINTNEXTLINE(clang-analyzer-deadcode.DeadStores,clang-diagnostic-unused-variable)
for (const auto i : c10::irange(size)) {
(void)i;
lst.emplace_back(var_capture_it->unpack(saved_for));
var_capture_it++;
}
@@ -54,7 +54,7 @@ Registerer& registerer() {
}

// global instance to run its constructor on startup
C10_UNUSED Registerer& dummy = registerer();
Registerer& dummy = registerer();

} // namespace

@@ -103,7 +103,7 @@ auto initBindings() {
return nullptr;
}

const auto C10_UNUSED torchBindInitializer = initBindings();
const auto torchBindInitializer = initBindings();

} // namespace

@@ -25,8 +25,8 @@ std::vector<at::Tensor> constructTensors(
for (const auto i : c10::irange(bufs_num)) {
buf_data_vec.push_back(buf_data[i]);
buf_dims_vec.emplace_back();
// NOLINTNEXTLINE(clang-diagnostic-unused-variable,clang-analyzer-deadcode.DeadStores)
for (const auto dim : c10::irange(buf_ranks[i])) {
(void)dim;
buf_dims_vec[i].push_back(buf_dims[buf_dims_idx++]);
}
buf_dtypes_vec.push_back(static_cast<c10::ScalarType>(buf_dtypes[i]));
@@ -1151,6 +1151,9 @@ void LLVMCodeGenImpl::visit(LoadPtr v) {
throw std::runtime_error("invalid dtype in Load");
}

// Detect whether the vector mask is all true
bool unmasked_load = true;

// Handle the case where the load is contiguous and unmasked efficiently
auto idx_ramp = to<Ramp>(v->flat_index());
if (idx_ramp) {

@@ -1802,6 +1805,9 @@ void LLVMCodeGenImpl::visit(IntrinsicsPtr v) {
}

void LLVMCodeGenImpl::visit(ExternalCallPtr v) {
constexpr int max_buffers = 10;
constexpr int max_dimensions = 40;

auto& func_registry = getNNCFunctionRegistry();
if (!func_registry.count(v->func_name())) {
throw unimplemented_lowering(v);
@@ -19,7 +19,7 @@ enum class TrainingMode {
// We pin IR version instead of using onnx::IR_VERSION so that the
// test_operators.py will be more stable. Only bump it when
// necessary.
constexpr size_t IR_VERSION = 7;
constexpr const char* PRODUCER_VERSION = "1.11";
static const size_t IR_VERSION = 7;
static const char* PRODUCER_VERSION = "1.11";
} // namespace onnx
} // namespace torch
@@ -236,8 +236,8 @@ void THP_encodeInt16Buffer(uint8_t* dst, const int16_t* src, THPByteOrder order,
{
memcpy(dst, src, sizeof(int16_t) * len);
if (order != THP_nativeByteOrder()) {
// NOLINTNEXTLINE(clang-analyzer-deadcode.DeadStores,clang-diagnostic-unused-variable)
for(const auto i : c10::irange(len)) {
(void)i;
swapBytes16(dst);
dst += sizeof(int16_t);
}

@@ -248,8 +248,8 @@ void THP_encodeInt32Buffer(uint8_t* dst, const int32_t* src, THPByteOrder order,
{
memcpy(dst, src, sizeof(int32_t) * len);
if (order != THP_nativeByteOrder()) {
// NOLINTNEXTLINE(clang-analyzer-deadcode.DeadStores,clang-diagnostic-unused-variable)
for(const auto i : c10::irange(len)) {
(void)i;
swapBytes32(dst);
dst += sizeof(int32_t);
}

@@ -260,8 +260,8 @@ void THP_encodeInt64Buffer(uint8_t* dst, const int64_t* src, THPByteOrder order,
{
memcpy(dst, src, sizeof(int64_t) * len);
if (order != THP_nativeByteOrder()) {
// NOLINTNEXTLINE(clang-analyzer-deadcode.DeadStores,clang-diagnostic-unused-variable)
for(const auto i : c10::irange(len)) {
(void)i;
swapBytes64(dst);
dst += sizeof(int64_t);
}

@@ -272,8 +272,8 @@ void THP_encodeFloatBuffer(uint8_t* dst, const float* src, THPByteOrder order, s
{
memcpy(dst, src, sizeof(float) * len);
if (order != THP_nativeByteOrder()) {
// NOLINTNEXTLINE(clang-analyzer-deadcode.DeadStores,clang-diagnostic-unused-variable)
for(const auto i : c10::irange(len)) {
(void)i;
swapBytes32(dst);
dst += sizeof(float);
}

@@ -284,8 +284,8 @@ void THP_encodeDoubleBuffer(uint8_t* dst, const double* src, THPByteOrder order,
{
memcpy(dst, src, sizeof(double) * len);
if (order != THP_nativeByteOrder()) {
// NOLINTNEXTLINE(clang-analyzer-deadcode.DeadStores,clang-diagnostic-unused-variable)
for(const auto i : c10::irange(len)) {
(void)i;
swapBytes64(dst);
dst += sizeof(double);
}
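All five THP_encode*Buffer hunks above share one shape: copy the source buffer with memcpy, and only when the requested byte order differs from the native one, walk the copy and byte-swap each element in place. A hedged sketch of the 16-bit case with a hand-rolled swap helper (the real swapBytes16/32/64 helpers are assumed to do the equivalent):

#include <cstdint>
#include <cstring>
#include <cstddef>

// Illustrative stand-in for a swapBytes16-style helper: reverse the two bytes
// of one 16-bit value in place.
static void swap_bytes_16(uint8_t* p) {
  uint8_t tmp = p[0];
  p[0] = p[1];
  p[1] = tmp;
}

// Mirrors the loop shape above: memcpy first, then swap element by element
// only when the destination byte order is not the native one.
void encode_int16(uint8_t* dst, const int16_t* src, bool needs_swap, size_t len) {
  std::memcpy(dst, src, sizeof(int16_t) * len);
  if (!needs_swap) {
    return;
  }
  for (size_t i = 0; i < len; ++i) {
    swap_bytes_16(dst);
    dst += sizeof(int16_t);
  }
}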