Use std::vector or c10::SmallVector instead of C-style arrays (#160959)

As the title states: replace `std::unique_ptr<T[]>` buffers with `std::vector`, `std::string`, or `c10::SmallVector`. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160959 Approved by: https://github.com/Skylion007
2025-12-06 00:20:18 +01:00 · 2025-08-19 18:45:58 +08:00 · 2025-08-19 18:45:58 +08:00 · 0f801a510f
commit 0f801a510f
parent 576a0e64ed
6 changed files with 38 additions and 37 deletions
--- a/aten/src/ATen/native/cpu/SoftMaxKernel.cpp
+++ b/aten/src/ATen/native/cpu/SoftMaxKernel.cpp
@ -7,6 +7,7 @@
 #include <algorithm>
 #include <iterator>
 #include <numeric>
+#include <vector>

 #include <ATen/Dispatch.h>
 #include <ATen/Parallel.h>
@ -647,10 +648,10 @@ _vec_softmax(
  parallel_for(
      0, outer_size * inner_size, 0, [&](int64_t begin, int64_t end) {
        int64_t idx = begin;
-        auto temp_vec_input = std::make_unique<float[]>(dim_size * vectorized_step);
-        auto temp_vec_output = std::make_unique<float[]>(dim_size * vectorized_step);
-        float* temp_vec_input_data = temp_vec_input.get();
-        float* temp_vec_output_data = temp_vec_output.get();
+        std::vector<float> temp_vec_input(dim_size * vectorized_step);
+        std::vector<float> temp_vec_output(dim_size * vectorized_step);
+        float* temp_vec_input_data = temp_vec_input.data();
+        float* temp_vec_output_data = temp_vec_output.data();
        while (idx < end) {
          int64_t outer_idx = idx / inner_size;
          int64_t inner_idx = idx % inner_size;
--- a/aten/src/ATen/native/cudnn/Conv_v7.cpp
+++ b/aten/src/ATen/native/cudnn/Conv_v7.cpp
@ -285,7 +285,7 @@ struct algorithm_search<cudnnConvolutionFwdAlgoPerf_t> {
        sizeof(algos) / sizeof(algos[0]) == num_algos,
        "Missing cuDNN convolution forward algorithms");
    int perf_count;
-    auto perf_results = std::make_unique<perf_t[]>(num_algos);
+    c10::SmallVector<perf_t, CUDNN_CONVOLUTION_FWD_ALGO_COUNT> perf_results;
    if (!benchmark) {
      AT_CUDNN_CHECK_WITH_SHAPES(
          cudnnGetConvolutionForwardAlgorithm_v7(
@ -296,7 +296,7 @@ struct algorithm_search<cudnnConvolutionFwdAlgoPerf_t> {
              args.odesc.desc(),
              num_algos,
              &perf_count,
-              perf_results.get()),
+              perf_results.data()),
          args);
    } else {
      size_t max_ws_size = getMaxWorkspaceSize(args, algos, num_algos);
@ -314,7 +314,7 @@ struct algorithm_search<cudnnConvolutionFwdAlgoPerf_t> {
              args.output.data_ptr(),
              num_algos,
              &perf_count,
-              perf_results.get(),
+              perf_results.data(),
              ws.data,
              ws.size),
          args);
@ -324,7 +324,7 @@ struct algorithm_search<cudnnConvolutionFwdAlgoPerf_t> {
      // memory, e.g. a few GBs.
      c10::cuda::CUDACachingAllocator::emptyCache();
    }
-    return getValidAlgorithms<perf_t>(perf_results.get(), args, perf_count);
+    return getValidAlgorithms<perf_t>(perf_results.data(), args, perf_count);
  }

  static void getWorkspaceSize(
@ -369,7 +369,8 @@ struct algorithm_search<cudnnConvolutionBwdDataAlgoPerf_t> {
        sizeof(algos) / sizeof(algos[0]) == num_algos,
        "Missing cuDNN convolution backward data algorithms.");
    int perf_count;
-    auto perf_results = std::make_unique<perf_t[]>(num_algos);
+    c10::SmallVector<perf_t, CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT>
+        perf_results;
    if (!benchmark) {
      AT_CUDNN_CHECK_WITH_SHAPES(
          cudnnGetConvolutionBackwardDataAlgorithm_v7(
@ -380,7 +381,7 @@ struct algorithm_search<cudnnConvolutionBwdDataAlgoPerf_t> {
              args.idesc.desc(),
              num_algos,
              &perf_count,
-              perf_results.get()),
+              perf_results.data()),
          args);
    } else {
      size_t max_ws_size = getMaxWorkspaceSize(args, algos, num_algos);
@ -398,7 +399,7 @@ struct algorithm_search<cudnnConvolutionBwdDataAlgoPerf_t> {
              args.input.data_ptr(),
              num_algos,
              &perf_count,
-              perf_results.get(),
+              perf_results.data(),
              ws.data,
              ws.size),
          args);
@ -408,7 +409,7 @@ struct algorithm_search<cudnnConvolutionBwdDataAlgoPerf_t> {
      // memory, e.g. a few GBs.
      c10::cuda::CUDACachingAllocator::emptyCache();
    }
-    return getValidAlgorithms<perf_t>(perf_results.get(), args, perf_count);
+    return getValidAlgorithms<perf_t>(perf_results.data(), args, perf_count);
  }

  static void getWorkspaceSize(
@ -456,7 +457,8 @@ struct algorithm_search<cudnnConvolutionBwdFilterAlgoPerf_t> {
    static_assert(
        sizeof(algos) / sizeof(algos[0]) == num_algos,
        "Missing cuDNN convolution backward filter algorithms.");
-    auto perf_results = std::make_unique<perf_t[]>(num_algos);
+    c10::SmallVector<perf_t, CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT>
+        perf_results;
    int perf_count;
    if (!benchmark) {
      AT_CUDNN_CHECK_WITH_SHAPES(
@ -468,7 +470,7 @@ struct algorithm_search<cudnnConvolutionBwdFilterAlgoPerf_t> {
              args.wdesc.desc(),
              num_algos,
              &perf_count,
-              perf_results.get()),
+              perf_results.data()),
          args);
    } else {
      size_t max_ws_size = getMaxWorkspaceSize(args, algos, num_algos);
@ -486,7 +488,7 @@ struct algorithm_search<cudnnConvolutionBwdFilterAlgoPerf_t> {
              args.weight.data_ptr(),
              num_algos,
              &perf_count,
-              perf_results.get(),
+              perf_results.data(),
              ws.data,
              ws.size),
          args);
@ -496,7 +498,7 @@ struct algorithm_search<cudnnConvolutionBwdFilterAlgoPerf_t> {
      // memory, e.g. a few GBs.
      c10::cuda::CUDACachingAllocator::emptyCache();
    }
-    return getValidAlgorithms<perf_t>(perf_results.get(), args, perf_count);
+    return getValidAlgorithms<perf_t>(perf_results.data(), args, perf_count);
  }

  static void getWorkspaceSize(
--- a/aten/src/ATen/native/quantized/cpu/UpSampleNearest2d.cpp
+++ b/aten/src/ATen/native/quantized/cpu/UpSampleNearest2d.cpp
@ -17,6 +17,7 @@
 #include <c10/util/irange.h>

 #include <cstring>
+#include <vector>


 namespace at::native {
@ -53,8 +54,8 @@ static void upsample_nearest2d_out_frame(
    return;
  }

-  auto input_offset_arr = std::make_unique<int64_t[]>(output_width);
-  int64_t* input_offset = input_offset_arr.get();
+  std::vector<int64_t> input_offset_arr(output_width);
+  int64_t* input_offset = input_offset_arr.data();

  for (const auto w2 : c10::irange(output_width)) {
    const int64_t w1 = nn_compute_source_index_fn(width_scale, w2, input_width);
--- a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu
+++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu
@ -800,7 +800,7 @@ Tensor& bmm_out_sparse_cuda(const SparseTensor& self, const Tensor& mat2, Tensor
  Tensor indices_dim1 = indices[1].to(ScalarType::Int);
  Tensor indices_dim2 = indices[2].to(ScalarType::Int);

-  auto mat_el_end_indices_host = std::make_unique<int64_t[]>(num_matrices);
+  std::vector<int64_t> mat_el_end_indices_host(num_matrices);

  {
    auto& allocator = *::c10::cuda::CUDACachingAllocator::get();
@ -809,14 +809,14 @@ Tensor& bmm_out_sparse_cuda(const SparseTensor& self, const Tensor& mat2, Tensor

    search_end_matrix_indices(mat_el_end_indices_device, num_matrices, indices_dim0);
    AT_CUDA_CHECK(cudaMemcpy(
-      mat_el_end_indices_host.get(),
+      mat_el_end_indices_host.data(),
      mat_el_end_indices_device,
      num_matrices*sizeof(int64_t),
      cudaMemcpyDeviceToHost
    ));
  }
  // Need a pointer to an array to access within a lambda
-  int64_t* mat_el_end_indices = &mat_el_end_indices_host[0];
+  int64_t* mat_el_end_indices = mat_el_end_indices_host.data();

  Scalar beta = 0;
  Scalar alpha = 1;
--- a/torch/csrc/distributed/c10d/ProcessGroupGloo.cpp
+++ b/torch/csrc/distributed/c10d/ProcessGroupGloo.cpp
@ -528,16 +528,16 @@ std::shared_ptr<::gloo::transport::Device> ProcessGroupGloo::
  // use. Note: if the hostname does not resolve to an address (e.g.
  // because of misconfigured /etc/hosts file), this will not work.
  const auto hostNameMax = sysconf(_SC_HOST_NAME_MAX);
-  auto hostname = std::make_unique<char[]>(hostNameMax);
-  auto rv = gethostname(hostname.get(), hostNameMax);
+  std::string hostname(hostNameMax, '\0');
+  auto rv = gethostname(hostname.data(), hostNameMax);
  if (rv != 0) {
    C10_THROW_ERROR(DistBackendError, c10::utils::str_error(errno));
  }

  // Use this machine's hostname if it resolves to an address.
-  if (doesHostnameResolveToUsableAddress(hostname.get())) {
+  if (doesHostnameResolveToUsableAddress(hostname.data())) {
    return ::c10d::GlooDeviceFactory::makeDeviceForHostname(
-        hostname.get(), lazyInit);
+        hostname.data(), lazyInit);
  }

  // Otherwise, use the loopback address.
--- a/torch/csrc/serialization.cpp
+++ b/torch/csrc/serialization.cpp
@ -351,16 +351,14 @@ c10::intrusive_ptr<c10::StorageImpl> THPStorage_readFileRaw(
        _storage_nbytes);
  }

-  // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
-  std::unique_ptr<char[]> cpu_data;
+  std::string cpu_data;

  uint8_t* data{};
  if (storage->device_type() == at::kCPU) {
    data = static_cast<uint8_t*>(storage->mutable_data());
  } else {
-    // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
-    cpu_data = std::make_unique<char[]>(nbytes);
-    data = (uint8_t*)cpu_data.get();
+    cpu_data.resize(nbytes);
+    data = (uint8_t*)cpu_data.data();
  }

  // fast track for bytes and little endian
@ -370,24 +368,23 @@ c10::intrusive_ptr<c10::StorageImpl> THPStorage_readFileRaw(
    doRead(file, data, storage->nbytes());
  } else {
    int64_t buffer_size = std::min(size, (int64_t)5000);
-    // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
-    std::unique_ptr<uint8_t[]> le_buffer(
-        new uint8_t[buffer_size * element_size]);
+    std::vector<uint8_t> le_buffer;
+    le_buffer.resize(buffer_size * element_size);

    for (int64_t i = 0; i < size; i += buffer_size) {
      size_t to_convert = std::min(size - i, buffer_size);
-      doRead(file, le_buffer.get(), element_size * to_convert);
+      doRead(file, le_buffer.data(), element_size * to_convert);

      // NOLINTNEXTLINE(bugprone-branch-clone)
      if (element_size == 2) {
        torch::utils::THP_decodeBuffer(
-            (int16_t*)data + i, le_buffer.get(), true, to_convert);
+            (int16_t*)data + i, le_buffer.data(), true, to_convert);
      } else if (element_size == 4) {
        torch::utils::THP_decodeBuffer(
-            (int32_t*)data + i, le_buffer.get(), true, to_convert);
+            (int32_t*)data + i, le_buffer.data(), true, to_convert);
      } else if (element_size == 8) {
        torch::utils::THP_decodeBuffer(
-            (int64_t*)data + i, le_buffer.get(), true, to_convert);
+            (int64_t*)data + i, le_buffer.data(), true, to_convert);
      }
    }
  }