Using std::vector or c10::SmallVector instead of CArray (#160959)
As the title states.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160959
Approved by: https://github.com/Skylion007
parent 576a0e64ed
commit 0f801a510f
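The change applies one pattern throughout: buffers previously allocated as C arrays or via std::make_unique<T[]>(n) become std::vector<T> when the size is only known at runtime, or c10::SmallVector<T, N> when a compile-time upper bound exists. A minimal sketch of the before/after shapes (names are illustrative, not taken from the diff):

#include <cstdint>
#include <memory>
#include <vector>

void before(int64_t n) {
  // Old style: heap array behind a unique_ptr; the size travels separately.
  auto buf = std::make_unique<float[]>(n);
  float* p = buf.get();  // raw pointer handed to C-style APIs
  (void)p;
}

void after(int64_t n) {
  // New style: the vector owns the buffer and knows its size;
  // .data() yields the same raw pointer .get() did above.
  std::vector<float> buf(n);
  float* p = buf.data();
  (void)p;
}

The payoff is bounds awareness, fewer NOLINT suppressions for C-array lints, and, where c10::SmallVector applies, the chance to skip the heap entirely.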
@@ -7,6 +7,7 @@
 #include <algorithm>
 #include <iterator>
 #include <numeric>
+#include <vector>
 
 #include <ATen/Dispatch.h>
 #include <ATen/Parallel.h>
@@ -647,10 +648,10 @@ _vec_softmax(
   parallel_for(
       0, outer_size * inner_size, 0, [&](int64_t begin, int64_t end) {
         int64_t idx = begin;
-        auto temp_vec_input = std::make_unique<float[]>(dim_size * vectorized_step);
-        auto temp_vec_output = std::make_unique<float[]>(dim_size * vectorized_step);
-        float* temp_vec_input_data = temp_vec_input.get();
-        float* temp_vec_output_data = temp_vec_output.get();
+        std::vector<float> temp_vec_input(dim_size * vectorized_step);
+        std::vector<float> temp_vec_output(dim_size * vectorized_step);
+        float* temp_vec_input_data = temp_vec_input.data();
+        float* temp_vec_output_data = temp_vec_output.data();
         while (idx < end) {
           int64_t outer_idx = idx / inner_size;
           int64_t inner_idx = idx % inner_size;
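One behavioral detail worth confirming for this hunk: std::make_unique<float[]>(n) value-initializes (zero-fills) its elements, and the std::vector<float>(n) constructor does the same, so the per-worker scratch buffers declared inside the parallel_for lambda start out identical under both spellings. A standalone check:

#include <cassert>
#include <cstdint>
#include <memory>
#include <vector>

int main() {
  constexpr int64_t n = 16;
  auto old_buf = std::make_unique<float[]>(n);  // value-initialized: all zeros
  std::vector<float> new_buf(n);                // also value-initialized
  for (int64_t i = 0; i < n; ++i) {
    assert(old_buf[i] == 0.0f && new_buf[i] == 0.0f);
  }
  return 0;
}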
@@ -285,7 +285,7 @@ struct algorithm_search<cudnnConvolutionFwdAlgoPerf_t> {
         sizeof(algos) / sizeof(algos[0]) == num_algos,
         "Missing cuDNN convolution forward algorithms");
     int perf_count;
-    auto perf_results = std::make_unique<perf_t[]>(num_algos);
+    c10::SmallVector<perf_t, CUDNN_CONVOLUTION_FWD_ALGO_COUNT> perf_results;
     if (!benchmark) {
       AT_CUDNN_CHECK_WITH_SHAPES(
           cudnnGetConvolutionForwardAlgorithm_v7(

@@ -296,7 +296,7 @@ struct algorithm_search<cudnnConvolutionFwdAlgoPerf_t> {
               args.odesc.desc(),
               num_algos,
               &perf_count,
-              perf_results.get()),
+              perf_results.data()),
           args);
     } else {
       size_t max_ws_size = getMaxWorkspaceSize(args, algos, num_algos);

@@ -314,7 +314,7 @@ struct algorithm_search<cudnnConvolutionFwdAlgoPerf_t> {
               args.output.data_ptr(),
               num_algos,
               &perf_count,
-              perf_results.get(),
+              perf_results.data(),
               ws.data,
               ws.size),
           args);

@@ -324,7 +324,7 @@ struct algorithm_search<cudnnConvolutionFwdAlgoPerf_t> {
       // memory, e.g. a few GBs.
       c10::cuda::CUDACachingAllocator::emptyCache();
     }
-    return getValidAlgorithms<perf_t>(perf_results.get(), args, perf_count);
+    return getValidAlgorithms<perf_t>(perf_results.data(), args, perf_count);
   }
 
   static void getWorkspaceSize(

@@ -369,7 +369,8 @@ struct algorithm_search<cudnnConvolutionBwdDataAlgoPerf_t> {
         sizeof(algos) / sizeof(algos[0]) == num_algos,
         "Missing cuDNN convolution backward data algorithms.");
     int perf_count;
-    auto perf_results = std::make_unique<perf_t[]>(num_algos);
+    c10::SmallVector<perf_t, CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT>
+        perf_results;
     if (!benchmark) {
       AT_CUDNN_CHECK_WITH_SHAPES(
           cudnnGetConvolutionBackwardDataAlgorithm_v7(

@@ -380,7 +381,7 @@ struct algorithm_search<cudnnConvolutionBwdDataAlgoPerf_t> {
               args.idesc.desc(),
               num_algos,
               &perf_count,
-              perf_results.get()),
+              perf_results.data()),
           args);
     } else {
       size_t max_ws_size = getMaxWorkspaceSize(args, algos, num_algos);

@@ -398,7 +399,7 @@ struct algorithm_search<cudnnConvolutionBwdDataAlgoPerf_t> {
               args.input.data_ptr(),
               num_algos,
               &perf_count,
-              perf_results.get(),
+              perf_results.data(),
               ws.data,
               ws.size),
           args);

@@ -408,7 +409,7 @@ struct algorithm_search<cudnnConvolutionBwdDataAlgoPerf_t> {
       // memory, e.g. a few GBs.
       c10::cuda::CUDACachingAllocator::emptyCache();
     }
-    return getValidAlgorithms<perf_t>(perf_results.get(), args, perf_count);
+    return getValidAlgorithms<perf_t>(perf_results.data(), args, perf_count);
   }
 
   static void getWorkspaceSize(

@@ -456,7 +457,8 @@ struct algorithm_search<cudnnConvolutionBwdFilterAlgoPerf_t> {
     static_assert(
         sizeof(algos) / sizeof(algos[0]) == num_algos,
         "Missing cuDNN convolution backward filter algorithms.");
-    auto perf_results = std::make_unique<perf_t[]>(num_algos);
+    c10::SmallVector<perf_t, CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT>
+        perf_results;
     int perf_count;
     if (!benchmark) {
       AT_CUDNN_CHECK_WITH_SHAPES(

@@ -468,7 +470,7 @@ struct algorithm_search<cudnnConvolutionBwdFilterAlgoPerf_t> {
               args.wdesc.desc(),
               num_algos,
               &perf_count,
-              perf_results.get()),
+              perf_results.data()),
           args);
     } else {
       size_t max_ws_size = getMaxWorkspaceSize(args, algos, num_algos);

@@ -486,7 +488,7 @@ struct algorithm_search<cudnnConvolutionBwdFilterAlgoPerf_t> {
               args.weight.data_ptr(),
               num_algos,
               &perf_count,
-              perf_results.get(),
+              perf_results.data(),
               ws.data,
               ws.size),
           args);

@@ -496,7 +498,7 @@ struct algorithm_search<cudnnConvolutionBwdFilterAlgoPerf_t> {
       // memory, e.g. a few GBs.
       c10::cuda::CUDACachingAllocator::emptyCache();
     }
-    return getValidAlgorithms<perf_t>(perf_results.get(), args, perf_count);
+    return getValidAlgorithms<perf_t>(perf_results.data(), args, perf_count);
   }
 
   static void getWorkspaceSize(
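In the cuDNN hunks the element count has a compile-time ceiling (the CUDNN_CONVOLUTION_*_ALGO_COUNT enumerators), which is what makes c10::SmallVector<T, N> attractive here: up to N elements live in inline storage, so the algorithm-search path no longer heap-allocates at all. A rough sketch of the usage, assuming the PyTorch source tree is on the include path (the perf struct and count below are stand-ins, not the real cuDNN types):

#include <c10/util/SmallVector.h>

struct PerfStub {  // stand-in for cudnnConvolutionFwdAlgoPerf_t
  int algo;
  float time;
};
constexpr int kAlgoCount = 8;  // stand-in for CUDNN_CONVOLUTION_FWD_ALGO_COUNT

void query_algorithms() {
  // Inline capacity of kAlgoCount elements: no heap allocation on this path.
  c10::SmallVector<PerfStub, kAlgoCount> perf_results;
  // The cuDNN *_v7 query APIs fill entries through a raw pointer, which
  // .data() supplies just as unique_ptr::get() did before this change.
  PerfStub* raw = perf_results.data();
  (void)raw;
}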
@@ -17,6 +17,7 @@
 #include <c10/util/irange.h>
 
 #include <cstring>
+#include <vector>
 
 
 namespace at::native {

@@ -53,8 +54,8 @@ static void upsample_nearest2d_out_frame(
     return;
   }
 
-  auto input_offset_arr = std::make_unique<int64_t[]>(output_width);
-  int64_t* input_offset = input_offset_arr.get();
+  std::vector<int64_t> input_offset_arr(output_width);
+  int64_t* input_offset = input_offset_arr.data();
 
   for (const auto w2 : c10::irange(output_width)) {
     const int64_t w1 = nn_compute_source_index_fn(width_scale, w2, input_width);
@@ -800,7 +800,7 @@ Tensor& bmm_out_sparse_cuda(const SparseTensor& self, const Tensor& mat2, Tensor
   Tensor indices_dim1 = indices[1].to(ScalarType::Int);
   Tensor indices_dim2 = indices[2].to(ScalarType::Int);
 
-  auto mat_el_end_indices_host = std::make_unique<int64_t[]>(num_matrices);
+  std::vector<int64_t> mat_el_end_indices_host(num_matrices);
 
   {
     auto& allocator = *::c10::cuda::CUDACachingAllocator::get();

@@ -809,14 +809,14 @@ Tensor& bmm_out_sparse_cuda(const SparseTensor& self, const Tensor& mat2, Tensor
 
     search_end_matrix_indices(mat_el_end_indices_device, num_matrices, indices_dim0);
     AT_CUDA_CHECK(cudaMemcpy(
-      mat_el_end_indices_host.get(),
+      mat_el_end_indices_host.data(),
       mat_el_end_indices_device,
       num_matrices*sizeof(int64_t),
       cudaMemcpyDeviceToHost
     ));
   }
   // Need a pointer to an array to access within a lambda
-  int64_t* mat_el_end_indices = &mat_el_end_indices_host[0];
+  int64_t* mat_el_end_indices = mat_el_end_indices_host.data();
 
   Scalar beta = 0;
   Scalar alpha = 1;
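Two small points in this hunk: cudaMemcpy is indifferent to who owns the destination bytes, so copying into vector::data() is byte-for-byte what the old copy into unique_ptr::get() did, and the idiomatic .data() also stays well-defined when the container is empty, unlike the &v[0] spelling it replaces. A minimal sketch, assuming a CUDA toolchain:

#include <cuda_runtime.h>
#include <cstdint>
#include <vector>

int main() {
  constexpr int64_t n = 4;
  int64_t* device_buf = nullptr;
  cudaMalloc(&device_buf, n * sizeof(int64_t));
  cudaMemset(device_buf, 0, n * sizeof(int64_t));

  // Device-to-host copy lands in the vector's storage exactly as it
  // landed in the old heap array; .data() supplies the raw pointer.
  std::vector<int64_t> host(n);
  cudaMemcpy(host.data(), device_buf, n * sizeof(int64_t),
             cudaMemcpyDeviceToHost);
  cudaFree(device_buf);
  return 0;
}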
@@ -528,16 +528,16 @@ std::shared_ptr<::gloo::transport::Device> ProcessGroupGloo::
   // use. Note: if the hostname does not resolve to an address (e.g.
   // because of misconfigured /etc/hosts file), this will not work.
   const auto hostNameMax = sysconf(_SC_HOST_NAME_MAX);
-  auto hostname = std::make_unique<char[]>(hostNameMax);
-  auto rv = gethostname(hostname.get(), hostNameMax);
+  std::string hostname(hostNameMax, '\0');
+  auto rv = gethostname(hostname.data(), hostNameMax);
   if (rv != 0) {
     C10_THROW_ERROR(DistBackendError, c10::utils::str_error(errno));
   }
 
   // Use this machine's hostname if it resolves to an address.
-  if (doesHostnameResolveToUsableAddress(hostname.get())) {
+  if (doesHostnameResolveToUsableAddress(hostname.data())) {
     return ::c10d::GlooDeviceFactory::makeDeviceForHostname(
-        hostname.get(), lazyInit);
+        hostname.data(), lazyInit);
   }
 
   // Otherwise, use the loopback address.
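Since C++17, std::string::data() returns a non-const char*, so a string pre-sized with fill characters is a legitimate writable buffer for C APIs like gethostname, and the downstream consumers that take a char* simply stop at the NUL terminator the API wrote. A standalone sketch, assuming a POSIX system:

#include <unistd.h>   // gethostname, sysconf
#include <cstdio>
#include <cstring>
#include <string>

int main() {
  const long max_len = sysconf(_SC_HOST_NAME_MAX);
  if (max_len <= 0) {
    return 1;
  }
  // Pre-size the string so .data() points at max_len writable bytes.
  std::string hostname(static_cast<size_t>(max_len), '\0');
  if (gethostname(hostname.data(), hostname.size()) != 0) {
    std::perror("gethostname");
    return 1;
  }
  // Optional tidy-up: shrink to the NUL the C API wrote. The diff above
  // skips this, since its consumers take char* and stop at the NUL anyway.
  hostname.resize(std::strlen(hostname.c_str()));
  std::printf("hostname: %s\n", hostname.c_str());
  return 0;
}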
@@ -351,16 +351,14 @@ c10::intrusive_ptr<c10::StorageImpl> THPStorage_readFileRaw(
         _storage_nbytes);
   }
 
-  // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
-  std::unique_ptr<char[]> cpu_data;
+  std::string cpu_data;
 
   uint8_t* data{};
   if (storage->device_type() == at::kCPU) {
     data = static_cast<uint8_t*>(storage->mutable_data());
   } else {
-    // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
-    cpu_data = std::make_unique<char[]>(nbytes);
-    data = (uint8_t*)cpu_data.get();
+    cpu_data.resize(nbytes);
+    data = (uint8_t*)cpu_data.data();
   }
 
   // fast track for bytes and little endian

@@ -370,24 +368,23 @@ c10::intrusive_ptr<c10::StorageImpl> THPStorage_readFileRaw(
     doRead(file, data, storage->nbytes());
   } else {
     int64_t buffer_size = std::min(size, (int64_t)5000);
-    // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
-    std::unique_ptr<uint8_t[]> le_buffer(
-        new uint8_t[buffer_size * element_size]);
+    std::vector<uint8_t> le_buffer;
+    le_buffer.resize(buffer_size * element_size);
 
     for (int64_t i = 0; i < size; i += buffer_size) {
       size_t to_convert = std::min(size - i, buffer_size);
-      doRead(file, le_buffer.get(), element_size * to_convert);
+      doRead(file, le_buffer.data(), element_size * to_convert);
 
       // NOLINTNEXTLINE(bugprone-branch-clone)
       if (element_size == 2) {
         torch::utils::THP_decodeBuffer(
-            (int16_t*)data + i, le_buffer.get(), true, to_convert);
+            (int16_t*)data + i, le_buffer.data(), true, to_convert);
       } else if (element_size == 4) {
         torch::utils::THP_decodeBuffer(
-            (int32_t*)data + i, le_buffer.get(), true, to_convert);
+            (int32_t*)data + i, le_buffer.data(), true, to_convert);
       } else if (element_size == 8) {
         torch::utils::THP_decodeBuffer(
-            (int64_t*)data + i, le_buffer.get(), true, to_convert);
+            (int64_t*)data + i, le_buffer.data(), true, to_convert);
       }
     }
   }
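Note that the default-constructed std::string cpu_data stays empty (no allocation) on the CPU path and resize(nbytes) happens only on the non-CPU branch, matching the old lazy make_unique. The second hunk keeps the chunked read-and-byteswap loop intact and only changes who owns the staging buffer. A self-contained miniature of that pattern (doRead and torch::utils::THP_decodeBuffer are PyTorch internals; the helpers below only mimic their roles):

#include <algorithm>
#include <cstdint>
#include <cstring>
#include <vector>

// Illustrative stand-in for THP_decodeBuffer: assemble n little-endian
// 16-bit values from src into dst, independent of host byte order.
static void decode_le16(int16_t* dst, const uint8_t* src, size_t n) {
  for (size_t i = 0; i < n; ++i) {
    dst[i] = static_cast<int16_t>(src[2 * i] | (src[2 * i + 1] << 8));
  }
}

int main() {
  const int64_t size = 10000;  // total element count
  const int64_t element_size = 2;
  std::vector<int16_t> out(size);
  std::vector<uint8_t> input(size * element_size, 0);  // fake file contents
  const uint8_t* file_pos = input.data();

  // Same shape as the diff: one bounded staging vector, reused per chunk.
  int64_t buffer_size = std::min<int64_t>(size, 5000);
  std::vector<uint8_t> le_buffer(buffer_size * element_size);

  for (int64_t i = 0; i < size; i += buffer_size) {
    size_t to_convert = static_cast<size_t>(std::min(size - i, buffer_size));
    // Stand-in for doRead: copy the next chunk into the staging buffer.
    std::memcpy(le_buffer.data(), file_pos, element_size * to_convert);
    file_pos += element_size * to_convert;
    decode_le16(out.data() + i, le_buffer.data(), to_convert);
  }
  return 0;
}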