From 6da0e7f84b7802a1e8dfcb67a656fdda0403b88c Mon Sep 17 00:00:00 2001
From: cyy <cyyever@outlook.com>
Date: Fri, 26 Jan 2024 13:33:24 +0000
Subject: [PATCH] [Clang-tidy header][17/N] Apply clang-tidy on headers in
 torch/csrc/cuda (#117829)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/117829
Approved by: https://github.com/albanD
---
 torch/csrc/cuda/CUDAPluggableAllocator.cpp |  1 +
 torch/csrc/cuda/Module.cpp                 | 19 ++++++------
 torch/csrc/cuda/Module.h                   |  1 -
 torch/csrc/cuda/Stream.h                   |  1 +
 torch/csrc/cuda/comm.cpp                   | 19 +++++++-----
 torch/csrc/cuda/device_set.h               |  1 +
 torch/csrc/cuda/memory_snapshot.cpp        |  4 +--
 torch/csrc/cuda/memory_snapshot.h          |  2 +-
 torch/csrc/cuda/python_nccl.cpp            | 34 ++++++++++++----------
 torch/csrc/utils/pybind.h                  |  3 --
 10 files changed, 45 insertions(+), 40 deletions(-)
diff --git a/torch/csrc/cuda/CUDAPluggableAllocator.cpp b/torch/csrc/cuda/CUDAPluggableAllocator.cpp
index 34e00674ed8..4a4d8de7bcf 100644
--- a/torch/csrc/cuda/CUDAPluggableAllocator.cpp
+++ b/torch/csrc/cuda/CUDAPluggableAllocator.cpp
@@ -100,6 +100,7 @@ c10::DataPtr CUDAPluggableAllocator::allocate(size_t size) const {
   cudaStream_t stream =
       c10::cuda::getCurrentCUDAStream(static_cast<c10::DeviceIndex>(device));
   void* r =
+      // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
       const_cast<CUDAPluggableAllocator*>(this)->malloc(size, device, stream);
   c10::DataPtr data_ptr = {
       r,
diff --git a/torch/csrc/cuda/Module.cpp b/torch/csrc/cuda/Module.cpp
index 08a6137633e..0980f78223e 100644
--- a/torch/csrc/cuda/Module.cpp
+++ b/torch/csrc/cuda/Module.cpp
@@ -80,17 +80,13 @@ static void poison_fork() {
 // CUDA management methods
 ////////////////////////////////////////////////////////////////////////////////
 
-void THCPModule_setDevice(int device) {
-  c10::cuda::set_device(static_cast<c10::DeviceIndex>(device));
-}
-
 PyObject* THCPModule_setDevice_wrap(PyObject* self, PyObject* arg) {
   HANDLE_TH_ERRORS
   TORCH_CHECK(THPUtils_checkLong(arg), "invalid argument to setDevice");
-  int64_t device = THPUtils_unpackLong(arg);
+  auto device = THPUtils_unpackLong(arg);
 
   torch::utils::cuda_lazy_init();
-  THCPModule_setDevice(device);
+  c10::cuda::set_device(static_cast<c10::DeviceIndex>(device));
 
   Py_RETURN_NONE;
   END_HANDLE_TH_ERRORS
@@ -259,6 +255,7 @@ PyObject* THCPModule_setStream_wrap(
           args,
           kwargs,
           "|LLL",
+          // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
           const_cast<char**>(kwlist),
           &stream_id,
           &device_index,
@@ -266,11 +263,13 @@ PyObject* THCPModule_setStream_wrap(
   }
 
   auto stream = at::cuda::CUDAStream::unpack3(
-      stream_id, device_index, static_cast<c10::DeviceType>(device_type));
+      stream_id,
+      static_cast<c10::DeviceIndex>(device_index),
+      static_cast<c10::DeviceType>(device_type));
 
   auto device = c10::cuda::current_device();
   if (device != stream.device_index()) {
-    THCPModule_setDevice(stream.device_index());
+    c10::cuda::set_device(stream.device_index());
   }
   at::cuda::setCurrentCUDAStream(stream);
   Py_RETURN_NONE;
@@ -926,7 +925,7 @@ static void registerCudaDeviceProperties(PyObject* module) {
       static_cast<void (*)(
           c10::optional<std::string>,
           c10::optional<std::string>,
-          std::string,
+          const std::string&,
           size_t)>(torch::cuda::_record_memory_history));
 
   m.def("_cuda_isHistoryEnabled", []() {
@@ -1211,7 +1210,7 @@ static void registerCudaPluggableAllocator(PyObject* module) {
           }
         }
         auto delta = c10::cuda::CUDACachingAllocator::setCheckpointPoolState(
-            device, pps);
+            device, std::move(pps));
         auto& freed_pointers = delta.ptrs_freed;
 
         std::unordered_set<void*> allocd_set;
diff --git a/torch/csrc/cuda/Module.h b/torch/csrc/cuda/Module.h
index 23d9079f146..0c89e4bc65f 100644
--- a/torch/csrc/cuda/Module.h
+++ b/torch/csrc/cuda/Module.h
@@ -1,7 +1,6 @@
 #ifndef THCP_CUDA_MODULE_INC
 #define THCP_CUDA_MODULE_INC
 
-void THCPModule_setDevice(int idx);
 PyObject* THCPModule_getDevice_wrap(PyObject* self);
 PyObject* THCPModule_setDevice_wrap(PyObject* self, PyObject* arg);
 PyObject* THCPModule_getDeviceName_wrap(PyObject* self, PyObject* arg);
diff --git a/torch/csrc/cuda/Stream.h b/torch/csrc/cuda/Stream.h
index 6175ac2ea03..9b7197d7439 100644
--- a/torch/csrc/cuda/Stream.h
+++ b/torch/csrc/cuda/Stream.h
@@ -5,6 +5,7 @@
 #include <torch/csrc/Stream.h>
 #include <torch/csrc/python_headers.h>
 
+// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
 struct THCPStream : THPStream {
   at::cuda::CUDAStream cuda_stream;
 };
diff --git a/torch/csrc/cuda/comm.cpp b/torch/csrc/cuda/comm.cpp
index 4863f77fd74..c8bbec87cae 100644
--- a/torch/csrc/cuda/comm.cpp
+++ b/torch/csrc/cuda/comm.cpp
@@ -103,8 +103,9 @@ std::vector<Tensor> broadcast(const Tensor& tensor, IntArrayRef devices) {
     if (device != tensor.get_device()) {
       diff_device_dst_tensors.emplace_back(at::empty(
           tensor.sizes(),
-          tensor.options().device(
-              at::Device(DeviceType::CUDA, device)))); // preserve memory format
+          tensor.options().device(at::Device(
+              DeviceType::CUDA,
+              static_cast<DeviceIndex>(device))))); // preserve memory format
     }
   }
   _broadcast_out_impl(tensor, diff_device_dst_tensors);
@@ -178,7 +179,7 @@ tensor_list2d broadcast_coalesced(
     o.reserve(tensors.size());
 
   unique_type_checker type_checker;
-  at::cuda::CUDAGuard device_guard(devices[0]);
+  at::cuda::CUDAGuard device_guard(static_cast<DeviceIndex>(devices[0]));
   for (auto& chunk : torch::utils::take_tensors(tensors, buffer_size)) {
     auto type_id = chunk.type_id();
     type_checker.show(type_id);
@@ -189,7 +190,7 @@ tensor_list2d broadcast_coalesced(
       auto broadcast_values = broadcast(flat_tuple.second, devices);
       results.reserve(devices.size());
       for (size_t i = 1, num_devices = devices.size(); i < num_devices; ++i) {
-        device_guard.set_index(devices[i]);
+        device_guard.set_index(static_cast<DeviceIndex>(devices[i]));
         auto& device_outputs = outputs[i];
         auto& inds = broadcast_indices[i];
         auto& vals = broadcast_values[i];
@@ -203,7 +204,7 @@ tensor_list2d broadcast_coalesced(
       auto results = broadcast(
           torch::utils::flatten_dense_tensors(chunk.tensors), devices);
       for (size_t i = 1, num_devices = devices.size(); i < num_devices; ++i) {
-        device_guard.set_index(devices[i]);
+        device_guard.set_index(static_cast<DeviceIndex>(devices[i]));
         auto& device_outputs = outputs[i];
         for (auto& var :
              torch::utils::unflatten_dense_tensors(results[i], chunk.tensors)) {
@@ -327,10 +328,10 @@ std::vector<at::Tensor> scatter(
         chunk_sizes->size());
   }
   dim = at::maybe_wrap_dim(dim, tensor);
-  // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
   std::vector<at::Tensor> chunks = chunk_sizes
       ? tensor.split_with_sizes(/*split_sizes=*/*chunk_sizes, /*dim=*/dim)
-      : tensor.chunk(/*chunks=*/devices.size(), /*dim=*/dim);
+      : tensor.chunk(
+            /*chunks=*/static_cast<int64_t>(devices.size()), /*dim=*/dim);
   at::cuda::OptionalCUDAStreamGuard cuda_guard;
   for (const auto i : c10::irange(chunks.size())) {
     const auto device_index = static_cast<int16_t>(devices[i]);
@@ -494,7 +495,9 @@ at::Tensor gather(
   at::Device device(DeviceType::CPU);
   if (!destination_index || *destination_index != -1) {
     device = at::Device(
-        DeviceType::CUDA, destination_index ? *destination_index : -1);
+        DeviceType::CUDA,
+        destination_index ? static_cast<DeviceIndex>(*destination_index)
+                          : DeviceIndex(-1));
   }
 
   at::Tensor result =
diff --git a/torch/csrc/cuda/device_set.h b/torch/csrc/cuda/device_set.h
index 82fa34294d3..acd6f6e6b37 100644
--- a/torch/csrc/cuda/device_set.h
+++ b/torch/csrc/cuda/device_set.h
@@ -1,6 +1,7 @@
 #pragma once
 
 #include <bitset>
+#include <cstddef>
 
 namespace torch {
 
diff --git a/torch/csrc/cuda/memory_snapshot.cpp b/torch/csrc/cuda/memory_snapshot.cpp
index 1d6ece35cb1..f1b8e405a40 100644
--- a/torch/csrc/cuda/memory_snapshot.cpp
+++ b/torch/csrc/cuda/memory_snapshot.cpp
@@ -68,7 +68,7 @@ std::vector<IValue> ivalue_symbolize(
     for (const auto& e : t) {
       l.push_back(all_frames.at(e));
     }
-    py_unique_frames.push_back(std::move(l));
+    py_unique_frames.emplace_back(std::move(l));
   }
 
   std::vector<IValue> result;
@@ -132,7 +132,7 @@ static void checkOptionIn(
 void _record_memory_history(
     c10::optional<std::string> enabled,
     c10::optional<std::string> context,
-    std::string stacks,
+    const std::string& stacks,
     size_t max_entries) {
   if (enabled) {
     checkOptionIn(
diff --git a/torch/csrc/cuda/memory_snapshot.h b/torch/csrc/cuda/memory_snapshot.h
index dfffbcb5a88..db39553cb7b 100644
--- a/torch/csrc/cuda/memory_snapshot.h
+++ b/torch/csrc/cuda/memory_snapshot.h
@@ -18,7 +18,7 @@ TORCH_CUDA_CU_API void _record_memory_history(
 TORCH_CUDA_CU_API void _record_memory_history(
     c10::optional<std::string> enabled = "all",
     c10::optional<std::string> context = "all",
-    std::string stacks = "all",
+    const std::string& stacks = "all",
     size_t max_entries = UINT64_MAX);
 
 TORCH_CUDA_CU_API std::string _memory_snapshot_pickled();
diff --git a/torch/csrc/cuda/python_nccl.cpp b/torch/csrc/cuda/python_nccl.cpp
index 2bd497e4721..db6f6c68070 100644
--- a/torch/csrc/cuda/python_nccl.cpp
+++ b/torch/csrc/cuda/python_nccl.cpp
@@ -21,7 +21,7 @@ using namespace torch::cuda::nccl::detail;
 static const char* COMM_CAPSULE_NAME = "torch.cuda.nccl.Communicator";
 
 PyObject* THCPModule_nccl_version(PyObject* self, PyObject* args) {
-  return PyInt_FromLong(version());
+  return PyLong_FromUnsignedLongLong(version());
 }
 
 PyObject* THCPModule_nccl_version_suffix(PyObject* self, PyObject* args) {
@@ -99,10 +99,10 @@ static std::vector<ncclComm_t> unpack_comms(PyObject* obj, size_t size) {
 
 PyObject* THCPModule_nccl_init_rank(PyObject* self, PyObject* args) {
   HANDLE_TH_ERRORS
-  int nranks;
-  const char* id;
-  Py_ssize_t id_len;
-  int rank;
+  int nranks = 0;
+  const char* id = nullptr;
+  Py_ssize_t id_len = 0;
+  int rank = 0;
 
   if (!PyArg_ParseTuple(
           args, "is#i:nccl_init_rank", &nranks, &id, &id_len, &rank)) {
@@ -118,7 +118,7 @@ PyObject* THCPModule_nccl_init_rank(PyObject* self, PyObject* args) {
 
   ncclUniqueId commId;
   memcpy(&commId, id, NCCL_UNIQUE_ID_BYTES);
-  ncclComm_t comm;
+  ncclComm_t comm = nullptr;
   {
     pybind11::gil_scoped_release no_gil;
     comm = comm_init_rank(nranks, commId, rank);
@@ -129,8 +129,9 @@ PyObject* THCPModule_nccl_init_rank(PyObject* self, PyObject* args) {
 
 PyObject* THCPModule_nccl_reduce(PyObject* self, PyObject* args) {
   HANDLE_TH_ERRORS
-  PyObject *_inputs, *_output, *_streams, *_comms;
-  int root, op;
+  PyObject *_inputs = nullptr, *_output = nullptr, *_streams = nullptr,
+           *_comms = nullptr;
+  int root = 0, op = 0;
 
   if (!PyArg_ParseTuple(
           args, "OOiiOO", &_inputs, &_output, &root, &op, &_streams, &_comms)) {
@@ -161,8 +162,9 @@ PyObject* THCPModule_nccl_reduce(PyObject* self, PyObject* args) {
 
 PyObject* THCPModule_nccl_all_reduce(PyObject* self, PyObject* args) {
   HANDLE_TH_ERRORS
-  PyObject *_inputs, *_outputs, *_streams, *_comms;
-  int op;
+  PyObject *_inputs = nullptr, *_outputs = nullptr, *_streams = nullptr,
+           *_comms = nullptr;
+  int op = 0;
 
   if (!PyArg_ParseTuple(
           args, "OOiOO", &_inputs, &_outputs, &op, &_streams, &_comms)) {
@@ -193,8 +195,8 @@ PyObject* THCPModule_nccl_all_reduce(PyObject* self, PyObject* args) {
 
 PyObject* THCPModule_nccl_broadcast(PyObject* self, PyObject* args) {
   HANDLE_TH_ERRORS
-  PyObject *_inputs, *_streams, *_comms;
-  int root;
+  PyObject *_inputs = nullptr, *_streams = nullptr, *_comms = nullptr;
+  int root = 0;
 
   if (!PyArg_ParseTuple(args, "OiOO", &_inputs, &root, &_streams, &_comms)) {
     THPUtils_invalidArguments(
@@ -224,7 +226,8 @@ PyObject* THCPModule_nccl_broadcast(PyObject* self, PyObject* args) {
 
 PyObject* THCPModule_nccl_all_gather(PyObject* self, PyObject* args) {
   HANDLE_TH_ERRORS
-  PyObject *_inputs, *_outputs, *_streams, *_comms;
+  PyObject *_inputs = nullptr, *_outputs = nullptr, *_streams = nullptr,
+           *_comms = nullptr;
 
   if (!PyArg_ParseTuple(
           args, "OOOO", &_inputs, &_outputs, &_streams, &_comms)) {
@@ -255,8 +258,9 @@ PyObject* THCPModule_nccl_all_gather(PyObject* self, PyObject* args) {
 
 PyObject* THCPModule_nccl_reduce_scatter(PyObject* self, PyObject* args) {
   HANDLE_TH_ERRORS
-  PyObject *_inputs, *_outputs, *_streams, *_comms;
-  int op;
+  PyObject *_inputs = nullptr, *_outputs = nullptr, *_streams = nullptr,
+           *_comms = nullptr;
+  int op = 0;
 
   if (!PyArg_ParseTuple(
           args, "OOiOO", &_inputs, &_outputs, &op, &_streams, &_comms)) {
diff --git a/torch/csrc/utils/pybind.h b/torch/csrc/utils/pybind.h
index e9d95bb4e97..36cb83659aa 100644
--- a/torch/csrc/utils/pybind.h
+++ b/torch/csrc/utils/pybind.h
@@ -15,9 +15,6 @@
 #include <torch/csrc/Stream.h>
 #include <torch/csrc/utils/tensor_memoryformats.h>
 
-#include <stdexcept>
-#include <utility>
-
 namespace py = pybind11;
 
 // This makes intrusive_ptr to be available as a custom pybind11 holder type,