Mirror of https://github.com/zebrajr/pytorch.git, synced 2025-12-07 12:21:27 +01:00
fix typo in comments under torch/csrc/distributed (#96062)
This PR fixes typos in comments and messages of `.cpp` and `.hpp` files under the `torch/csrc/distributed` directory.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/96062
Approved by: https://github.com/ngimel
This commit is contained in:
parent fe4fec37a4
commit 2973994259
@@ -31,7 +31,7 @@ static constexpr const char* kNumAutogradContexts = "num_autograd_contexts";
 // This hook does 3 things:
 // 1. Call pre hooks of the original AccumulateGrad to modify the input grad.
-// 2. Accumuate the guard to RPC context.
+// 2. Accumulate the guard to RPC context.
 // 3. Call post hooks of the original AccumulateGrad.
 class DistAccumulateGradCaptureHook
     : public GraphTask::ExecInfo::Capture::GradCaptureHook {
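The hunk above documents a wrap-around hook: pre hooks may rewrite the incoming gradient, the result is accumulated into the distributed autograd context, and the original post hooks run last. A minimal Python sketch of that ordering, with illustrative names (`pre_hooks`, `post_hooks`, and `context.accumulate_grad` are not the actual C++ API):

def capture_grad(grad, pre_hooks, post_hooks, context):
    # 1. Pre hooks of the original AccumulateGrad may modify the input grad.
    for hook in pre_hooks:
        grad = hook(grad)
    # 2. Accumulate the (possibly modified) grad into the RPC autograd context.
    context.accumulate_grad(grad)
    # 3. Post hooks of the original AccumulateGrad run last.
    for hook in post_hooks:
        hook(grad)
    return grad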
@@ -96,7 +96,7 @@ class TORCH_API DistEngine {
 // traverse the GraphTask instead of using the GraphTask embedded
 // cpu_ready_queue, this is because dist engine might run the same GraphTask
 // from different SendFunctions concurrently in different threads. The method
-// will only mark the GraphTask as completed when it needes to, which means it
+// will only mark the GraphTask as completed when it needs to, which means it
 // might not mark as completed for every call as dist engine would like to
 // keep the GraphTask alive when it not receives all gradients.
 //
@@ -18,7 +18,7 @@ class TORCH_API RpcWithProfilingResp : public rpc::RpcCommandBase {
     std::vector<torch::autograd::profiler::LegacyEvent> profiledEvents,
     rpc::ProfilingId profilingId);

-// For receving RPCs. Used in from message when converting a message received
+// For receiving RPCs. Used in from message when converting a message received
 // over the wire.
 RpcWithProfilingResp(
     rpc::MessageType messageType,
@@ -113,7 +113,7 @@ class TORCH_API Backend : public torch::CustomClassHolder {
 }

 // Gathers a single tensor inputBuffer into a single buffer outputBuffer that
-// is interpreted as a contigious collection of size inputBuffer * WORLD_SIZE.
+// is interpreted as a contiguous collection of size inputBuffer * WORLD_SIZE.
 // For implementers of ProcessGroup API and advanced users only.
 // Note: this function will be deprecated in near future.
 virtual c10::intrusive_ptr<Work> _allgather_base(
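The `_allgather_base` contract above (an output buffer interpreted as WORLD_SIZE contiguous input-sized chunks) is reachable from Python through `torch.distributed.all_gather_into_tensor`, which dispatches to this backend method in recent releases. A hedged usage sketch, assuming an already-initialized process group whose backend implements it (NCCL does):

import torch
import torch.distributed as dist

# Assumes dist.init_process_group("nccl", ...) has already run.
world_size = dist.get_world_size()
input_buffer = torch.ones(4, device="cuda")
# Output is interpreted as a contiguous collection of world_size chunks,
# each the size of input_buffer.
output_buffer = torch.empty(4 * world_size, device="cuda")
dist.all_gather_into_tensor(output_buffer, input_buffer)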
@@ -226,7 +226,7 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder {
 }

 // Gathers a single tensor inputBuffer into a single buffer outputBuffer that
-// is interpreted as a contigious collection of size inputBuffer * WORLD_SIZE.
+// is interpreted as a contiguous collection of size inputBuffer * WORLD_SIZE.
 // For implementers of ProcessGroup API and advanced users only.
 // Note: this function will be deprecated in near future.
 virtual c10::intrusive_ptr<Work> _allgather_base(
@@ -71,8 +71,8 @@ struct WorkEntry {
 // MPI_THREAD_SERIALIZED, ProcessGroupMPI will only support a singe process
 // group. In other words, no more than 1 process group can be created globally.
 //
-// If you would like to use multiple ProcessGroupMPI, it requres your MPI
-// implemenation to have a thread support value of MPI_THREAD_MULTIPLE, that is,
+// If you would like to use multiple ProcessGroupMPI, it requires your MPI
+// implementation to have a thread support value of MPI_THREAD_MULTIPLE, that is,
 // multiple threads may call MPI, with no restriction.
 //
 // Also note that ProcessGroupMPI only supports a single Tensor operation. In
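The thread-support requirement in this hunk can be checked at runtime. A sketch using mpi4py (an assumption for illustration; the C++ code queries MPI directly):

from mpi4py import MPI  # assumes mpi4py is installed

# ProcessGroupMPI needs MPI_THREAD_SERIALIZED for one global group and
# MPI_THREAD_MULTIPLE before more than one group may be created.
provided = MPI.Query_thread()
if provided < MPI.THREAD_MULTIPLE:
    print("MPI thread support too low for multiple ProcessGroupMPI groups")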
@@ -229,7 +229,7 @@ class TORCH_API ProcessGroupMPI : public Backend {
 c10::intrusive_ptr<Work> barrier(
     const BarrierOptions& opts = BarrierOptions()) override;

-// Creating a new ProcessGroupMPI, will initiialize MPI if not initialized
+// Creating a new ProcessGroupMPI, will initialize MPI if not initialized
 static c10::intrusive_ptr<ProcessGroupMPI> createProcessGroupMPI(
     std::vector<int> ranks = {});
@@ -499,7 +499,7 @@ void ProcessGroupNCCL::WorkNCCL::synchronizeInternal(
 // So explicitly abort ncclComms here before throwing this timed out
 // exception to users, after this, ncclCommWatchdog can detect nccl
 // communicators are aborted and clean up devNCCLCommMap_ accordingly.
-// if throwing timed out excepiton without aborting nccl communicators
+// if throwing timed out exception without aborting nccl communicators
 // here, it was observed that CUDA GPU will have 100% utilization and
 // can not run new events successfully.
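The timeout discussed in this hunk is the per-collective deadline configured when the process group is created; once it fires, the communicators are aborted before the exception propagates, for the reason the comment gives. A hedged sketch of setting it from Python (assumes rank and world size come from the usual environment variables):

import datetime
import torch.distributed as dist

# Collectives that exceed this deadline make synchronizeInternal throw the
# timed out exception described above, after aborting the NCCL communicators.
dist.init_process_group(backend="nccl", timeout=datetime.timedelta(minutes=5))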
@@ -136,7 +136,7 @@ class TORCH_API ProcessGroupNCCL : public Backend {
 void synchronizeStreams();

 // Helper function used in CUDA Stream callbacks to complete WorkNCCL
-// objects and throw exceptions when neeeded.
+// objects and throw exceptions when needed.
 void handleNCCLGuard(ErrorHandlingMode asyncErrorHandling);

 // Helper function that checks if the NCCL kernels have finished
@@ -497,7 +497,7 @@ class TORCH_API ProcessGroupNCCL : public Backend {

 // Helper that encapsulates work shared across point-to-point communication
 // primitives. It is the same structure as the helper used for collective
-// communicaiton primitives.
+// communication primitives.
 template <typename Fn>
 c10::intrusive_ptr<Work> pointToPoint(
     std::vector<at::Tensor>& tensor,
@@ -1494,7 +1494,7 @@ Arguments:
     processGroup,
     "Options",
     R"(
-Base class for all processs group options implementations, such as the nccl
+Base class for all processes group options implementations, such as the nccl
 options :class:`~torch.distributed.ProcessGroupNCCL.Options`).
 )")
     .def(
@@ -2096,7 +2096,7 @@ Example::
 ``fut.then()`` will return another ``CUDAFuture`` that holds the return value of the
 callback and a ``CUDAEvent`` that recorded the callback stream.

-1. For CPU work, ``fut.done()`` returns true when work has been complted and value()
+1. For CPU work, ``fut.done()`` returns true when work has been completed and value()
    tensors are ready.
 2. For GPU work, ``fut.done()`` returns true only whether the operation has been enqueued.
 3. For mixed CPU-GPU work (e.g. sending GPU tensors with GLOO), ``fut.done()`` returns
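The CPU/GPU distinction in the docstring above matters when chaining callbacks. A hedged sketch (assumes an initialized NCCL group and a CUDA tensor; `get_future` exists on the returned work handle):

import torch
import torch.distributed as dist

t = torch.ones(8, device="cuda")
work = dist.all_reduce(t, async_op=True)
fut = work.get_future()
# For GPU work, done() only means the op was enqueued on the stream, not that
# the result is ready; then() chains a callback that runs once it completes.
chained = fut.then(lambda f: f.value()[0] * 2)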
@@ -69,7 +69,7 @@ class TORCH_API Logger {
 );
 // Set stats that can be collected only during
 // training loop. It is called at the beginning of forward call
-// to record the run time stats of sampled iterations that previouly ran.
+// to record the run time stats of sampled iterations that previously ran.
 // GPU performance stats are collected only for single process
 // single device program and single device module right now.
 // TODO to support single process multiple devices and multi device modules,
@@ -1178,7 +1178,7 @@ void Reducer::initialize_bucket_views(Reducer::Bucket& bucket) {
 if (grad.defined() && !grad.is_alias_of(bucket_view)) {
   bucket_view.copy_(grad);
   grad = bucket_view;
-  // The grad is modefied and needs to be written back.
+  // The grad is modified and needs to be written back.
   return true;
 }
 // The grad is not modified and does not need to be written back.
@@ -73,8 +73,8 @@ class TORCH_API Reducer {
 // a call to this function can simply be omitted.
 void prepare_for_backward(const std::vector<at::Tensor>& outputs);

-// Called at the begginning of forward() inside DistributedDataParallel,
-// right now it caputures the starting time of forward in each iteration.
+// Called at the beginning of forward() inside DistributedDataParallel,
+// right now it captures the starting time of forward in each iteration.
 void prepare_for_forward();

 // Returns the relative time in nanoseconds when gradients were ready,
@@ -153,7 +153,7 @@ class TORCH_API Reducer {

 // An function for users to set sample_rate of collecting
 // runtime stats. The time stats will be recorded for the
-// first 10 iterations, after 10 iteratons time stats will be
+// first 10 iterations, after 10 iterations time stats will be
 // recorded once every "sample_rate" training iterations.
 void set_ddp_runtime_logging_sample_rate(int sample_rate);
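The sampling policy in this hunk (record every one of the first 10 iterations, then one in every `sample_rate`) keeps timing overhead low in long runs. A sketch of the described policy, not the actual Reducer code:

def should_record_stats(iteration: int, sample_rate: int) -> bool:
    # Record each of the first 10 iterations, then once every
    # sample_rate training iterations thereafter.
    if iteration < 10:
        return True
    return iteration % sample_rate == 0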
@@ -504,7 +504,7 @@ class TORCH_API Reducer {
 // Retrieves parameter names that have not been marked as ready as part of
 // previous iteration.
 std::vector<std::string> getUnmarkedParamsForIteration();
-// Retrives parameter indices that have not been marked as ready as part of
+// Retrieves parameter indices that have not been marked as ready as part of
 // previous iteration.
 std::vector<size_t> getUnmarkedParamIndicesForIteration();
 // Raises appropriate error if mark_variable_ready is called on the same
@@ -98,7 +98,7 @@ enum MessageType {
 // to determine how to serialize them. This design is helpful for
 // communicating super large tensors where serializing all the data at
 // once leads to excessively large memory footprint. An implementation
-// can then serialize and send tensors chunck-by-chunk, in the streaming
+// can then serialize and send tensors chunk-by-chunk, in the streaming
 // fashion.
 // type (MessageType): type of the message.
 // id (int64_t): message id, this is used to match request and response.
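The streaming design described above avoids ever materializing one huge serialized buffer. A generic Python sketch of chunk-by-chunk serialization (illustrative only; the real implementation is backend-specific):

def serialize_in_chunks(payload: bytes, chunk_size: int):
    # Yield the payload chunk-by-chunk so a very large tensor never has to
    # occupy one contiguous serialized buffer in memory.
    for offset in range(0, len(payload), chunk_size):
        yield payload[offset:offset + chunk_size]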
@@ -76,7 +76,7 @@ TORCH_API extern mutexType currentStateStackEntryMutex;

 // This class is used to implement a stack of ``State``s.
 // It has 2 members.
-// One is `prevPtr`, a shared_ptr poiniting to previous elememnt in the
+// One is `prevPtr`, a shared_ptr pointing to previous element in the
 // stack.
 // The other is ``statePtr``, a shared_ptr pointing to ``State``.
 class StateStackEntry {
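The `prevPtr`/`statePtr` layout above is a classic linked stack. A Python analogue of the structure, purely for illustration:

class StackEntry:
    # prev plays the role of prevPtr, state the role of statePtr; pushing
    # creates a new entry that points back at the previous top.
    def __init__(self, state, prev=None):
        self.state = state
        self.prev = prev

top = StackEntry("outer")
top = StackEntry("inner", prev=top)  # push
top = top.prev                       # pop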
@@ -14,7 +14,7 @@ c10::intrusive_ptr<JitFuture> RequestCallback::operator()(
     std::vector<c10::Stream> streams) const {
   // NB: cannot clear autograd context id here because the processMessage method
   // might pause waiting for all RRefs in the arguments to be confirmed by their
-  // owners and resumne processing in a different thread. Hence, the
+  // owners and resume processing in a different thread. Hence, the
   // thread_local context id needs to be set and cleared in the thread that
   // indeed carries out the processing logic.
   return processMessage(request, std::move(streams));
@@ -125,7 +125,7 @@ c10::intrusive_ptr<JitFuture> RequestCallbackImpl::runPythonFunction(
     return asFuture(std::current_exception());
   }

-  // After sync exection or failed async execution return the value as-is.
+  // After sync execution or failed async execution return the value as-is.
   if (pythonRpcHandler.isRemoteException(result) || !isAsyncExecution) {
     return asFuture(
         c10::ivalue::ConcretePyObjectHolder::create(result),
@@ -78,7 +78,7 @@ c10::intrusive_ptr<JitFuture> RequestCallbackNoPython::processMessage(
   // of 10us.
   auto serverProcessGlobalProfilerStateStackEntryPtr =
       profiler::processglobal::StateStackEntry::current();
-  // If server global profiler is enabled, we futher pay the
+  // If server global profiler is enabled, we further pay the
   // cost of thread local profiler state initialization.
   if (serverProcessGlobalProfilerStateStackEntryPtr) {
     // Initialize thread-local profiler state from process-global
@@ -178,7 +178,7 @@ void RpcAgent::retryExpiredRpcs() {
   }

   // If there are no more RPC's set to be retried at the current timepoint,
-  // we can remove the corresponsing unordered_set from the retry map.
+  // we can remove the corresponding unordered_set from the retry map.
   if (earliestRpcList.empty()) {
     rpcRetryMap_.erase(earliestTimeout);
   }
@@ -32,7 +32,7 @@ using steady_clock_time_point =
     std::chrono::time_point<std::chrono::steady_clock>;
 // Input is qualified name string, output is JIT StrongTypePtr
 // Same as jit::TypeResolver, did not import jit::TypeResolver to here
-// because it could instroduce cyclic dependencies.
+// because it could introduce cyclic dependencies.
 using TypeResolver =
     std::function<c10::StrongTypePtr(const c10::QualifiedName&)>;
@@ -153,7 +153,7 @@ class TORCH_API RpcAgent {
     const DeviceMap& deviceMap = {}) = 0;

 // Retries sending the message up to maxRetries times until an ACK is
-// receieved. The duration between consecutive sends is increased over
+// received. The duration between consecutive sends is increased over
 // time using an exponential backoff algorithm.
 //
 // Sends ``message`` to the ``RpcAgent`` of id ``to`` and returns a
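The retry scheme above grows the wait between resends exponentially until an ACK arrives. A sketch of such a backoff schedule (parameter names are illustrative, not the RpcAgent API):

def retry_delays(base_ms: float, max_retries: int, backoff: float = 2.0):
    # The duration between consecutive sends increases exponentially until
    # an ACK is received or max_retries is exhausted.
    delay = base_ms
    for _ in range(max_retries):
        yield delay
        delay *= backoff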
@@ -232,7 +232,7 @@ class TORCH_API RpcAgent {
 // Retrieve metrics as KV map
 virtual std::unordered_map<std::string, std::string> getMetrics() = 0;

-// Retrive debug info in addition to metrics as KV map
+// Retrieve debug info in addition to metrics as KV map
 virtual std::unordered_map<std::string, std::string> getDebugInfo();

 // Flag to control whether GIL wait times
@@ -180,7 +180,7 @@ class TORCH_API RRefContext {
 // been confirmed (i.e. is no longer in the pendingUsers_ map).
 c10::intrusive_ptr<RRef> getPendingUser(const ForkId& forkId);

-// Start recroding new pending UserRRefs. All pending UserRRefs introduced
+// Start recording new pending UserRRefs. All pending UserRRefs introduced
 // after this point will be put into the thread_local userTable_, which will
 // then be consumed and cleared in waitForThreadLocalPendingRRefs().
 void recordThreadLocalPendingRRefs();
@@ -264,7 +264,7 @@ class TORCH_API RRefContext {
     RRefId::Hash>
     forks_;

-// This cond var is used by deleteAllUsers(), a event notificaton is sent if
+// This cond var is used by deleteAllUsers(), a event notification is sent if
 // number of pending UserRRef or UserRRef children is reduced, or
 // number of owned OwnerRRef is reduced.
 std::condition_variable deleteAllUsersCV_;
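The condition variable described here implements a wait-until-drained handshake: deleteAllUsers() blocks until the pending counts reach zero, and every decrement sends a notification. A Python analogue of the pattern with threading.Condition (names are illustrative):

import threading

cv = threading.Condition()
pending_users = 3

def on_user_deleted():
    global pending_users
    with cv:
        pending_users -= 1
        cv.notify_all()  # notification sent when a pending count is reduced

def delete_all_users():
    with cv:
        cv.wait_for(lambda: pending_users == 0)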
@@ -111,7 +111,7 @@ class TORCH_API PythonRRefFetchRet final : public RRefFetchRet {
     const Message& message);
 };

-// UserRRef (regardless it's the creator or not) uses this message to notiify
+// UserRRef (regardless it's the creator or not) uses this message to notify
 // OwnerRRef on delete.
 class TORCH_API RRefUserDelete final : public ForkMessageBase {
  public:
@@ -15,7 +15,7 @@ using torch::jit::Operator;
 // A ScriptRemoteCall instance represents an invocation of `dist.remote` on a
 // builtin operator. Currently, it does not support using RRef as arguments yet.
 // Besides the operator and a vector of arguments, ScriptRemoteCall also
-// caontains the RRefId and the ForkId of the return value RRef.
+// contains the RRefId and the ForkId of the return value RRef.
 class TORCH_API ScriptRemoteCall final : public ScriptCall {
  public:
  // Constructor for builitin operator call.