fix typo in comments under torch/csrc/distributed (#96062)

This PR fixes typos in comments and messages of `.cpp` and `.hpp` files under the `torch/csrc/distributed` directory.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/96062
Approved by: https://github.com/ngimel
Kazuaki Ishizaki 2023-03-07 02:56:41 +00:00 committed by PyTorch MergeBot
parent fe4fec37a4
commit 2973994259
22 changed files with 32 additions and 32 deletions

View File

@@ -31,7 +31,7 @@ static constexpr const char* kNumAutogradContexts = "num_autograd_contexts";
// This hook does 3 things:
// 1. Call pre hooks of the original AccumulateGrad to modify the input grad.
-// 2. Accumuate the guard to RPC context.
+// 2. Accumulate the guard to RPC context.
// 3. Call post hooks of the original AccumulateGrad.
class DistAccumulateGradCaptureHook
: public GraphTask::ExecInfo::Capture::GradCaptureHook {

View File

@@ -96,7 +96,7 @@ class TORCH_API DistEngine {
// traverse the GraphTask instead of using the GraphTask embedded
// cpu_ready_queue, this is because dist engine might run the same GraphTask
// from different SendFunctions concurrently in different threads. The method
-// will only mark the GraphTask as completed when it needes to, which means it
+// will only mark the GraphTask as completed when it needs to, which means it
// might not mark as completed for every call as dist engine would like to
// keep the GraphTask alive when it not receives all gradients.
//

View File

@@ -18,7 +18,7 @@ class TORCH_API RpcWithProfilingResp : public rpc::RpcCommandBase {
std::vector<torch::autograd::profiler::LegacyEvent> profiledEvents,
rpc::ProfilingId profilingId);
-// For receving RPCs. Used in from message when converting a message received
+// For receiving RPCs. Used in from message when converting a message received
// over the wire.
RpcWithProfilingResp(
rpc::MessageType messageType,

View File

@@ -113,7 +113,7 @@ class TORCH_API Backend : public torch::CustomClassHolder {
}
// Gathers a single tensor inputBuffer into a single buffer outputBuffer that
-// is interpreted as a contigious collection of size inputBuffer * WORLD_SIZE.
+// is interpreted as a contiguous collection of size inputBuffer * WORLD_SIZE.
// For implementers of ProcessGroup API and advanced users only.
// Note: this function will be deprecated in near future.
virtual c10::intrusive_ptr<Work> _allgather_base(

View File

@@ -226,7 +226,7 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder {
}
// Gathers a single tensor inputBuffer into a single buffer outputBuffer that
-// is interpreted as a contigious collection of size inputBuffer * WORLD_SIZE.
+// is interpreted as a contiguous collection of size inputBuffer * WORLD_SIZE.
// For implementers of ProcessGroup API and advanced users only.
// Note: this function will be deprecated in near future.
virtual c10::intrusive_ptr<Work> _allgather_base(
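For readers unfamiliar with the contiguous-output layout this comment describes, here is a minimal sketch using the public Python wrapper `torch.distributed.all_gather_into_tensor`, which to my understanding ultimately dispatches to the backend's `_allgather_base`. It assumes a process group is already initialized on every rank.

```python
import torch
import torch.distributed as dist

# Assumes dist.init_process_group(...) has already run on every rank.
world_size = dist.get_world_size()
inp = torch.full((4,), float(dist.get_rank()))   # each rank contributes 4 elements
out = torch.empty(world_size * 4)                # one contiguous buffer: inputBuffer * WORLD_SIZE
dist.all_gather_into_tensor(out, inp)            # out = [rank0's 4 values, rank1's 4 values, ...]
```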

View File

@@ -71,8 +71,8 @@ struct WorkEntry {
// MPI_THREAD_SERIALIZED, ProcessGroupMPI will only support a singe process
// group. In other words, no more than 1 process group can be created globally.
//
-// If you would like to use multiple ProcessGroupMPI, it requres your MPI
-// implemenation to have a thread support value of MPI_THREAD_MULTIPLE, that is,
+// If you would like to use multiple ProcessGroupMPI, it requires your MPI
+// implementation to have a thread support value of MPI_THREAD_MULTIPLE, that is,
// multiple threads may call MPI, with no restriction.
//
// Also note that ProcessGroupMPI only supports a single Tensor operation. In
@@ -229,7 +229,7 @@ class TORCH_API ProcessGroupMPI : public Backend {
c10::intrusive_ptr<Work> barrier(
const BarrierOptions& opts = BarrierOptions()) override;
-// Creating a new ProcessGroupMPI, will initiialize MPI if not initialized
+// Creating a new ProcessGroupMPI, will initialize MPI if not initialized
static c10::intrusive_ptr<ProcessGroupMPI> createProcessGroupMPI(
std::vector<int> ranks = {});
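As a usage note, the MPI backend described above is driven entirely by the MPI launcher. A hedged sketch, assuming PyTorch was built with MPI support and the script is started with `mpirun`:

```python
import torch.distributed as dist

# Rank and world size come from the MPI runtime, so they are not passed here.
# Selecting the "mpi" backend constructs the ProcessGroupMPI for us
# (assumption based on the comment above).
dist.init_process_group(backend="mpi")
print(dist.get_rank(), dist.get_world_size())
```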

View File

@@ -499,7 +499,7 @@ void ProcessGroupNCCL::WorkNCCL::synchronizeInternal(
// So explicitly abort ncclComms here before throwing this timed out
// exception to users, after this, ncclCommWatchdog can detect nccl
// communicators are aborted and clean up devNCCLCommMap_ accordingly.
-// if throwing timed out excepiton without aborting nccl communicators
+// if throwing timed out exception without aborting nccl communicators
// here, it was observed that CUDA GPU will have 100% utilization and
// can not run new events successfully.
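The abort-on-timeout behavior described above is driven by the collective timeout and the async error handling mode, both chosen at initialization. A minimal, hedged sketch; the environment variable name is the one commonly used around this release and is an assumption here:

```python
import os
from datetime import timedelta
import torch.distributed as dist

# Assumption: this env var enables the watchdog to surface/abort failed collectives.
os.environ.setdefault("NCCL_ASYNC_ERROR_HANDLING", "1")
dist.init_process_group(
    backend="nccl",
    timeout=timedelta(minutes=5),  # collectives exceeding this may time out and abort as described above
)
```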

View File

@@ -136,7 +136,7 @@ class TORCH_API ProcessGroupNCCL : public Backend {
void synchronizeStreams();
// Helper function used in CUDA Stream callbacks to complete WorkNCCL
-// objects and throw exceptions when neeeded.
+// objects and throw exceptions when needed.
void handleNCCLGuard(ErrorHandlingMode asyncErrorHandling);
// Helper function that checks if the NCCL kernels have finished
@@ -497,7 +497,7 @@ class TORCH_API ProcessGroupNCCL : public Backend {
// Helper that encapsulates work shared across point-to-point communication
// primitives. It is the same structure as the helper used for collective
-// communicaiton primitives.
+// communication primitives.
template <typename Fn>
c10::intrusive_ptr<Work> pointToPoint(
std::vector<at::Tensor>& tensor,

View File

@@ -1494,7 +1494,7 @@ Arguments:
processGroup,
"Options",
R"(
-Base class for all processs group options implementations, such as the nccl
+Base class for all processes group options implementations, such as the nccl
options :class:`~torch.distributed.ProcessGroupNCCL.Options`).
)")
.def(
@@ -2096,7 +2096,7 @@ Example::
``fut.then()`` will return another ``CUDAFuture`` that holds the return value of the
callback and a ``CUDAEvent`` that recorded the callback stream.
-1. For CPU work, ``fut.done()`` returns true when work has been complted and value()
+1. For CPU work, ``fut.done()`` returns true when work has been completed and value()
tensors are ready.
2. For GPU work, ``fut.done()`` returns true only whether the operation has been enqueued.
3. For mixed CPU-GPU work (e.g. sending GPU tensors with GLOO), ``fut.done()`` returns
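A small sketch of the ``fut.done()`` semantics listed in the docstring above for plain CPU (gloo) work, assuming the process group is already initialized:

```python
import torch
import torch.distributed as dist

t = torch.ones(4)
work = dist.all_reduce(t, async_op=True)  # returns a Work handle
fut = work.get_future()
fut.wait()
assert fut.done()      # CPU work: true once the result tensors are ready
print(t)               # every element equals world_size after the all-reduce
```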

View File

@@ -69,7 +69,7 @@ class TORCH_API Logger {
);
// Set stats that can be collected only during
// training loop. It is called at the beginning of forward call
-// to record the run time stats of sampled iterations that previouly ran.
+// to record the run time stats of sampled iterations that previously ran.
// GPU performance stats are collected only for single process
// single device program and single device module right now.
// TODO to support single process multiple devices and multi device modules,

View File

@@ -1178,7 +1178,7 @@ void Reducer::initialize_bucket_views(Reducer::Bucket& bucket) {
if (grad.defined() && !grad.is_alias_of(bucket_view)) {
bucket_view.copy_(grad);
grad = bucket_view;
-// The grad is modefied and needs to be written back.
+// The grad is modified and needs to be written back.
return true;
}
// The grad is not modified and does not need to be written back.
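The copy/write-back path shown above is the default behavior. For context, DDP also exposes a user-facing knob that makes gradients views into the reducer's buckets so the copy should be avoided; a hedged sketch, assuming an initialized process group and a CUDA device:

```python
import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel as DDP

model = nn.Linear(8, 8).cuda()
# With gradient_as_bucket_view=True, param.grad aliases the bucket storage,
# so the bucket_view.copy_(grad) branch above should not be taken.
ddp_model = DDP(model, gradient_as_bucket_view=True)
```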

View File

@@ -73,8 +73,8 @@ class TORCH_API Reducer {
// a call to this function can simply be omitted.
void prepare_for_backward(const std::vector<at::Tensor>& outputs);
-// Called at the begginning of forward() inside DistributedDataParallel,
-// right now it caputures the starting time of forward in each iteration.
+// Called at the beginning of forward() inside DistributedDataParallel,
+// right now it captures the starting time of forward in each iteration.
void prepare_for_forward();
// Returns the relative time in nanoseconds when gradients were ready,
@@ -153,7 +153,7 @@ class TORCH_API Reducer {
// An function for users to set sample_rate of collecting
// runtime stats. The time stats will be recorded for the
-// first 10 iterations, after 10 iteratons time stats will be
+// first 10 iterations, after 10 iterations time stats will be
// recorded once every "sample_rate" training iterations.
void set_ddp_runtime_logging_sample_rate(int sample_rate);
@@ -504,7 +504,7 @@ class TORCH_API Reducer {
// Retrieves parameter names that have not been marked as ready as part of
// previous iteration.
std::vector<std::string> getUnmarkedParamsForIteration();
-// Retrives parameter indices that have not been marked as ready as part of
+// Retrieves parameter indices that have not been marked as ready as part of
// previous iteration.
std::vector<size_t> getUnmarkedParamIndicesForIteration();
// Raises appropriate error if mark_variable_ready is called on the same

View File

@@ -98,7 +98,7 @@ enum MessageType {
// to determine how to serialize them. This design is helpful for
// communicating super large tensors where serializing all the data at
// once leads to excessively large memory footprint. An implementation
-// can then serialize and send tensors chunck-by-chunk, in the streaming
+// can then serialize and send tensors chunk-by-chunk, in the streaming
// fashion.
// type (MessageType): type of the message.
// id (int64_t): message id, this is used to match request and response.

View File

@@ -76,7 +76,7 @@ TORCH_API extern mutexType currentStateStackEntryMutex;
// This class is used to implement a stack of ``State``s.
// It has 2 members.
-// One is `prevPtr`, a shared_ptr poiniting to previous elememnt in the
+// One is `prevPtr`, a shared_ptr pointing to previous element in the
// stack.
// The other is ``statePtr``, a shared_ptr pointing to ``State``.
class StateStackEntry {
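A purely hypothetical Python analogue of the linked stack described above, only to illustrate the `prevPtr`/`statePtr` shape; none of these names exist in the Python API:

```python
class StackEntry:
    def __init__(self, state, prev=None):
        self.state = state   # statePtr: the State held by this entry
        self.prev = prev     # prevPtr: the previous entry, or None at the bottom

def push(top, state):
    return StackEntry(state, prev=top)

def pop(top):
    return top.prev
```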

View File

@@ -14,7 +14,7 @@ c10::intrusive_ptr<JitFuture> RequestCallback::operator()(
std::vector<c10::Stream> streams) const {
// NB: cannot clear autograd context id here because the processMessage method
// might pause waiting for all RRefs in the arguments to be confirmed by their
-// owners and resumne processing in a different thread. Hence, the
+// owners and resume processing in a different thread. Hence, the
// thread_local context id needs to be set and cleared in the thread that
// indeed carries out the processing logic.
return processMessage(request, std::move(streams));

View File

@@ -125,7 +125,7 @@ c10::intrusive_ptr<JitFuture> RequestCallbackImpl::runPythonFunction(
return asFuture(std::current_exception());
}
-// After sync exection or failed async execution return the value as-is.
+// After sync execution or failed async execution return the value as-is.
if (pythonRpcHandler.isRemoteException(result) || !isAsyncExecution) {
return asFuture(
c10::ivalue::ConcretePyObjectHolder::create(result),
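The sync/async distinction mentioned above corresponds to the user-facing ``@rpc.functions.async_execution`` decorator. A hedged sketch of an async-execution target, assuming an RPC agent is initialized:

```python
import torch
import torch.distributed.rpc as rpc

@rpc.functions.async_execution
def async_add(x, y):
    fut = torch.futures.Future()
    fut.set_result(x + y)   # in real use a callback would complete this later
    return fut              # the caller receives the future's value, not the Future itself
```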

View File

@@ -78,7 +78,7 @@ c10::intrusive_ptr<JitFuture> RequestCallbackNoPython::processMessage(
// of 10us.
auto serverProcessGlobalProfilerStateStackEntryPtr =
profiler::processglobal::StateStackEntry::current();
-// If server global profiler is enabled, we futher pay the
+// If server global profiler is enabled, we further pay the
// cost of thread local profiler state initialization.
if (serverProcessGlobalProfilerStateStackEntryPtr) {
// Initialize thread-local profiler state from process-global

View File

@@ -178,7 +178,7 @@ void RpcAgent::retryExpiredRpcs() {
}
// If there are no more RPC's set to be retried at the current timepoint,
-// we can remove the corresponsing unordered_set from the retry map.
+// we can remove the corresponding unordered_set from the retry map.
if (earliestRpcList.empty()) {
rpcRetryMap_.erase(earliestTimeout);
}

View File

@@ -32,7 +32,7 @@ using steady_clock_time_point =
std::chrono::time_point<std::chrono::steady_clock>;
// Input is qualified name string, output is JIT StrongTypePtr
// Same as jit::TypeResolver, did not import jit::TypeResolver to here
-// because it could instroduce cyclic dependencies.
+// because it could introduce cyclic dependencies.
using TypeResolver =
std::function<c10::StrongTypePtr(const c10::QualifiedName&)>;
@@ -153,7 +153,7 @@ class TORCH_API RpcAgent {
const DeviceMap& deviceMap = {}) = 0;
// Retries sending the message up to maxRetries times until an ACK is
-// receieved. The duration between consecutive sends is increased over
+// received. The duration between consecutive sends is increased over
// time using an exponential backoff algorithm.
//
// Sends ``message`` to the ``RpcAgent`` of id ``to`` and returns a
@@ -232,7 +232,7 @@ class TORCH_API RpcAgent {
// Retrieve metrics as KV map
virtual std::unordered_map<std::string, std::string> getMetrics() = 0;
-// Retrive debug info in addition to metrics as KV map
+// Retrieve debug info in addition to metrics as KV map
virtual std::unordered_map<std::string, std::string> getDebugInfo();
// Flag to control whether GIL wait times
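The retry comment in the hunk above describes resending until an ACK arrives, with exponentially growing gaps between attempts. A hypothetical stand-alone illustration of that schedule (not the actual RpcAgent code):

```python
import time

def send_with_retries(send_fn, max_retries=5, base_delay=0.01, backoff=2.0):
    delay = base_delay
    for _ in range(max_retries + 1):
        if send_fn():          # returns True once an ACK is received
            return True
        time.sleep(delay)      # the wait grows exponentially between attempts
        delay *= backoff
    return False
```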

View File

@@ -180,7 +180,7 @@ class TORCH_API RRefContext {
// been confirmed (i.e. is no longer in the pendingUsers_ map).
c10::intrusive_ptr<RRef> getPendingUser(const ForkId& forkId);
-// Start recroding new pending UserRRefs. All pending UserRRefs introduced
+// Start recording new pending UserRRefs. All pending UserRRefs introduced
// after this point will be put into the thread_local userTable_, which will
// then be consumed and cleared in waitForThreadLocalPendingRRefs().
void recordThreadLocalPendingRRefs();
@@ -264,7 +264,7 @@ class TORCH_API RRefContext {
RRefId::Hash>
forks_;
-// This cond var is used by deleteAllUsers(), a event notificaton is sent if
+// This cond var is used by deleteAllUsers(), a event notification is sent if
// number of pending UserRRef or UserRRef children is reduced, or
// number of owned OwnerRRef is reduced.
std::condition_variable deleteAllUsersCV_;

View File

@@ -111,7 +111,7 @@ class TORCH_API PythonRRefFetchRet final : public RRefFetchRet {
const Message& message);
};
-// UserRRef (regardless it's the creator or not) uses this message to notiify
+// UserRRef (regardless it's the creator or not) uses this message to notify
// OwnerRRef on delete.
class TORCH_API RRefUserDelete final : public ForkMessageBase {
public:

View File

@@ -15,7 +15,7 @@ using torch::jit::Operator;
// A ScriptRemoteCall instance represents an invocation of `dist.remote` on a
// builtin operator. Currently, it does not support using RRef as arguments yet.
// Besides the operator and a vector of arguments, ScriptRemoteCall also
-// caontains the RRefId and the ForkId of the return value RRef.
+// contains the RRefId and the ForkId of the return value RRef.
class TORCH_API ScriptRemoteCall final : public ScriptCall {
public:
// Constructor for builitin operator call.
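For context, the ``dist.remote`` call the comment above refers to is ``torch.distributed.rpc.remote`` in the public API; a hedged sketch, assuming RPC is initialized and a peer named "worker1" exists:

```python
import torch
import torch.distributed.rpc as rpc

rref = rpc.remote("worker1", torch.add, args=(torch.ones(2), torch.ones(2)))
print(rref.to_here())   # fetch the result owned by worker1
```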