Mirror of https://github.com/zebrajr/pytorch.git, synced 2025-12-07 12:21:27 +01:00
fix typo in comments under torch/csrc/distributed (#96062)
This PR fixes typos in comments and messages of `.cpp` and `.hpp` files under the `torch/csrc/distributed` directory.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/96062
Approved by: https://github.com/ngimel
This commit is contained in:
parent fe4fec37a4
commit 2973994259
@@ -31,7 +31,7 @@ static constexpr const char* kNumAutogradContexts = "num_autograd_contexts";
 // This hook does 3 things:
 // 1. Call pre hooks of the original AccumulateGrad to modify the input grad.
-// 2. Accumuate the guard to RPC context.
+// 2. Accumulate the guard to RPC context.
 // 3. Call post hooks of the original AccumulateGrad.
 class DistAccumulateGradCaptureHook
     : public GraphTask::ExecInfo::Capture::GradCaptureHook {
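The hunk above documents a wrap-around hook: pre hooks may rewrite the incoming gradient, the result is accumulated into the distributed autograd context, and the original post hooks run last. A minimal Python sketch of that ordering, with illustrative names (`pre_hooks`, `post_hooks`, and `context.accumulate_grad` are not the actual C++ API):

def capture_grad(grad, pre_hooks, post_hooks, context):
    # 1. Pre hooks of the original AccumulateGrad may modify the input grad.
    for hook in pre_hooks:
        grad = hook(grad)
    # 2. Accumulate the (possibly modified) grad into the RPC autograd context.
    context.accumulate_grad(grad)
    # 3. Post hooks of the original AccumulateGrad run last.
    for hook in post_hooks:
        hook(grad)
    return grad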
@@ -96,7 +96,7 @@ class TORCH_API DistEngine {
 // traverse the GraphTask instead of using the GraphTask embedded
 // cpu_ready_queue, this is because dist engine might run the same GraphTask
 // from different SendFunctions concurrently in different threads. The method
-// will only mark the GraphTask as completed when it needes to, which means it
+// will only mark the GraphTask as completed when it needs to, which means it
 // might not mark as completed for every call as dist engine would like to
 // keep the GraphTask alive when it not receives all gradients.
 //
@@ -18,7 +18,7 @@ class TORCH_API RpcWithProfilingResp : public rpc::RpcCommandBase {
     std::vector<torch::autograd::profiler::LegacyEvent> profiledEvents,
     rpc::ProfilingId profilingId);

-// For receving RPCs. Used in from message when converting a message received
+// For receiving RPCs. Used in from message when converting a message received
 // over the wire.
 RpcWithProfilingResp(
     rpc::MessageType messageType,
@@ -113,7 +113,7 @@ class TORCH_API Backend : public torch::CustomClassHolder {
 }

 // Gathers a single tensor inputBuffer into a single buffer outputBuffer that
-// is interpreted as a contigious collection of size inputBuffer * WORLD_SIZE.
+// is interpreted as a contiguous collection of size inputBuffer * WORLD_SIZE.
 // For implementers of ProcessGroup API and advanced users only.
 // Note: this function will be deprecated in near future.
 virtual c10::intrusive_ptr<Work> _allgather_base(
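The `_allgather_base` contract above (an output buffer interpreted as WORLD_SIZE contiguous input-sized chunks) is reachable from Python through `torch.distributed.all_gather_into_tensor`, which dispatches to this backend method in recent releases. A hedged usage sketch, assuming an already-initialized process group whose backend implements it (NCCL does):

import torch
import torch.distributed as dist

# Assumes dist.init_process_group("nccl", ...) has already run.
world_size = dist.get_world_size()
input_buffer = torch.ones(4, device="cuda")
# Output is interpreted as a contiguous collection of world_size chunks,
# each the size of input_buffer.
output_buffer = torch.empty(4 * world_size, device="cuda")
dist.all_gather_into_tensor(output_buffer, input_buffer)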
@@ -226,7 +226,7 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder {
 }

 // Gathers a single tensor inputBuffer into a single buffer outputBuffer that
-// is interpreted as a contigious collection of size inputBuffer * WORLD_SIZE.
+// is interpreted as a contiguous collection of size inputBuffer * WORLD_SIZE.
 // For implementers of ProcessGroup API and advanced users only.
 // Note: this function will be deprecated in near future.
 virtual c10::intrusive_ptr<Work> _allgather_base(
@@ -71,8 +71,8 @@ struct WorkEntry {
 // MPI_THREAD_SERIALIZED, ProcessGroupMPI will only support a singe process
 // group. In other words, no more than 1 process group can be created globally.
 //
-// If you would like to use multiple ProcessGroupMPI, it requres your MPI
-// implemenation to have a thread support value of MPI_THREAD_MULTIPLE, that is,
+// If you would like to use multiple ProcessGroupMPI, it requires your MPI
+// implementation to have a thread support value of MPI_THREAD_MULTIPLE, that is,
 // multiple threads may call MPI, with no restriction.
 //
 // Also note that ProcessGroupMPI only supports a single Tensor operation. In
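The thread-support requirement in this hunk can be checked at runtime. A sketch using mpi4py (an assumption for illustration; the C++ code queries MPI directly):

from mpi4py import MPI  # assumes mpi4py is installed

# ProcessGroupMPI needs MPI_THREAD_SERIALIZED for one global group and
# MPI_THREAD_MULTIPLE before more than one group may be created.
provided = MPI.Query_thread()
if provided < MPI.THREAD_MULTIPLE:
    print("MPI thread support too low for multiple ProcessGroupMPI groups")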
@@ -229,7 +229,7 @@ class TORCH_API ProcessGroupMPI : public Backend {
 c10::intrusive_ptr<Work> barrier(
     const BarrierOptions& opts = BarrierOptions()) override;

-// Creating a new ProcessGroupMPI, will initiialize MPI if not initialized
+// Creating a new ProcessGroupMPI, will initialize MPI if not initialized
 static c10::intrusive_ptr<ProcessGroupMPI> createProcessGroupMPI(
     std::vector<int> ranks = {});
@@ -499,7 +499,7 @@ void ProcessGroupNCCL::WorkNCCL::synchronizeInternal(
 // So explicitly abort ncclComms here before throwing this timed out
 // exception to users, after this, ncclCommWatchdog can detect nccl
 // communicators are aborted and clean up devNCCLCommMap_ accordingly.
-// if throwing timed out excepiton without aborting nccl communicators
+// if throwing timed out exception without aborting nccl communicators
 // here, it was observed that CUDA GPU will have 100% utilization and
 // can not run new events successfully.
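The timeout discussed in this hunk is the per-collective deadline configured when the process group is created; once it fires, the communicators are aborted before the exception propagates, for the reason the comment gives. A hedged sketch of setting it from Python (assumes rank and world size come from the usual environment variables):

import datetime
import torch.distributed as dist

# Collectives that exceed this deadline make synchronizeInternal throw the
# timed out exception described above, after aborting the NCCL communicators.
dist.init_process_group(backend="nccl", timeout=datetime.timedelta(minutes=5))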
@@ -136,7 +136,7 @@ class TORCH_API ProcessGroupNCCL : public Backend {
 void synchronizeStreams();

 // Helper function used in CUDA Stream callbacks to complete WorkNCCL
-// objects and throw exceptions when neeeded.
+// objects and throw exceptions when needed.
 void handleNCCLGuard(ErrorHandlingMode asyncErrorHandling);

 // Helper function that checks if the NCCL kernels have finished
@@ -497,7 +497,7 @@ class TORCH_API ProcessGroupNCCL : public Backend {

 // Helper that encapsulates work shared across point-to-point communication
 // primitives. It is the same structure as the helper used for collective
-// communicaiton primitives.
+// communication primitives.
 template <typename Fn>
 c10::intrusive_ptr<Work> pointToPoint(
     std::vector<at::Tensor>& tensor,
@@ -1494,7 +1494,7 @@ Arguments:
     processGroup,
     "Options",
     R"(
-Base class for all processs group options implementations, such as the nccl
+Base class for all processes group options implementations, such as the nccl
 options :class:`~torch.distributed.ProcessGroupNCCL.Options`).
 )")
     .def(
@@ -2096,7 +2096,7 @@ Example::
 ``fut.then()`` will return another ``CUDAFuture`` that holds the return value of the
 callback and a ``CUDAEvent`` that recorded the callback stream.

-1. For CPU work, ``fut.done()`` returns true when work has been complted and value()
+1. For CPU work, ``fut.done()`` returns true when work has been completed and value()
    tensors are ready.
 2. For GPU work, ``fut.done()`` returns true only whether the operation has been enqueued.
 3. For mixed CPU-GPU work (e.g. sending GPU tensors with GLOO), ``fut.done()`` returns
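The CPU/GPU distinction in the docstring above matters when chaining callbacks. A hedged sketch (assumes an initialized NCCL group and a CUDA tensor; `get_future` exists on the returned work handle):

import torch
import torch.distributed as dist

t = torch.ones(8, device="cuda")
work = dist.all_reduce(t, async_op=True)
fut = work.get_future()
# For GPU work, done() only means the op was enqueued on the stream, not that
# the result is ready; then() chains a callback that runs once it completes.
chained = fut.then(lambda f: f.value()[0] * 2)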
@@ -69,7 +69,7 @@ class TORCH_API Logger {
 );
 // Set stats that can be collected only during
 // training loop. It is called at the beginning of forward call
-// to record the run time stats of sampled iterations that previouly ran.
+// to record the run time stats of sampled iterations that previously ran.
 // GPU performance stats are collected only for single process
 // single device program and single device module right now.
 // TODO to support single process multiple devices and multi device modules,
@@ -1178,7 +1178,7 @@ void Reducer::initialize_bucket_views(Reducer::Bucket& bucket) {
 if (grad.defined() && !grad.is_alias_of(bucket_view)) {
   bucket_view.copy_(grad);
   grad = bucket_view;
-  // The grad is modefied and needs to be written back.
+  // The grad is modified and needs to be written back.
   return true;
 }
 // The grad is not modified and does not need to be written back.
@@ -73,8 +73,8 @@ class TORCH_API Reducer {
 // a call to this function can simply be omitted.
 void prepare_for_backward(const std::vector<at::Tensor>& outputs);

-// Called at the begginning of forward() inside DistributedDataParallel,
-// right now it caputures the starting time of forward in each iteration.
+// Called at the beginning of forward() inside DistributedDataParallel,
+// right now it captures the starting time of forward in each iteration.
 void prepare_for_forward();

 // Returns the relative time in nanoseconds when gradients were ready,
@@ -153,7 +153,7 @@ class TORCH_API Reducer {

 // An function for users to set sample_rate of collecting
 // runtime stats. The time stats will be recorded for the
-// first 10 iterations, after 10 iteratons time stats will be
+// first 10 iterations, after 10 iterations time stats will be
 // recorded once every "sample_rate" training iterations.
 void set_ddp_runtime_logging_sample_rate(int sample_rate);
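The sampling policy in this hunk (record every one of the first 10 iterations, then one in every `sample_rate`) keeps timing overhead low in long runs. A sketch of the described policy, not the actual Reducer code:

def should_record_stats(iteration: int, sample_rate: int) -> bool:
    # Record each of the first 10 iterations, then once every
    # sample_rate training iterations thereafter.
    if iteration < 10:
        return True
    return iteration % sample_rate == 0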
@@ -504,7 +504,7 @@ class TORCH_API Reducer {
 // Retrieves parameter names that have not been marked as ready as part of
 // previous iteration.
 std::vector<std::string> getUnmarkedParamsForIteration();
-// Retrives parameter indices that have not been marked as ready as part of
+// Retrieves parameter indices that have not been marked as ready as part of
 // previous iteration.
 std::vector<size_t> getUnmarkedParamIndicesForIteration();
 // Raises appropriate error if mark_variable_ready is called on the same
@@ -98,7 +98,7 @@ enum MessageType {
 // to determine how to serialize them. This design is helpful for
 // communicating super large tensors where serializing all the data at
 // once leads to excessively large memory footprint. An implementation
-// can then serialize and send tensors chunck-by-chunk, in the streaming
+// can then serialize and send tensors chunk-by-chunk, in the streaming
 // fashion.
 // type (MessageType): type of the message.
 // id (int64_t): message id, this is used to match request and response.
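The streaming design described above avoids ever materializing one huge serialized buffer. A generic Python sketch of chunk-by-chunk serialization (illustrative only; the real implementation is backend-specific):

def serialize_in_chunks(payload: bytes, chunk_size: int):
    # Yield the payload chunk-by-chunk so a very large tensor never has to
    # occupy one contiguous serialized buffer in memory.
    for offset in range(0, len(payload), chunk_size):
        yield payload[offset:offset + chunk_size]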
@@ -76,7 +76,7 @@ TORCH_API extern mutexType currentStateStackEntryMutex;

 // This class is used to implement a stack of ``State``s.
 // It has 2 members.
-// One is `prevPtr`, a shared_ptr poiniting to previous elememnt in the
+// One is `prevPtr`, a shared_ptr pointing to previous element in the
 // stack.
 // The other is ``statePtr``, a shared_ptr pointing to ``State``.
 class StateStackEntry {
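The `prevPtr`/`statePtr` layout above is a classic linked stack. A Python analogue of the structure, purely for illustration:

class StackEntry:
    # prev plays the role of prevPtr, state the role of statePtr; pushing
    # creates a new entry that points back at the previous top.
    def __init__(self, state, prev=None):
        self.state = state
        self.prev = prev

top = StackEntry("outer")
top = StackEntry("inner", prev=top)  # push
top = top.prev                       # pop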
@@ -14,7 +14,7 @@ c10::intrusive_ptr<JitFuture> RequestCallback::operator()(
     std::vector<c10::Stream> streams) const {
   // NB: cannot clear autograd context id here because the processMessage method
   // might pause waiting for all RRefs in the arguments to be confirmed by their
-  // owners and resumne processing in a different thread. Hence, the
+  // owners and resume processing in a different thread. Hence, the
   // thread_local context id needs to be set and cleared in the thread that
   // indeed carries out the processing logic.
   return processMessage(request, std::move(streams));
@@ -125,7 +125,7 @@ c10::intrusive_ptr<JitFuture> RequestCallbackImpl::runPythonFunction(
     return asFuture(std::current_exception());
   }

-  // After sync exection or failed async execution return the value as-is.
+  // After sync execution or failed async execution return the value as-is.
   if (pythonRpcHandler.isRemoteException(result) || !isAsyncExecution) {
     return asFuture(
         c10::ivalue::ConcretePyObjectHolder::create(result),
@@ -78,7 +78,7 @@ c10::intrusive_ptr<JitFuture> RequestCallbackNoPython::processMessage(
   // of 10us.
   auto serverProcessGlobalProfilerStateStackEntryPtr =
       profiler::processglobal::StateStackEntry::current();
-  // If server global profiler is enabled, we futher pay the
+  // If server global profiler is enabled, we further pay the
   // cost of thread local profiler state initialization.
   if (serverProcessGlobalProfilerStateStackEntryPtr) {
     // Initialize thread-local profiler state from process-global
@@ -178,7 +178,7 @@ void RpcAgent::retryExpiredRpcs() {
   }

   // If there are no more RPC's set to be retried at the current timepoint,
-  // we can remove the corresponsing unordered_set from the retry map.
+  // we can remove the corresponding unordered_set from the retry map.
   if (earliestRpcList.empty()) {
     rpcRetryMap_.erase(earliestTimeout);
   }
@@ -32,7 +32,7 @@ using steady_clock_time_point =
     std::chrono::time_point<std::chrono::steady_clock>;
 // Input is qualified name string, output is JIT StrongTypePtr
 // Same as jit::TypeResolver, did not import jit::TypeResolver to here
-// because it could instroduce cyclic dependencies.
+// because it could introduce cyclic dependencies.
 using TypeResolver =
     std::function<c10::StrongTypePtr(const c10::QualifiedName&)>;
@@ -153,7 +153,7 @@ class TORCH_API RpcAgent {
     const DeviceMap& deviceMap = {}) = 0;

 // Retries sending the message up to maxRetries times until an ACK is
-// receieved. The duration between consecutive sends is increased over
+// received. The duration between consecutive sends is increased over
 // time using an exponential backoff algorithm.
 //
 // Sends ``message`` to the ``RpcAgent`` of id ``to`` and returns a
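The retry scheme above grows the wait between resends exponentially until an ACK arrives. A sketch of such a backoff schedule (parameter names are illustrative, not the RpcAgent API):

def retry_delays(base_ms: float, max_retries: int, backoff: float = 2.0):
    # The duration between consecutive sends increases exponentially until
    # an ACK is received or max_retries is exhausted.
    delay = base_ms
    for _ in range(max_retries):
        yield delay
        delay *= backoff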
@@ -232,7 +232,7 @@ class TORCH_API RpcAgent {
 // Retrieve metrics as KV map
 virtual std::unordered_map<std::string, std::string> getMetrics() = 0;

-// Retrive debug info in addition to metrics as KV map
+// Retrieve debug info in addition to metrics as KV map
 virtual std::unordered_map<std::string, std::string> getDebugInfo();

 // Flag to control whether GIL wait times
@@ -180,7 +180,7 @@ class TORCH_API RRefContext {
 // been confirmed (i.e. is no longer in the pendingUsers_ map).
 c10::intrusive_ptr<RRef> getPendingUser(const ForkId& forkId);

-// Start recroding new pending UserRRefs. All pending UserRRefs introduced
+// Start recording new pending UserRRefs. All pending UserRRefs introduced
 // after this point will be put into the thread_local userTable_, which will
 // then be consumed and cleared in waitForThreadLocalPendingRRefs().
 void recordThreadLocalPendingRRefs();
@@ -264,7 +264,7 @@ class TORCH_API RRefContext {
     RRefId::Hash>
     forks_;

-// This cond var is used by deleteAllUsers(), a event notificaton is sent if
+// This cond var is used by deleteAllUsers(), a event notification is sent if
 // number of pending UserRRef or UserRRef children is reduced, or
 // number of owned OwnerRRef is reduced.
 std::condition_variable deleteAllUsersCV_;
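The condition variable described here implements a wait-until-drained handshake: deleteAllUsers() blocks until the pending counts reach zero, and every decrement sends a notification. A Python analogue of the pattern with threading.Condition (names are illustrative):

import threading

cv = threading.Condition()
pending_users = 3

def on_user_deleted():
    global pending_users
    with cv:
        pending_users -= 1
        cv.notify_all()  # notification sent when a pending count is reduced

def delete_all_users():
    with cv:
        cv.wait_for(lambda: pending_users == 0)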
@@ -111,7 +111,7 @@ class TORCH_API PythonRRefFetchRet final : public RRefFetchRet {
     const Message& message);
 };

-// UserRRef (regardless it's the creator or not) uses this message to notiify
+// UserRRef (regardless it's the creator or not) uses this message to notify
 // OwnerRRef on delete.
 class TORCH_API RRefUserDelete final : public ForkMessageBase {
  public:
@@ -15,7 +15,7 @@ using torch::jit::Operator;
 // A ScriptRemoteCall instance represents an invocation of `dist.remote` on a
 // builtin operator. Currently, it does not support using RRef as arguments yet.
 // Besides the operator and a vector of arguments, ScriptRemoteCall also
-// caontains the RRefId and the ForkId of the return value RRef.
+// contains the RRefId and the ForkId of the return value RRef.
 class TORCH_API ScriptRemoteCall final : public ScriptCall {
  public:
  // Constructor for builitin operator call.