mirror of
https://github.com/zebrajr/pytorch.git
synced 2025-12-07 12:21:27 +01:00
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49322 In some cases async execution might lose dependencies (alias-like ops) or produce suboptimal scheduling when there is a choice of which parts to schedule first. An example of the latter behavior can happen in ModelParallel training, where a copy can get lower priority compared to the rest of the execution on the given GPU, which will cause other GPUs to starve. This operator allows one to address these issues by introducing extra explicit dependencies between ops. Test Plan: Unit-test/ E2E testing in the future diffs. Reviewed By: xianjiec Differential Revision: D24933471 fbshipit-source-id: 1668994c7856d73926cde022378a99e1e8db3567
31 lines
904 B
C++
31 lines
904 B
C++
#ifndef CAFFE2_OPERATORS_ASYNC_BARRIER_OP_H_
#define CAFFE2_OPERATORS_ASYNC_BARRIER_OP_H_

#include "caffe2/core/context.h"
#include "caffe2/core/export_caffe2_op_to_c10.h"
#include "caffe2/core/operator.h"

namespace caffe2 {

// Scheduling barrier for async_scheduling nets.
//
// This operator performs no computation; its only purpose is to introduce
// explicit dependency edges between ops so that async_scheduling will
// schedule certain operations earlier than others.
//
// Example where this operator works well: a mixture of data-parallel and
// model-parallel training, where one wants to force all copies to be
// started before the data-parallel part starts.
template <class Context>
class AsyncNetBarrierOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  USE_SIMPLE_CTOR_DTOR(AsyncNetBarrierOp)

  // No-op body: returning true just reports successful execution; the
  // barrier's effect comes entirely from its input/output dependencies.
  bool RunOnDevice() override {
    return true;
  }
};

} // namespace caffe2

#endif // CAFFE2_OPERATORS_ASYNC_BARRIER_OP_H_