mirror of
https://github.com/zebrajr/pytorch.git
synced 2025-12-06 12:20:52 +01:00
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/37681
By passing by value, we can std::move and avoid unnecessarily copying args that are part of any std::function/lambda state (e.g. in the JIT interpreter, there is a std::vector<> stack passed in the InterpreterContinuation). This also makes the API consistent with e.g. folly and best practices.
Added a minor at::launch() benchmark to test/cpp/; the difference is mostly noticeable when copying the std::function<> internal args is non-trivial.
Benchmarks pre/post (min over ~5 runs): NoData: 5.81 us -> 5.63 us (-3.2%); WithData(0): 6.67 us -> 5.88 us (-11.8%); WithData(4): 6.98 us -> 6.51 us (-6.7%); WithData(256): 9.44 us -> 7.89 us (-16.5%).
ghstack-source-id: 103322321
Test Plan: - perf: buck run mode/opt caffe2/test/cpp/api:parallel_benchmark pre/post - correctness: buck test mode/dev-nosan caffe2/test/...
Reviewed By: dzhulgakov
Differential Revision: D21355148
fbshipit-source-id: 3567e730845106f1991091e4a892d093e00571c3
89 lines
2.1 KiB
C++
89 lines
2.1 KiB
C++
#include <torch/torch.h>
|
|
#include <chrono>
|
|
#include <condition_variable>
|
|
#include <mutex>
|
|
|
|
// One-shot completion signal: wait() blocks until post() has been called.
// Once posted the baton stays posted; there is no way to reset it.
class Baton {
 public:
  // Marks the baton as done and wakes every thread currently blocked in wait().
  void post() {
    std::lock_guard<std::mutex> guard(lock_);
    done_ = true;
    // Notify while lock_ is still held: a waiter cannot leave wait() until
    // this function has released the lock, i.e. after notify_all() is issued.
    cv_.notify_all();
  }

  // Blocks the caller until post() has been called. Returns immediately if
  // the baton was already posted.
  void wait() {
    std::unique_lock<std::mutex> guard(lock_);
    // The predicate overload re-checks done_ after every wakeup, so spurious
    // wakeups never let the caller through early.
    cv_.wait(guard, [this] { return done_; });
  }

 private:
  std::mutex lock_;             // guards done_
  std::condition_variable cv_;  // signalled by post()
  bool done_{false};            // set by post(), never cleared
};
|
|
|
|
void AtLaunch_Base(int32_t numIters) {
|
|
struct Helper {
|
|
explicit Helper(int32_t lim) : limit_(lim) {}
|
|
void operator()() {
|
|
if (++val_ == limit_) {
|
|
done.post();
|
|
} else {
|
|
at::launch([this]() { (*this)(); });
|
|
}
|
|
}
|
|
int val_{0};
|
|
int limit_;
|
|
Baton done;
|
|
};
|
|
Helper h(numIters);
|
|
auto start = std::chrono::system_clock::now();
|
|
h();
|
|
h.done.wait();
|
|
std::cout << "NoData "
|
|
<< static_cast<double>(
|
|
std::chrono::duration_cast<std::chrono::microseconds>(
|
|
std::chrono::system_clock::now() - start)
|
|
.count()) /
|
|
static_cast<double>(numIters)
|
|
<< " usec/each\n";
|
|
}
|
|
|
|
void AtLaunch_WithData(int32_t numIters, int32_t vecSize) {
|
|
struct Helper {
|
|
explicit Helper(int32_t lim) : limit_(lim) {}
|
|
void operator()(std::vector<int32_t> v) {
|
|
if (++val_ == limit_) {
|
|
done.post();
|
|
} else {
|
|
at::launch([this, v = std::move(v)]() { (*this)(v); });
|
|
}
|
|
}
|
|
int val_{0};
|
|
int limit_;
|
|
Baton done;
|
|
};
|
|
Helper h(numIters);
|
|
std::vector<int32_t> v(vecSize, 0);
|
|
auto start = std::chrono::system_clock::now();
|
|
h(v);
|
|
h.done.wait();
|
|
std::cout << "WithData(" << vecSize << "): "
|
|
<< static_cast<double>(
|
|
std::chrono::duration_cast<std::chrono::microseconds>(
|
|
std::chrono::system_clock::now() - start)
|
|
.count()) /
|
|
static_cast<double>(numIters)
|
|
<< " usec/each\n";
|
|
}
|
|
|
|
// Entry point: runs the payload-free benchmark once, then the payload
// benchmark for each vector size in the table, all with the same iteration
// count. Command-line arguments are not used.
int main(int argc, char** argv) {
  const int32_t iterations = 1000000;
  const int32_t payloadSizes[] = {0, 4, 256};

  AtLaunch_Base(iterations);
  for (const int32_t size : payloadSizes) {
    AtLaunch_WithData(iterations, size);
  }
  return 0;
}
|