pytorch/test/cpp/api/parallel_benchmark.cpp
Jeremy Lilley 468a9d448e [aten] Pass std::function<> to thread_pool by value, instead of const ref. (#37681)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/37681

By passing by value, we can std::move the callable and avoid unnecessarily
copying args that are part of any std::function/lambda state (e.g. in the jit
interpreter, there is a std::vector<> stack passed in the
InterpreterContinuation).

This also makes the API consistent with e.g. folly and with best practices.
Added a minor at::launch() benchmark to test/cpp/; the difference is
mostly noticeable when copying the std::function<> internal args is
non-trivial.

Benchmarks pre/post (min over ~5 runs)
NoData: 5.81 us -> 5.63 us (-3.2%)
WithData(0): 6.67 us -> 5.88 us (-11.8%)
WithData(4): 6.98 us -> 6.51 us (-6.7%)
WithData(256): 9.44 us -> 7.89 us (-16.5%)

ghstack-source-id: 103322321

Test Plan:
- perf: buck run mode/opt caffe2/test/cpp/api:parallel_benchmark pre/post
- correctness: buck test mode/dev-nosan caffe2/test/...

Reviewed By: dzhulgakov

Differential Revision: D21355148

fbshipit-source-id: 3567e730845106f1991091e4a892d093e00571c3
2020-05-05 08:41:38 -07:00

89 lines
2.1 KiB
C++

#include <torch/torch.h>
#include <chrono>
#include <condition_variable>
#include <mutex>
/// Minimal one-shot signal built on a mutex and a condition variable.
///
/// wait() blocks until post() has been called. The flag is never reset, so
/// once posted every later wait() returns without blocking.
class Baton {
 public:
  /// Sets the "done" flag and wakes every thread currently blocked in wait().
  void post() {
    // The flag is written and the waiters are notified while the mutex is
    // held, so a waiter can only observe the flag after this scope unlocks.
    std::lock_guard<std::mutex> guard(lock_);
    done_ = true;
    cv_.notify_all();
  }

  /// Blocks the calling thread until the "done" flag has been set by post().
  void wait() {
    std::unique_lock<std::mutex> guard(lock_);
    // The predicate overload re-checks the flag after every wakeup, which
    // absorbs spurious wakeups.
    cv_.wait(guard, [this] { return done_; });
  }

 private:
  std::mutex lock_;             // guards done_
  std::condition_variable cv_;  // signalled by post()
  bool done_{false};            // true once post() has run
};
/// Benchmarks a chain of at::launch() hops whose closure captures nothing but
/// a pointer, and prints the mean elapsed time per iteration to std::cout.
///
/// @param numIters Number of times the task body runs before the chain
///                 signals completion. Expected to be positive: it is also
///                 the divisor of the reported mean.
void AtLaunch_Base(int32_t numIters) {
  // Self-rescheduling task: every invocation bumps the counter and either
  // signals completion or schedules itself again through at::launch().
  struct Helper {
    explicit Helper(int32_t lim) : limit_(lim) {}
    void operator()() {
      // `>=` rather than `==`: identical for any positive limit, but a zero
      // or negative limit now ends the chain on the first call instead of
      // re-launching until the counter overflows.
      if (++val_ >= limit_) {
        done.post();
      } else {
        // Capturing `this` relies on the Helper outliving the chain; the
        // enclosing function guarantees that by blocking on done.wait().
        at::launch([this]() { (*this)(); });
      }
    }
    int val_{0};  // invocations so far
    int limit_;   // invocation count at which the chain stops
    Baton done;   // posted by the final invocation
  };

  Helper h(numIters);
  // steady_clock is monotonic, so the measured interval cannot be skewed (or
  // turned negative) by a wall-clock adjustment the way system_clock can.
  const auto start = std::chrono::steady_clock::now();
  h();
  h.done.wait();
  // Take the end timestamp before doing any output so that stream I/O is not
  // counted as part of the benchmark.
  const auto elapsed = std::chrono::duration_cast<std::chrono::microseconds>(
      std::chrono::steady_clock::now() - start);
  std::cout << "NoData "
            << static_cast<double>(elapsed.count()) /
          static_cast<double>(numIters)
            << " usec/each\n";
}
/// Benchmarks a chain of at::launch() hops whose closure carries a
/// std::vector<int32_t> payload, and prints the mean elapsed time per
/// iteration to std::cout.
///
/// @param numIters Number of times the task body runs before the chain
///                 signals completion. Expected to be positive: it is also
///                 the divisor of the reported mean.
/// @param vecSize  Element count of the zero-filled payload vector. It is
///                 converted to the vector's unsigned size type, so it must
///                 not be negative.
void AtLaunch_WithData(int32_t numIters, int32_t vecSize) {
  // Self-rescheduling task: every invocation bumps the counter and either
  // signals completion or schedules itself again, handing the payload on.
  struct Helper {
    explicit Helper(int32_t lim) : limit_(lim) {}
    void operator()(std::vector<int32_t> v) {
      // `>=` rather than `==`: identical for any positive limit, but a zero
      // or negative limit now ends the chain on the first call instead of
      // re-launching until the counter overflows.
      if (++val_ >= limit_) {
        done.post();
      } else {
        // The payload is moved into the closure. The closure is not
        // `mutable`, so `v` is const inside it and the nested call copies the
        // vector once per hop; that copy is left in place so results stay
        // comparable with earlier runs of this benchmark.
        // Capturing `this` relies on the Helper outliving the chain; the
        // enclosing function guarantees that by blocking on done.wait().
        at::launch([this, v = std::move(v)]() { (*this)(v); });
      }
    }
    int val_{0};  // invocations so far
    int limit_;   // invocation count at which the chain stops
    Baton done;   // posted by the final invocation
  };

  Helper h(numIters);
  std::vector<int32_t> v(vecSize, 0);
  // steady_clock is monotonic, so the measured interval cannot be skewed (or
  // turned negative) by a wall-clock adjustment the way system_clock can.
  const auto start = std::chrono::steady_clock::now();
  h(v);
  h.done.wait();
  // Take the end timestamp before doing any output so that stream I/O is not
  // counted as part of the benchmark.
  const auto elapsed = std::chrono::duration_cast<std::chrono::microseconds>(
      std::chrono::steady_clock::now() - start);
  std::cout << "WithData(" << vecSize << "): "
            << static_cast<double>(elapsed.count()) /
          static_cast<double>(numIters)
            << " usec/each\n";
}
/// Benchmark driver: runs the payload-free benchmark once, then the payload
/// benchmark for each configured vector size, all with the same iteration
/// count. The command-line arguments are not used.
int main(int argc, char** argv) {
  constexpr int32_t kIterations = 1000000;
  constexpr int32_t kPayloadSizes[] = {0, 4, 256};

  AtLaunch_Base(kIterations);
  for (const int32_t payloadSize : kPayloadSizes) {
    AtLaunch_WithData(kIterations, payloadSize);
  }
  return 0;
}