mirror of
https://github.com/zebrajr/pytorch.git
synced 2025-12-07 00:21:07 +01:00
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/14268 Removes the need for Context in Tensor by doing simple dispatch for CopyBytes. It'd eventually be subsumed by Roy Li's changes of proper copy_ op, but before that is done, let's get a clear logic of how copies are implemented and clean up some cruft in CopyFrom implementation. Note that with these changes, one can probably get rid of Context::CopyFromCPU/CopyToCPU, but it's a matter for follow-up diffs. This diff doesn't change the API of Tensor yet, but relies on the fact that passing `Context` to CopyFrom makes copy async if the device is CUDA and doesn't have any effect otherwise (that's how Context methods are implemented). This doesn't change semantics of copy async implementation - as before it blindly calls cudaMemcpyAsync which probably means that it can be misused if invoked separately outside of operator body. I'll leave it for the follow-up copy_ unification. For Extend() we always do async copy - it makes sense as it's an in-place device-device operation and only any further op would be observable. Note: there are now three ways of invoking copy in C2 code - templated CopyBytes, virtual CopyFromCPU/etc, and double-dispatch free method here. Hopefully we can get rid of the second one. Also, please advise whether it's c10-worthy :) Reviewed By: ezyang Differential Revision: D13117987 fbshipit-source-id: a6772d6dcf3effaf06717da3a656fc9873b310b5
65 lines
1.5 KiB
C++
65 lines
1.5 KiB
C++
#include "caffe2/core/context.h"
|
|
|
|
#include <atomic>
|
|
#if defined(_MSC_VER)
|
|
#include <process.h>
|
|
#endif
|
|
|
|
namespace caffe2 {
|
|
|
|
// Builds a 32-bit seed by mixing three inputs, each scaled by its own prime:
// a per-process call counter, the process id, and the current system_clock
// reading split into whole seconds and leftover microseconds. All arithmetic
// is unsigned 32-bit and therefore wraps modulo 2^32.
//
// Originally copied from folly::randomNumberSeed (at 418ad4),
// modified to use chrono instead of sys/time.h.
uint32_t RandomNumberSeed() {
  // Advances on every call, so the counter term differs between calls even
  // when the clock reading does not.
  static std::atomic<uint32_t> callCounter(0);

  constexpr uint32_t kCounterPrime = 51551;
  constexpr uint32_t kPidPrime = 61631;
  constexpr uint32_t kSecondsPrime = 64997;
  constexpr uint32_t kMicrosPrime = 111857;
  constexpr uint64_t kMicrosPerSecond = 1000000;

  const auto sinceEpoch = std::chrono::system_clock::now().time_since_epoch();
  const auto totalMicros = static_cast<uint64_t>(
      std::chrono::duration_cast<std::chrono::microseconds>(sinceEpoch)
          .count());

  // Narrowing to 32 bits is intentional: only the low bits feed the mix.
  const auto wholeSeconds =
      static_cast<uint32_t>(totalMicros / kMicrosPerSecond);
  const auto leftoverMicros =
      static_cast<uint32_t>(totalMicros % kMicrosPerSecond);

  uint32_t seed = kCounterPrime * callCounter.fetch_add(1);
  seed += kPidPrime * static_cast<uint32_t>(getpid());
  seed += kSecondsPrime * wholeSeconds;
  seed += kMicrosPrime * leftoverMicros;
  return seed;
}
|
|
|
|
namespace {
|
|
// Copies `nbytes` bytes from `src` to `dst` with memcpy.
// A zero-length request is a no-op and does not inspect either pointer;
// for any other length both pointers are checked with CAFFE_ENFORCE
// before the copy is performed.
inline void CopyBytesImpl(size_t nbytes, const void* src, void* dst) {
  if (nbytes != 0) {
    CAFFE_ENFORCE(src);
    CAFFE_ENFORCE(dst);
    memcpy(dst, src, nbytes);
  }
}
|
|
|
|
void CopyBytesWrapper(
|
|
size_t nbytes,
|
|
const void* src,
|
|
Device src_device,
|
|
void* dst,
|
|
Device dst_device) {
|
|
CopyBytesImpl(nbytes, src, dst);
|
|
}
|
|
} // namespace
|
|
|
|
void CPUContext::CopyBytesSameDevice(
|
|
size_t nbytes,
|
|
const void* src,
|
|
void* dst) {
|
|
CopyBytesImpl(nbytes, src, dst);
|
|
}
|
|
|
|
} // namespace caffe2
|
|
|
|
namespace at {
|
|
|
|
REGISTER_CONTEXT(DeviceType::CPU, caffe2::CPUContext);
|
|
|
|
REGISTER_COPY_BYTES_FUNCTION(
|
|
DeviceType::CPU,
|
|
DeviceType::CPU,
|
|
caffe2::CopyBytesWrapper);
|
|
} // namespace at
|