[caffe2] EnforceFinite: log blobs finiteness in workspace on error (#52892)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/52892

When an EnforceFinite check fails, this logs every tensor blob in the workspace along with whether it is finite.

This is a little bit hacky since it uses the ATen APIs. I've `ifdef`ed the implementation so it should still compile fine on xplat and mobile. It also accesses the workspace directly, but since this is a logging op it seems fine to bend the rules.

Test Plan:
$ buck test //caffe2/caffe2/python/operator_test:enforce_finite_op_test

  $ buck-out/gen/caffe2/caffe2/python/operator_test/enforce_finite_op_test#binary.par
  I0225 16:29:46.166507 311548 enforce_finite_op.h:62] blob X isfinite=false

Reviewed By: dzhulgakov

Differential Revision: D26626336

fbshipit-source-id: f68e219b910a7242f2e72bb4d734c3e84f46eec5
This commit is contained in:
Tristan Rice 2021-02-26 16:43:52 -08:00 committed by Facebook GitHub Bot
parent 10087337c7
commit 94e23e51c4

View File

@ -13,8 +13,8 @@ class EnforceFiniteOp final : public Operator<Context> {
public:
USE_OPERATOR_CONTEXT_FUNCTIONS;
template <class... Args>
explicit EnforceFiniteOp(Args&&... args)
: Operator<Context>(std::forward<Args>(args)...) {}
explicit EnforceFiniteOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<Context>(operator_def, ws), ws_(ws) {}
bool RunOnDevice() override {
return DispatchHelper<TensorTypes<float, double>>::call(this, Input(0));
@ -24,6 +24,7 @@ class EnforceFiniteOp final : public Operator<Context> {
bool DoRunWithType();
private:
Workspace* ws_;
Tensor buffer_{CPU};
template <typename T>
@ -32,14 +33,40 @@ class EnforceFiniteOp final : public Operator<Context> {
auto size = input.numel();
for (auto i = 0; i < size; i++) {
auto isfinite = std::isfinite(input_data[i]);
if (!isfinite) {
LogBlobFiniteness();
}
CAFFE_ENFORCE_FINITE(
std::isfinite(input_data[i]),
isfinite,
"Index ",
i,
" is not finite (e.g., NaN, Inf): ",
input_data[i]);
}
}
// LogBlobFiniteness sums every tensor in the workspace and logs whether it's finite or not.
void LogBlobFiniteness() {
// This uses the aten interfaces to compute the sum and finiteness of the
// tensors which are not present by default on xplat and mobile builds.
#if defined(EXPOSE_C2_OPS) || \
!defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
for (const std::string& blob_name : ws_->Blobs()) {
try {
const auto& blob = ws_->GetBlob(blob_name);
if (blob != nullptr && blob->IsType<Tensor>()) {
Tensor* c2Tensor = blob->GetMutable<Tensor>();
const at::Tensor& tensor = static_cast<at::Tensor>(*c2Tensor);
bool blob_finite = tensor.sum().isfinite().cpu().data_ptr<bool>()[0];
LOG(INFO) << "blob " << blob_name << " isfinite=" << (blob_finite ? "true" : "false");
}
} catch (const std::exception& ex) {
LOG(ERROR) << "failed to check finiteness for " << blob_name << ": " << ex.what();
}
}
#endif
}
};
} // namespace caffe2