[caffe2] EnforceFinite: log blobs finiteness in workspace on error (#52892)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/52892

When an EnforceFinite check fails, this logs all of the tensors in the workspace and whether they are finite or not. This is a little bit hacky since it uses the aten APIs. I've `ifdef`ed the implementation so it should compile fine on xplat and mobile. It also accesses the workspace directly, but since this is a logging op it seems fine to bend the rules.

Test Plan:
$ buck test //caffe2/caffe2/python/operator_test:enforce_finite_op_test
$ buck-out/gen/caffe2/caffe2/python/operator_test/enforce_finite_op_test#binary.par
I0225 16:29:46.166507 311548 enforce_finite_op.h:62] blob X isfinite=false

Reviewed By: dzhulgakov

Differential Revision: D26626336

fbshipit-source-id: f68e219b910a7242f2e72bb4d734c3e84f46eec5
2025-12-06 12:20:52 +01:00 · 2021-02-26 16:43:52 -08:00 · 2021-02-26 16:43:52 -08:00 · 94e23e51c4
commit 94e23e51c4
parent 10087337c7
1 changed files with 30 additions and 3 deletions
--- a/caffe2/operators/enforce_finite_op.h
+++ b/caffe2/operators/enforce_finite_op.h
@ -13,8 +13,8 @@ class EnforceFiniteOp final : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  template <class... Args>
-  explicit EnforceFiniteOp(Args&&... args)
-      : Operator<Context>(std::forward<Args>(args)...) {}
+  explicit EnforceFiniteOp(const OperatorDef& operator_def, Workspace* ws)
+      : Operator<Context>(operator_def, ws), ws_(ws) {} // ws is kept in ws_ so LogBlobFiniteness() can enumerate the workspace blobs.

  bool RunOnDevice() override {
    return DispatchHelper<TensorTypes<float, double>>::call(this, Input(0));
@ -24,6 +24,7 @@ class EnforceFiniteOp final : public Operator<Context> {
  bool DoRunWithType();

 private:
+  Workspace* ws_;
  Tensor buffer_{CPU};

  template <typename T>
@ -32,14 +33,40 @@ class EnforceFiniteOp final : public Operator<Context> {
    auto size = input.numel();

    for (auto i = 0; i < size; i++) {
+      auto isfinite = std::isfinite(input_data[i]);
+      if (!isfinite) {
+        LogBlobFiniteness();
+      }
      CAFFE_ENFORCE_FINITE(
-          std::isfinite(input_data[i]),
+        isfinite,
          "Index ",
          i,
          " is not finite (e.g., NaN, Inf): ",
          input_data[i]);
    }
  }
+
+  // Diagnostic helper: walks every blob in the workspace and, for each one
+  // that holds a Tensor, logs whether the sum of that tensor is finite. Only
+  // the sum is tested, not each individual element. Any std::exception raised
+  // while checking a blob is logged at ERROR and the scan moves on to the
+  // next blob.
+  void LogBlobFiniteness() {
+    // The sum/isfinite computation goes through the aten interfaces, which
+    // are not present by default on xplat and mobile builds; when the
+    // condition below is false this function compiles to a no-op.
+#if defined(EXPOSE_C2_OPS) || \
+    (!defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE))
+    for (const std::string& name : ws_->Blobs()) {
+      try {
+        auto* const entry = ws_->GetBlob(name);
+        // Skip missing blobs and blobs that do not hold a Tensor.
+        if (entry == nullptr || !entry->IsType<Tensor>()) {
+          continue;
+        }
+        // View the caffe2 tensor as an aten tensor so it can be reduced.
+        const at::Tensor aten_view =
+            static_cast<at::Tensor>(*entry->GetMutable<Tensor>());
+        // Bring the scalar result to CPU before reading it from host code.
+        const at::Tensor verdict = aten_view.sum().isfinite().cpu();
+        const char* const label =
+            verdict.data_ptr<bool>()[0] ? "true" : "false";
+        LOG(INFO) << "blob " << name << " isfinite=" << label;
+      } catch (const std::exception& err) {
+        LOG(ERROR) << "failed to check finiteness for " << name << ": "
+                   << err.what();
+      }
+    }
+#endif
+  }
 };

 } // namespace caffe2