More nogil-unsafe API fixes (#137142)

Covers the PyDict APIs and confirms that no update is needed for the PyModule ones.
The rest was already covered in https://github.com/pytorch/pytorch/pull/136899
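
A minimal sketch (not part of this diff; the function name is illustrative) of the lookup pattern the PR switches to: PyDict_GetItem returns a borrowed reference that another thread may invalidate on free-threaded builds, while PyDict_GetItemRef (CPython 3.13+, or via pythoncapi_compat) returns a new, owned reference plus a tri-state result code, so a missing key and a lookup error are distinguishable.

#include <Python.h>

static PyObject* lookup_owned(PyObject* dict, PyObject* key) {
    PyObject* value = NULL;
    int rc = PyDict_GetItemRef(dict, key, &value);
    if (rc < 0) {
        return NULL;      /* the lookup itself failed; a Python error is set */
    }
    if (rc == 0) {
        Py_RETURN_NONE;   /* key not present; value was set to NULL */
    }
    return value;         /* rc == 1: the caller owns this reference */
}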

Pull Request resolved: https://github.com/pytorch/pytorch/pull/137142
Approved by: https://github.com/eqy, https://github.com/Skylion007
Author: albanD
Date: 2024-10-04 21:56:32 +00:00
Committed by: PyTorch MergeBot
Commit: 88e54de219
Parent: e27c0048db
18 changed files with 1018 additions and 136 deletions

View File

@ -26,7 +26,7 @@ fi
export SCRIPT_HELPERS_DIR=$SCRIPT_PARENT_DIR/win-test-helpers
set +ex
grep -E -R 'PyLong_(From|As)(Unsigned|)Long\(' --exclude=python_numbers.h --exclude=eval_frame.c torch/
grep -E -R 'PyLong_(From|As)(Unsigned|)Long\(' --exclude=python_numbers.h --exclude=pythoncapi_compat.h --exclude=eval_frame.c torch/
PYLONG_API_CHECK=$?
if [[ $PYLONG_API_CHECK == 0 ]]; then
echo "Usage of PyLong_{From,As}{Unsigned}Long API may lead to overflow errors on Windows"

View File

@ -2380,7 +2380,14 @@ class CppPythonBindingsCodeCache(CppCodeCache):
iss >> addr;
_torchinductor_pyobject_tensor_data_ptr =
reinterpret_cast<decltype(_torchinductor_pyobject_tensor_data_ptr)>(addr);
return PyModule_Create(&py_module);
PyObject* module = PyModule_Create(&py_module);
if (module == NULL) {
return NULL;
}
#ifdef Py_GIL_DISABLED
PyUnstable_Module_SetGIL(module, Py_MOD_GIL_NOT_USED);
#endif
return module;
}
"""
)
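
A self-contained sketch, not taken from the diff, of the module-init pattern applied throughout this PR ("example" and the empty method table are placeholders): create the module, then declare to free-threaded CPython that it does not rely on the GIL. PyUnstable_Module_SetGIL and Py_MOD_GIL_NOT_USED are only defined when Py_GIL_DISABLED is.

#include <Python.h>

static struct PyModuleDef example_module = {
    PyModuleDef_HEAD_INIT, "example", NULL, -1, NULL};

PyMODINIT_FUNC PyInit_example(void) {
    PyObject* module = PyModule_Create(&example_module);
    if (module == NULL) {
        return NULL;
    }
#ifdef Py_GIL_DISABLED
    /* Tell the free-threaded runtime this module is safe without the GIL. */
    PyUnstable_Module_SetGIL(module, Py_MOD_GIL_NOT_USED);
#endif
    return module;
}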

View File

@ -1678,6 +1678,10 @@ PyObject* initModule() {
PyModuleDef_HEAD_INIT, "torch._C", nullptr, -1, methods.data()};
module = PyModule_Create(&torchmodule);
ASSERT_TRUE(module);
#ifdef Py_GIL_DISABLED
PyUnstable_Module_SetGIL(module, Py_MOD_GIL_NOT_USED);
#endif
ASSERT_TRUE(THPGenerator_init(module));
ASSERT_TRUE(THPException_init(module));
THPSize_init(module);

View File

@ -367,9 +367,12 @@ void ConcretePyInterpreterVTable::python_dispatcher(
}
c10::DispatchKey k = ks.highestPriorityTypeId();
// TODO: allow this to be non-owning
auto handler = py::reinterpret_borrow<py::object>(
PyDict_GetItem(cache.ptr(), py::cast(k).ptr()));
PyObject* raw_handler = nullptr;
if (PyDict_GetItemRef(cache.ptr(), py::cast(k).ptr(), &raw_handler) < 0) {
// A real error occurred, not just a missing key
throw python_error();
}
auto handler = py::reinterpret_steal<py::object>(raw_handler);
if (handler.ptr() == nullptr) {
// Slow path
handler = torch_api_function_overload.attr("_get_dispatch")(k);

View File

@ -32,9 +32,15 @@ void PyAnomalyMetadata::print_stack(const std::string& current_node_name) {
if (!PyDict_Check(dict())) {
throw std::runtime_error("Anomaly metadata is not a python dictionary.");
}
PyObject* trace_stack = PyDict_GetItemString(dict(), ANOMALY_TRACE_KEY);
PyObject* trace_stack = nullptr;
if (PyDict_GetItemStringRef(dict(), ANOMALY_TRACE_KEY, &trace_stack) < 0) {
throw python_error();
}
_print_stack(trace_stack, current_node_name, false);
PyObject* pyparent(PyDict_GetItemString(dict(), ANOMALY_PARENT_KEY));
PyObject* pyparent = nullptr;
if (PyDict_GetItemStringRef(dict(), ANOMALY_PARENT_KEY, &pyparent) < 0) {
throw python_error();
}
// if there is no "parent_" in the metadata, this node is the root, so stop
// printing the traceback
@ -52,12 +58,18 @@ void PyAnomalyMetadata::print_stack(const std::string& current_node_name) {
throw python_error();
}
const std::string parent_name(parent_name_char);
PyObject* parent_stack =
PyDict_GetItemString(parent_metadata.get(), ANOMALY_TRACE_KEY);
PyObject* parent_stack = nullptr;
if (PyDict_GetItemStringRef(
parent_metadata.get(), ANOMALY_TRACE_KEY, &parent_stack) < 0) {
throw python_error();
}
_print_stack(parent_stack, parent_name, true);
// get the parent of this node, if this node is a root, pyparent is simply
// null
pyparent = PyDict_GetItemString(parent_metadata.get(), ANOMALY_PARENT_KEY);
if (PyDict_GetItemStringRef(
parent_metadata.get(), ANOMALY_PARENT_KEY, &pyparent) < 0) {
throw python_error();
}
}
}
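
An illustrative sketch (not part of the diff; fetch_field is a made-up helper) of the string-keyed variant used above: PyDict_GetItemStringRef keeps the old "NULL means the key is absent" convention for the output pointer, but reports lookup errors separately through its return value and hands back an owned reference that the caller must eventually release.

static int fetch_field(PyObject* dict, const char* name, PyObject** out) {
    *out = NULL;
    if (PyDict_GetItemStringRef(dict, name, out) < 0) {
        return -1;    /* a real error, not just a missing key */
    }
    /* *out is either NULL (key absent) or a new reference owned by the caller */
    return 0;
}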

View File

@ -62,10 +62,13 @@ bool _call_hooks(PyObject* dict, PyObject* args) {
// So, we use `PyDict_Values` which returns a new reference to the values
// i.e. we hold the reference to the hooks till we have iterated over them.
// Reference: https://github.com/pytorch/pytorch/issues/58354
auto hooks = THPObjectPtr{PyDict_Values(dict)};
bool is_modified = false;
const auto len = PyList_Size(hooks);
for (Py_ssize_t idx = 0; idx < len; ++idx) {
// Note that this call is NoGil safe as the list is created just above and
// not accessible by any other thread
const auto hook = PyList_GetItem(hooks, idx);
THPObjectPtr res(PyObject_CallObject(hook, args));
@ -176,30 +179,36 @@ auto PyFunctionPostHook::operator()(
void PyFunctionTensorPreHook::compiled_args(CompiledNodeArgs& args) {
PyObject *key = nullptr, *value = nullptr;
Py_ssize_t pos = 0;
Py_BEGIN_CRITICAL_SECTION(dict);
while (PyDict_Next(dict, &pos, &key, &value)) {
Py_INCREF(value);
args.add_tensor_pre_hook(
c10::SafePyObject(value, getPyInterpreter()),
static_cast<int>(value_idx));
}
Py_END_CRITICAL_SECTION();
}
void PyFunctionPreHook::compiled_args(CompiledNodeArgs& args) {
PyObject *key = nullptr, *value = nullptr;
Py_ssize_t pos = 0;
Py_BEGIN_CRITICAL_SECTION(dict);
while (PyDict_Next(dict, &pos, &key, &value)) {
Py_INCREF(value);
args.add_pre_hook(c10::SafePyObject(value, getPyInterpreter()));
}
Py_END_CRITICAL_SECTION();
}
void PyFunctionPostHook::compiled_args(CompiledNodeArgs& args) {
PyObject *key = nullptr, *value = nullptr;
Py_ssize_t pos = 0;
Py_BEGIN_CRITICAL_SECTION(dict);
while (PyDict_Next(dict, &pos, &key, &value)) {
Py_INCREF(value);
args.add_post_hook(c10::SafePyObject(value, getPyInterpreter()));
}
Py_END_CRITICAL_SECTION();
}
PyFunctionTensorPostAccGradHooks::PyFunctionTensorPostAccGradHooks(
@ -231,11 +240,13 @@ void PyFunctionTensorPostAccGradHooks::compiled_args(
torch::dynamo::autograd::CompiledNodeArgs& args) {
PyObject *key = nullptr, *value = nullptr;
Py_ssize_t pos = 0;
Py_BEGIN_CRITICAL_SECTION(dict);
while (PyDict_Next(dict, &pos, &key, &value)) {
Py_INCREF(value);
c10::SafePyObject hook_obj(value, getPyInterpreter());
args.add_post_acc_grad_hook(std::move(hook_obj));
}
Py_END_CRITICAL_SECTION();
}
void PyFunctionTensorPostAccGradHooks::apply_with_saved(
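
A hedged sketch (visit_value is a hypothetical callback, not from the diff) of the traversal pattern added above: PyDict_Next yields borrowed references and is not safe against concurrent mutation on free-threaded builds, so the loop runs inside a per-object critical section (a no-op on GIL builds, available on CPython 3.13+ or via pythoncapi_compat) and each value is INCREF'd before ownership is handed off, mirroring how the hooks above wrap values in c10::SafePyObject.

static void visit_dict_values(PyObject* dict, void (*visit_value)(PyObject*)) {
    PyObject *key = NULL, *value = NULL;
    Py_ssize_t pos = 0;
    Py_BEGIN_CRITICAL_SECTION(dict);
    while (PyDict_Next(dict, &pos, &key, &value)) {
        Py_INCREF(value);       /* take ownership before the value escapes the loop */
        visit_value(value);     /* callback is assumed to steal the reference */
    }
    Py_END_CRITICAL_SECTION();
}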

View File

@ -390,9 +390,11 @@ PyObject* THCPModule_cudaJiteratorCompileAndLaunchKernel(
PyObject* key = nullptr;
PyObject* value = nullptr;
Py_ssize_t pos = 0;
Py_BEGIN_CRITICAL_SECTION(kwargs_o);
while (PyDict_Next(kwargs_o, &pos, &key, &value)) {
extra_args.emplace_back(as_scalar(value));
}
Py_END_CRITICAL_SECTION();
c10::SmallVector<at::Tensor> outputs = at::cuda::CompileAndLaunchKernel(
code_string,

View File

@ -899,6 +899,10 @@ PyObject* torch_c_dynamo_eval_frame_init(void) {
return NULL;
}
#ifdef Py_GIL_DISABLED
PyUnstable_Module_SetGIL(module, Py_MOD_GIL_NOT_USED);
#endif
#if IS_PYTHON_3_11_PLUS
if (PyType_Ready(&THPPyInterpreterFrameType) < 0) {
return NULL;

View File

@ -3702,6 +3702,10 @@ PyObject* torch_c_dynamo_guards_init() {
if (m == nullptr)
return nullptr;
#ifdef Py_GIL_DISABLED
PyUnstable_Module_SetGIL(m, Py_MOD_GIL_NOT_USED);
#endif
Py_INCREF(&TensorGuardsType);
if (PyModule_AddObject(m, "TensorGuards", (PyObject*)&TensorGuardsType) < 0) {
Py_DECREF(&TensorGuardsType);

View File

@ -38,6 +38,9 @@ void initDynamoBindings(PyObject* torch) {
if (dynamo == nullptr || PyModule_AddObject(torch, "_dynamo", dynamo) != 0) {
throw python_error();
}
#ifdef Py_GIL_DISABLED
PyUnstable_Module_SetGIL(dynamo, Py_MOD_GIL_NOT_USED);
#endif
PyObject* eval_frame = torch_c_dynamo_eval_frame_init();
if (eval_frame == nullptr ||

View File

@ -820,7 +820,15 @@ static PyObject* set_autograd_compiler(PyObject* dummy, PyObject* args) {
}
PyObject* torch_c_dynamo_compiled_autograd_init() {
return PyModule_Create(&_module);
PyObject* mod = PyModule_Create(&_module);
if (mod == nullptr) {
return nullptr;
}
#ifdef Py_GIL_DISABLED
PyUnstable_Module_SetGIL(mod, Py_MOD_GIL_NOT_USED);
#endif
return mod;
}
} // namespace torch::dynamo::autograd

View File

@ -25,6 +25,10 @@ PyObject* torch_c_dynamo_utils_init() {
if (m == nullptr)
return nullptr;
#ifdef Py_GIL_DISABLED
PyUnstable_Module_SetGIL(m, Py_MOD_GIL_NOT_USED);
#endif
auto py_m = py::handle(m).cast<py::module>();
py_m.def("is_instancemethod", is_instancemethod);
return m;

View File

@ -379,9 +379,11 @@ std::string format_invalid_args(
PyObject *key = nullptr, *value = nullptr;
Py_ssize_t pos = 0;
Py_BEGIN_CRITICAL_SECTION(given_kwargs);
while (PyDict_Next(given_kwargs, &pos, &key, &value)) {
kwargs.emplace(THPUtils_unpackString(key), value);
}
Py_END_CRITICAL_SECTION();
}
if (options.size() == 1) {

View File

@ -48,9 +48,9 @@ at::Tensor nested_tensor_ctor(
// Check whether we are dealing with lists of tensors or not
std::vector<at::Tensor> new_list(PyList_Size(data));
for (const auto i : c10::irange(PyList_Size(data))) {
PyObject* elem = PyList_GetItem(data, i);
if (THPVariable_Check(elem)) {
new_list[i] = THPVariable_Unpack(PyList_GetItem(data, i)).detach();
THPObjectPtr elem = THPObjectPtr(PyList_GetItemRef(data, i));
if (THPVariable_Check(elem.get())) {
new_list[i] = THPVariable_Unpack(elem.get()).detach();
TORCH_CHECK(
!new_list[i].is_nested(),
"We do not accept nested tensors as input to nested tensors");
@ -60,7 +60,7 @@ at::Tensor nested_tensor_ctor(
} else {
PythonArgs elem_r(r);
std::array<PyObject*, 6> elem_args = {
elem, // data
elem.get(), // data
r.args[1], // dtype
nullptr, // device (cpu)
nullptr, // no pinned memory

View File

@ -1447,6 +1447,8 @@ static Py_ssize_t find_param(FunctionSignature& signature, PyObject* name) {
PyObject* value = nullptr;
Py_ssize_t pos = 0;
// Note that this dict traversal is NoGil safe as the kwargs dict is only
// accessible within this thread.
while (PyDict_Next(kwargs, &pos, &key, &value)) {
if (!THPUtils_checkString(key)) {
throw TypeError("keywords must be strings");
@ -1518,6 +1520,8 @@ bool FunctionSignature::parse(
}
obj = PyTuple_GET_ITEM(args, arg_pos);
} else if (kwargs) {
// Note that this call is NoGil safe as it works on kwargs which are local
// to the current function call.
obj = PyDict_GetItem(kwargs, param.python_name);
for (PyObject* numpy_name : param.numpy_python_names) {
if (obj) {

View File

@ -14,8 +14,7 @@ extern "C" {
#define IS_PYTHON_3_13_PLUS PY_VERSION_HEX >= 0x030D0000
#define IS_PYTHON_3_14_PLUS PY_VERSION_HEX >= 0x030E0000
PYCAPI_COMPAT_STATIC_INLINE(int)
PyCode_GetNCellvars(PyCodeObject* code) {
static inline int PyCode_GetNCellvars(PyCodeObject* code) {
// gh-26364 added co_ncellvars to Python 3.11.0rc1
#if IS_PYTHON_3_11_PLUS
return code->co_ncellvars;
@ -24,8 +23,7 @@ PyCode_GetNCellvars(PyCodeObject* code) {
#endif
}
PYCAPI_COMPAT_STATIC_INLINE(int)
PyCode_GetNFreevars(PyCodeObject* code) {
static inline int PyCode_GetNFreevars(PyCodeObject* code) {
// gh-26364 added co_nfreevars to Python 3.11.0rc1
#if IS_PYTHON_3_11_PLUS
return code->co_nfreevars;

File diff suppressed because it is too large

View File

@ -392,7 +392,10 @@ at::Tensor tensor_from_cuda_array_interface(PyObject* obj) {
// Extract the `obj.__cuda_array_interface__['shape']` attribute
std::vector<int64_t> sizes;
{
PyObject* py_shape = PyDict_GetItemString(cuda_dict, "shape");
PyObject* py_shape = nullptr;
if (PyDict_GetItemStringRef(cuda_dict, "shape", &py_shape) < 0) {
throw python_error();
}
if (py_shape == nullptr) {
throw TypeError("attribute `shape` must exist");
}
@ -405,7 +408,10 @@ at::Tensor tensor_from_cuda_array_interface(PyObject* obj) {
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int dtype_size_in_bytes;
{
PyObject* py_typestr = PyDict_GetItemString(cuda_dict, "typestr");
PyObject* py_typestr = nullptr;
if (PyDict_GetItemStringRef(cuda_dict, "typestr", &py_typestr) < 0) {
throw python_error();
}
if (py_typestr == nullptr) {
throw TypeError("attribute `typestr` must exist");
}
@ -426,7 +432,10 @@ at::Tensor tensor_from_cuda_array_interface(PyObject* obj) {
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
void* data_ptr;
{
PyObject* py_data = PyDict_GetItemString(cuda_dict, "data");
PyObject* py_data = nullptr;
if (PyDict_GetItemStringRef(cuda_dict, "data", &py_data) < 0) {
throw python_error();
}
if (py_data == nullptr) {
throw TypeError("attribute `shape` data exist");
}
@ -450,7 +459,10 @@ at::Tensor tensor_from_cuda_array_interface(PyObject* obj) {
// Extract the `obj.__cuda_array_interface__['strides']` attribute
std::vector<int64_t> strides;
{
PyObject* py_strides = PyDict_GetItemString(cuda_dict, "strides");
PyObject* py_strides = nullptr;
if (PyDict_GetItemStringRef(cuda_dict, "strides", &py_strides) < 0) {
throw python_error();
}
if (py_strides != nullptr && py_strides != Py_None) {
if (PySequence_Length(py_strides) == -1 ||
static_cast<size_t>(PySequence_Length(py_strides)) != sizes.size()) {