diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh index 1bead317780..48f9bc4e61e 100755 --- a/.jenkins/pytorch/test.sh +++ b/.jenkins/pytorch/test.sh @@ -75,22 +75,44 @@ fi # if you're not careful. Check this if you made some changes and the # ASAN test is not working if [[ "$BUILD_ENVIRONMENT" == *asan* ]]; then + # Suppress vptr violations arising from multiple copies of pybind11 export ASAN_OPTIONS=detect_leaks=0:symbolize=1:strict_init_order=true - # We suppress the vptr volation, since we have separate copies of - # libprotobuf in both libtorch.so and libcaffe2.so, and it causes - # the following problem: - # test_cse (__main__.TestJit) ... torch/csrc/jit/export.cpp:622:38: - # runtime error: member call on address ... which does not point - # to an object of type 'google::protobuf::MessageLite' - # ...: note: object is of type 'onnx_torch::ModelProto' - # - # This problem should be solved when libtorch.so and libcaffe2.so are - # merged. export UBSAN_OPTIONS=print_stacktrace=1:suppressions=$PWD/ubsan.supp export PYTORCH_TEST_WITH_ASAN=1 export PYTORCH_TEST_WITH_UBSAN=1 # TODO: Figure out how to avoid hard-coding these paths export ASAN_SYMBOLIZER_PATH=/usr/lib/llvm-5.0/bin/llvm-symbolizer + export TORCH_USE_RTLD_GLOBAL=1 + # NB: We load libtorch.so with RTLD_GLOBAL for UBSAN, unlike our + # default behavior. + # + # The reason for this is that without RTLD_GLOBAL, if we load multiple + # libraries that depend on libtorch (as is the case with C++ extensions), we + # will get multiple copies of libtorch in our address space. When UBSAN is + # turned on, it will do a bunch of virtual pointer consistency checks which + # won't work correctly. 
When this happens, you get a violation like: + # + # member call on address XXXXXX which does not point to an object of + # type 'std::_Sp_counted_base<__gnu_cxx::_Lock_policy::_S_atomic>' + # XXXXXX note: object is of type + # 'std::_Sp_counted_ptr' + # + # (NB: the textual types of the objects here are misleading, because + # they actually line up; it just so happens that there's two copies + # of the type info floating around in the address space, so they + # don't pointer compare equal. See also + # https://github.com/google/sanitizers/issues/1175) + # + # UBSAN is kind of right here: if we relied on RTTI across C++ extension + # modules they would indeed do the wrong thing; but in our codebase, we + # don't use RTTI (because it doesn't work in mobile). To appease + # UBSAN, however, it's better if we ensure all the copies agree! + # + # By the way, an earlier version of this code attempted to load + # libtorch_python.so with LD_PRELOAD, which has a similar effect of causing + # it to be loaded globally. This isn't really a good idea though, because + # it depends on a ton of dynamic libraries that most programs aren't gonna + # have, and it applies to child processes. export LD_PRELOAD=/usr/lib/llvm-5.0/lib/clang/5.0.0/lib/linux/libclang_rt.asan-x86_64.so # Increase stack size, because ASAN red zones use more stack ulimit -s 81920 diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index dd7af693901..2d4619ac048 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -1085,6 +1085,32 @@ if(USE_CUDA) endif() +# Note [Global dependencies] +# Some libraries (e.g. OpenMPI) like to dlopen plugins after they're initialized, +# and they assume that all of their symbols will be available in the global namespace. +# On the other hand we try to be good citizens and avoid polluting the symbol +# namespaces, so libtorch is loaded with all its dependencies in a local scope. 
+# That usually leads to missing symbol errors at run-time, so to avoid a situation like +# this we have to preload those libs in a global namespace. +add_library(torch_global_deps SHARED ${TORCH_SRC_DIR}/csrc/empty.c) +set_target_properties(torch_global_deps PROPERTIES LINKER_LANGUAGE C) +if (USE_MPI) + target_link_libraries(torch_global_deps ${MPI_CXX_LIBRARIES}) +endif() +target_link_libraries(torch_global_deps ${MKL_LIBRARIES}) +# The CUDA libraries are linked here for a different reason: in some +# cases we load these libraries with ctypes, and if they weren't opened +# with RTLD_GLOBAL, we'll do the "normal" search process again (and +# not find them, because they're usually in non-standard locations) +if (USE_CUDA) + target_link_libraries(torch_global_deps ${TORCH_CUDA_LIBRARIES}) + target_link_libraries(torch_global_deps ${Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS}) + target_link_libraries(torch_global_deps torch::cudart) +endif() + +install(TARGETS torch_global_deps DESTINATION "${TORCH_INSTALL_LIB_DIR}") + + # ---[ Caffe2 HIP sources. if(USE_ROCM) # Call again since Caffe2_HIP_INCLUDE is extended with ATen include dirs. diff --git a/torch/__init__.py b/torch/__init__.py index 70a3ddbfe66..8b240675a6b 100644 --- a/torch/__init__.py +++ b/torch/__init__.py @@ -13,8 +13,10 @@ on an NVIDIA GPU with compute capability >= 3.0. import os import sys import platform +import ctypes from ._utils import _import_dotted_name -from ._utils_internal import get_file_path, prepare_multiprocessing_environment +from ._utils_internal import get_file_path, prepare_multiprocessing_environment, \ + USE_RTLD_GLOBAL_WITH_LIBTORCH from .version import __version__ from ._six import string_classes as _string_classes @@ -33,38 +35,54 @@ __all__ = [ # Load the extension module ################################################################################ -# Loading the extension with RTLD_GLOBAL option allows to not link extension -# modules against the _C shared object. 
Their missing THP symbols will be -# automatically filled by the dynamic loader. -import os as _dl_flags - -# if we have numpy, it *must* be imported before the call to setdlopenflags() -# or there is risk that later c modules will segfault when importing numpy -try: - import numpy as _np -except ImportError: - pass - if platform.system() == 'Windows': - # first get nvToolsExt PATH - def get_nvToolsExt_path(): - NVTOOLEXT_HOME = _dl_flags.getenv('NVTOOLSEXT_PATH', 'C:\\Program Files\\NVIDIA Corporation\\NvToolsExt') + NVTOOLSEXT_PATH = os.getenv('NVTOOLSEXT_PATH', 'C:\\Program Files\\NVIDIA Corporation\\NvToolsExt') - if _dl_flags.path.exists(NVTOOLEXT_HOME): - return _dl_flags.path.join(NVTOOLEXT_HOME, 'bin', 'x64') - else: - return '' + if os.path.exists(NVTOOLSEXT_PATH): + nvtoolsext_lib_path = os.path.join(NVTOOLSEXT_PATH, 'bin', 'x64') + else: + nvtoolsext_lib_path = '' - py_dll_path = _dl_flags.path.join(sys.exec_prefix, 'Library', 'bin') - th_dll_path = _dl_flags.path.join(_dl_flags.path.dirname(__file__), 'lib') + py_dll_path = os.path.join(sys.exec_prefix, 'Library', 'bin') + th_dll_path = os.path.join(os.path.dirname(__file__), 'lib') - dll_paths = [th_dll_path, py_dll_path, get_nvToolsExt_path(), _dl_flags.environ['PATH']] + dll_paths = [th_dll_path, py_dll_path, nvtoolsext_lib_path, os.environ['PATH']] # then add the path to env - _dl_flags.environ['PATH'] = ';'.join(dll_paths) + os.environ['PATH'] = ';'.join(dll_paths) -else: - # first check if the os package has the required flags + +# See Note [Global dependencies] +def _load_global_deps(): + if platform.system() == 'Windows': + return + + lib_name = 'libtorch_global_deps' + ('.dylib' if platform.system() == 'Darwin' else '.so') + here = os.path.abspath(__file__) + lib_path = os.path.join(os.path.dirname(here), 'lib', lib_name) + + ctypes.CDLL(lib_path, mode=ctypes.RTLD_GLOBAL) + + +if (USE_RTLD_GLOBAL_WITH_LIBTORCH or os.getenv('TORCH_USE_RTLD_GLOBAL')) and \ + platform.system() != 'Windows': + # Do 
it the hard way. You might want to load libtorch with RTLD_GLOBAL in a + # few circumstances: + # + # 1. You're in a build environment (e.g., fbcode) where + # libtorch_global_deps is not available, but you still need + # to get mkl to link in with RTLD_GLOBAL or it will just + # not work. + # + # 2. You're trying to run PyTorch under UBSAN and you need + # to ensure that only one copy of libtorch is loaded, so + # vptr checks work properly + # + # If you're using this setting, you must verify that all the libraries + # you load consistently use the same libstdc++, or you may have + # mysterious segfaults. + # + import os as _dl_flags if not hasattr(_dl_flags, 'RTLD_GLOBAL') or not hasattr(_dl_flags, 'RTLD_LAZY'): try: # next try if DLFCN exists @@ -72,22 +90,26 @@ else: except ImportError: # as a last attempt, use compile-time constants import torch._dl as _dl_flags - old_flags = sys.getdlopenflags() sys.setdlopenflags(_dl_flags.RTLD_GLOBAL | _dl_flags.RTLD_LAZY) + from torch._C import * + sys.setdlopenflags(old_flags) + del old_flags + del _dl_flags -del _dl_flags - -from torch._C import * +else: + # Easy way. You want this most of the time, because it will prevent + # C++ symbols from libtorch clobbering C++ symbols from other + # libraries, leading to mysterious segfaults. 
+ # + # See Note [Global dependencies] + _load_global_deps() + from torch._C import * __all__ += [name for name in dir(_C) if name[0] != '_' and not name.endswith('Base')] -if platform.system() != 'Windows': - sys.setdlopenflags(old_flags) - del old_flags - ################################################################################ # Define basic utilities ################################################################################ diff --git a/torch/_utils_internal.py b/torch/_utils_internal.py index 43aa5802b48..c289e9952bf 100644 --- a/torch/_utils_internal.py +++ b/torch/_utils_internal.py @@ -54,3 +54,4 @@ def get_source_lines_and_file(obj): TEST_MASTER_ADDR = '127.0.0.1' TEST_MASTER_PORT = 29500 +USE_RTLD_GLOBAL_WITH_LIBTORCH = False diff --git a/torch/csrc/empty.c b/torch/csrc/empty.c new file mode 100644 index 00000000000..e69de29bb2d diff --git a/ubsan.supp b/ubsan.supp index 5a826e920b4..62e64b785b9 100644 --- a/ubsan.supp +++ b/ubsan.supp @@ -1,3 +1 @@ -vptr:libtorch.so vptr:libtorch_python.so -vptr:libcaffe2.so