Revert "Nvfuser code removal (#111093)"

This reverts commit 572628e520.

Reverted https://github.com/pytorch/pytorch/pull/111093 on behalf of https://github.com/jeanschmidt due to Breaking internal builds, @albanD please help to support the author with the next steps to get this diff merged ([comment](https://github.com/pytorch/pytorch/pull/111093#issuecomment-1771434853))
PyTorch MergeBot 2023-10-19 17:39:49 +00:00
parent ca5f6f7af3
commit 715dfced72
12 changed files with 188 additions and 4 deletions

@@ -615,7 +615,7 @@ test_libtorch_jit() {
   # Run jit and lazy tensor cpp tests together to finish them faster
   if [[ "$BUILD_ENVIRONMENT" == *cuda* && "$TEST_CONFIG" != *nogpu* ]]; then
-    LTC_TS_CUDA=1 python test/run_test.py --cpp --verbose -i cpp/test_jit cpp/test_lazy
+    LTC_TS_CUDA=1 python test/run_test.py --cpp --verbose -i cpp/test_jit cpp/nvfuser_tests cpp/test_lazy
   else
     # CUDA tests have already been skipped when CUDA is not available
     python test/run_test.py --cpp --verbose -i cpp/test_jit cpp/test_lazy -k "not CUDA"

@@ -197,6 +197,9 @@ option(USE_TSAN "Use Thread Sanitizer" OFF)
 option(USE_CUDA "Use CUDA" ON)
 cmake_dependent_option(
     BUILD_LAZY_CUDA_LINALG "Build cuda linalg ops as separate library" ON "USE_CUDA AND LINUX AND BUILD_PYTHON" OFF)
+cmake_dependent_option(
+    BUILD_NVFUSER "Build NVFUSER" ON
+    "USE_CUDA OR USE_ROCM" OFF)
 cmake_dependent_option(USE_ROCM "Use ROCm" ON "LINUX" OFF)
 option(CAFFE2_STATIC_LINK_CUDA "Statically link CUDA libraries" OFF)
 cmake_dependent_option(
@@ -1203,6 +1206,19 @@ if(BUILD_JNI)
   add_subdirectory(android/pytorch_android)
 endif()
 
+if(NOT USE_CUDA AND NOT USE_ROCM)
+  set(BUILD_NVFUSER OFF CACHE BOOL "BUILD nvfuser" FORCE)
+endif()
+
+if(BUILD_NVFUSER)
+  if(DEFINED ENV{NVFUSER_SOURCE_DIR})
+    add_subdirectory($ENV{NVFUSER_SOURCE_DIR} nvfuser)
+  else()
+    add_subdirectory(third_party/nvfuser nvfuser)
+  endif()
+  add_compile_definitions(BUILD_NVFUSER)
+endif()
+
 include(cmake/Summary.cmake)
 caffe2_print_configuration_summary()
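
Aside (not part of the diff): the two CMake hunks above make BUILD_NVFUSER default to ON whenever USE_CUDA or USE_ROCM is set, take the nvfuser sources from $ENV{NVFUSER_SOURCE_DIR} when that variable is defined (falling back to third_party/nvfuser), and export a BUILD_NVFUSER compile definition. A minimal, self-contained sketch of observing that definition from C++; the program itself is hypothetical:

#include <iostream>

int main() {
  // BUILD_NVFUSER is injected by add_compile_definitions(BUILD_NVFUSER)
  // in the hunk above whenever the nvfuser build is enabled.
#ifdef BUILD_NVFUSER
  std::cout << "built with nvfuser support\n";
#else
  std::cout << "built without nvfuser support\n";
#endif
  return 0;
}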

@@ -255,6 +255,7 @@ core_sources_full_mobile_no_backend_interface_xplat = [
     "torch/csrc/jit/passes/constant_propagation.cpp",
     "torch/csrc/jit/passes/restore_mutation.cpp",
     "torch/csrc/jit/passes/create_autodiff_subgraphs.cpp",
+    "torch/csrc/jit/passes/cuda_graph_fuser.cpp",
     "torch/csrc/jit/passes/dead_code_elimination.cpp",
     "torch/csrc/jit/passes/eliminate_no_ops.cpp",
     "torch/csrc/jit/passes/remove_redundant_profiles.cpp",

@@ -186,6 +186,9 @@
 #   NCCL_INCLUDE_DIR
 #     specify where nccl is installed
 #
+#   NVFUSER_SOURCE_DIR
+#     specify nvfuser root directory
+#
 #   NVTOOLSEXT_PATH (Windows only)
 #     specify where nvtoolsext is installed
 #
@@ -626,6 +629,11 @@ class build_ext(setuptools.command.build_ext.build_ext):
         else:
             report("-- Not using ITT")
 
+        if cmake_cache_vars["BUILD_NVFUSER"]:
+            report("-- Building nvfuser")
+        else:
+            report("-- Not Building nvfuser")
+
         # Do not use clang to compile extensions if `-fstack-clash-protection` is defined
         # in system CFLAGS
         c_flags = str(os.getenv("CFLAGS", ""))
@@ -725,6 +733,22 @@ class build_ext(setuptools.command.build_ext.build_ext):
                     os.makedirs(dst_dir)
                 self.copy_file(src, dst)
 
+        # Copy nvfuser extension
+        for i, ext in enumerate(self.extensions):
+            if ext.name != "nvfuser._C":
+                continue
+            fullname = self.get_ext_fullname(ext.name)
+            filename = self.get_ext_filename(fullname)
+            fileext = os.path.splitext(filename)[1]
+            src = os.path.join(os.path.dirname(filename), "nvfuser" + fileext)
+            dst = os.path.join(os.path.realpath(self.build_lib), filename)
+            if os.path.exists(src):
+                report(f"Copying {ext.name} from {src} to {dst}")
+                dst_dir = os.path.dirname(dst)
+                if not os.path.exists(dst_dir):
+                    os.makedirs(dst_dir)
+                self.copy_file(src, dst)
+
         setuptools.command.build_ext.build_ext.build_extensions(self)
 
     def get_outputs(self):
@@ -990,6 +1014,8 @@ def configure_extension_build():
         excludes.extend(["caffe2", "caffe2.*"])
     if not cmake_cache_vars["BUILD_FUNCTORCH"]:
         excludes.extend(["functorch", "functorch.*"])
+    if not cmake_cache_vars["BUILD_NVFUSER"]:
+        excludes.extend(["nvfuser", "nvfuser.*"])
     packages = find_packages(exclude=excludes)
     C = Extension(
         "torch._C",
@@ -1023,6 +1049,10 @@ def configure_extension_build():
         extensions.append(
             Extension(name="functorch._C", sources=[]),
         )
+    if cmake_cache_vars["BUILD_NVFUSER"]:
+        extensions.append(
+            Extension(name="nvfuser._C", sources=[]),
+        )
 
     cmdclass = {
         "bdist_wheel": wheel_concatenate,
@@ -1284,6 +1314,8 @@ def main():
         "include/torch/csrc/jit/tensorexpr/*.h",
         "include/torch/csrc/jit/tensorexpr/operators/*.h",
         "include/torch/csrc/jit/codegen/cuda/*.h",
+        "include/torch/csrc/jit/codegen/cuda/ops/*.h",
+        "include/torch/csrc/jit/codegen/cuda/scheduler/*.h",
         "include/torch/csrc/onnx/*.h",
         "include/torch/csrc/profiler/*.h",
         "include/torch/csrc/profiler/orchestration/*.h",
@@ -1325,6 +1357,18 @@ def main():
         "utils/model_dump/code.js",
         "utils/model_dump/*.mjs",
     ]
+    if get_cmake_cache_vars()["BUILD_NVFUSER"]:
+        torch_package_data.extend(
+            [
+                "share/cmake/nvfuser/*.cmake",
+                "include/nvfuser/*.h",
+                "include/nvfuser/kernel_db/*.h",
+                "include/nvfuser/multidevice/*.h",
+                "include/nvfuser/ops/*.h",
+                "include/nvfuser/python_frontend/*.h",
+                "include/nvfuser/scheduler/*.h",
+            ]
+        )
     if get_cmake_cache_vars()["BUILD_CAFFE2"]:
         torch_package_data.extend(

@@ -14,10 +14,43 @@ namespace jit {
 namespace fuser {
 namespace cuda {
 
+class LoadingNvfuserLibrary {
+ public:
+#ifdef USE_CUDA
+  LoadingNvfuserLibrary() {
+    std::string library_name;
+    if (const char* path = std::getenv("TORCH_NVFUSER_LIBRARY_PATH")) {
+      library_name = path;
+    }
+#if defined(_WIN32)
+    library_name += "nvfuser_codegen.dll";
+#elif defined(__APPLE__)
+    library_name += "libnvfuser_codegen.dylib";
+#else
+    library_name += "libnvfuser_codegen.so";
+#endif
+    try {
+      // NOTE: we need to refactor this to a lazy load instead. We could end up
+      // with double de-allocation with our python API loading the library.
+      // Leaking the handle should solve the problem for now
+      nvfuserLib_ = std::make_shared<at::DynamicLibrary>(
+          library_name.c_str(), nullptr, true);
+    } catch (const c10::DynamicLibraryError& e) {
+#if defined(BUILD_NVFUSER) || !defined(NDEBUG)
+      TORCH_WARN_ONCE("Loading nvfuser library failed with: ", e.msg());
+#endif
+    }
+  }
+#endif // USE_CUDA
+
+  std::shared_ptr<at::DynamicLibrary> nvfuserLib_;
+};
+
+static LoadingNvfuserLibrary loading_nvfuser_library_;
+
 static std::atomic<bool> cuda_fusion_guard_mode{true};
 
 bool isEnabled() {
+  TORCH_WARN_ONCE("torch::jit::fuser::cuda::isEnabled() is deprecated");
   return false;
 }
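
Aside (not part of the diff): the loader above runs at static-initialization time and treats TORCH_NVFUSER_LIBRARY_PATH as a raw prefix, appending the platform-specific file name verbatim, so a directory value needs a trailing separator. A hedged restatement of just that resolution step; the helper name and example path are hypothetical:

#include <cstdlib>
#include <string>

// Sketch of the name resolution done by LoadingNvfuserLibrary above.
std::string nvfuserLibraryName() {
  std::string library_name;
  if (const char* path = std::getenv("TORCH_NVFUSER_LIBRARY_PATH")) {
    library_name = path; // e.g. "/opt/nvfuser/lib/" (hypothetical); note the trailing '/'
  }
#if defined(_WIN32)
  library_name += "nvfuser_codegen.dll";
#elif defined(__APPLE__)
  library_name += "libnvfuser_codegen.dylib";
#else
  library_name += "libnvfuser_codegen.so";
#endif
  return library_name;
}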

@@ -7,6 +7,7 @@
 #include <c10/util/Optional.h>
 #include <torch/csrc/jit/ir/ir.h>
 #include <torch/csrc/jit/jit_log.h>
+#include <torch/csrc/jit/passes/cuda_graph_fuser.h>
 #include <torch/csrc/jit/passes/quantization/helper.h>
 
 #include <stack>

@@ -0,0 +1,21 @@
+#include <torch/csrc/jit/passes/cuda_graph_fuser.h>
+#include <mutex>
+
+namespace torch {
+namespace jit {
+
+static CudaFuserComparisonCallback comparison_callback = {false, nullptr};
+static std::mutex comparison_callback_lock;
+
+CudaFuserComparisonCallback getCudaFuserComparisonCallback() {
+  std::lock_guard<std::mutex> guard(comparison_callback_lock);
+  return comparison_callback;
+}
+
+void setCudaFuserComparisonCallback(CudaFuserComparisonCallback callback) {
+  std::lock_guard<std::mutex> guard(comparison_callback_lock);
+  comparison_callback = callback;
+}
+
+} // namespace jit
+} // namespace torch
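
Aside (not part of the diff): this new translation unit only stores and returns the callback under a mutex. A sketch of installing one from client code, using the CudaFuserComparisonCallback type declared in the header diff below; the logging lambda is hypothetical:

#include <torch/csrc/jit/passes/cuda_graph_fuser.h>

#include <iostream>

// Install a callback that receives the fused outputs, the fallback outputs,
// and a graph string. run_fallback = true asks the fuser to also execute the
// unfused graph so the two stacks can be compared.
void installComparisonLogger() {
  torch::jit::CudaFuserComparisonCallback cb;
  cb.run_fallback = true;
  cb.callback = [](const torch::jit::Stack& fused,
                   const torch::jit::Stack& fallback,
                   const std::string& graph_str) {
    std::cout << "comparing " << fused.size() << " fused vs "
              << fallback.size() << " fallback outputs for:\n"
              << graph_str << '\n';
  };
  torch::jit::setCudaFuserComparisonCallback(std::move(cb));
}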

@@ -0,0 +1,42 @@
+#pragma once
+
+#include <ATen/Context.h>
+#include <torch/csrc/jit/codegen/cuda/interface.h>
+#include <torch/csrc/jit/ir/ir.h>
+#include <torch/csrc/jit/passes/pass_manager.h>
+
+#include <string>
+#include <utility>
+
+namespace torch {
+namespace jit {
+
+// Register CudaFuseGraph in custom passes
+struct TORCH_API RegisterCudaFuseGraph
+    : public PassManager<RegisterCudaFuseGraph> {
+  static bool registerPass(bool enabled) {
+    TORCH_WARN(
+        "RegisterCudaFuseGraph::registerPass() is deprecated. "
+        "Please use torch::jit::fuser::cuda::setEnabled().");
+    return fuser::cuda::setEnabled(enabled);
+  }
+
+  static bool isRegistered() {
+    TORCH_WARN(
+        "RegisterCudaFuseGraph::isRegistered() is deprecated. "
+        "Please use torch::jit::fuser::cuda::isEnabled().");
+    return fuser::cuda::isEnabled();
+  }
+};
+
+struct CudaFuserComparisonCallback {
+  using callback_type =
+      std::function<void(const Stack&, const Stack&, const std::string&)>;
+  bool run_fallback;
+  callback_type callback;
+};
+
+TORCH_API CudaFuserComparisonCallback getCudaFuserComparisonCallback();
+TORCH_API void setCudaFuserComparisonCallback(CudaFuserComparisonCallback);
+
+} // namespace jit
+} // namespace torch
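
Aside (not part of the diff): as the warnings spell out, RegisterCudaFuseGraph is kept only as a deprecated shim over the fuser::cuda entry points. A minimal sketch of the two equivalent call paths; the wrapper function is hypothetical:

#include <torch/csrc/jit/passes/cuda_graph_fuser.h>

void enableCudaFuser() {
  // Deprecated path: emits TORCH_WARN, then forwards to setEnabled(true).
  torch::jit::RegisterCudaFuseGraph::registerPass(true);
  // Recommended path, per the deprecation messages above.
  torch::jit::fuser::cuda::setEnabled(true);
}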

@@ -857,6 +857,11 @@ class TensorExprFuser {
     if (device->is_cpu()) {
       return canFuseOnCPU();
     } else if (device->is_cuda()) {
+#ifndef C10_MOBILE
+      if (fuser::cuda::isEnabled()) {
+        return false;
+      }
+#endif
       return canFuseOnGPU();
     } else if (device->is_xpu()) {
       return false;

@@ -27,6 +27,7 @@
 #include <torch/csrc/jit/passes/constant_propagation.h>
 #include <torch/csrc/jit/passes/create_autodiff_subgraphs.h>
 #include <torch/csrc/jit/passes/create_functional_graphs.h>
+#include <torch/csrc/jit/passes/cuda_graph_fuser.h>
 #include <torch/csrc/jit/passes/dbr_quantization/remove_redundant_aliases.h>
 #include <torch/csrc/jit/passes/dead_code_elimination.h>
 #include <torch/csrc/jit/passes/decompose_ops.h>

@@ -14,6 +14,7 @@
 #include <torch/csrc/jit/passes/constant_pooling.h>
 #include <torch/csrc/jit/passes/constant_propagation.h>
 #include <torch/csrc/jit/passes/create_autodiff_subgraphs.h>
+#include <torch/csrc/jit/passes/cuda_graph_fuser.h>
 #include <torch/csrc/jit/passes/dead_code_elimination.h>
 #include <torch/csrc/jit/passes/decompose_ops.h>
 #include <torch/csrc/jit/passes/graph_fuser.h>
@@ -645,6 +646,13 @@ const ExecutionPlan& ProfilingGraphExecutorImpl::getOptimizedPlanFor(
   // before any other pass that could insert `prim::iprofile_value` node on
   // `aten::_grad_sum_to_size` input.
   InsertProfileNodesForSpecializeAutogradZero(pr_.get());
+  // `InsertProfileNodesForCUDAFuser` inserts profile node for non-tensor
+  // value
+#ifndef C10_MOBILE
+  if (torch::jit::fuser::cuda::isEnabled()) {
+    torch::jit::fuser::cuda::InsertProfileNodesForCUDAFuser(pr_.get());
+  }
+#endif
   GRAPH_DUMP("Profiled Graph: ", pr_->graph());
   profiling_plan_ = ExecutionPlan(pr_->graph(), function_name_);
   // fall-through

@@ -207,7 +207,13 @@ void ProfilingRecord::insertShapeProfile(
 }
 
 static bool needsProfiledInputs(Node* n) {
-  if (tensorexpr::isSupported(n)) {
+  if (tensorexpr::isSupported(n) ||
+#ifndef C10_MOBILE
+      (fuser::cuda::isEnabled() && fuser::cuda::profileNode(n))
+#else
+      false
+#endif
+  ) {
     return true;
   }
@@ -238,7 +244,13 @@ static bool needsProfiledInputs(Node* n) {
 }
 
 static bool needsProfiledOutput(Node* n) {
-  if (tensorexpr::isSupported(n)) {
+  if (tensorexpr::isSupported(n) ||
+#ifndef C10_MOBILE
+      (fuser::cuda::isEnabled() && fuser::cuda::profileNode(n))
+#else
+      false
+#endif
+  ) {
     return true;
   }
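
Aside (not part of the diff): both hunks apply the same gating pattern, where on C10_MOBILE builds the nvfuser clause collapses to a literal false and only the tensorexpr check remains. Restated as a standalone sketch with the preprocessor branches flattened into one boolean; the helper name is hypothetical, while the called functions are the ones used in the diff:

#include <torch/csrc/jit/codegen/cuda/interface.h>
#include <torch/csrc/jit/ir/ir.h>
#include <torch/csrc/jit/passes/tensorexpr_fuser.h>

// Equivalent shape of the needsProfiledInputs/needsProfiledOutput checks.
static bool wantsProfile(torch::jit::Node* n) {
  bool nvfuser_wants_it = false;
#ifndef C10_MOBILE
  nvfuser_wants_it = torch::jit::fuser::cuda::isEnabled() &&
      torch::jit::fuser::cuda::profileNode(n);
#endif
  return torch::jit::tensorexpr::isSupported(n) || nvfuser_wants_it;
}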