Revert "Revert "Nvfuser code removal (#111093)"" (#111604)

This reverts commit 715dfced72.

The original PR #111093 was reverted due to a broken internal build.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/111604
Approved by: https://github.com/davidberard98
jjsjann123 2023-10-23 18:32:41 +00:00 committed by PyTorch MergeBot
parent ce48d36324
commit 39c09d4da6
12 changed files with 4 additions and 188 deletions


@@ -615,7 +615,7 @@ test_libtorch_jit() {
  # Run jit and lazy tensor cpp tests together to finish them faster
  if [[ "$BUILD_ENVIRONMENT" == *cuda* && "$TEST_CONFIG" != *nogpu* ]]; then
-    LTC_TS_CUDA=1 python test/run_test.py --cpp --verbose -i cpp/test_jit cpp/nvfuser_tests cpp/test_lazy
+    LTC_TS_CUDA=1 python test/run_test.py --cpp --verbose -i cpp/test_jit cpp/test_lazy
  else
    # CUDA tests have already been skipped when CUDA is not available
    python test/run_test.py --cpp --verbose -i cpp/test_jit cpp/test_lazy -k "not CUDA"


@@ -197,9 +197,6 @@ option(USE_TSAN "Use Thread Sanitizer" OFF)
option(USE_CUDA "Use CUDA" ON)
cmake_dependent_option(
    BUILD_LAZY_CUDA_LINALG "Build cuda linalg ops as separate library" ON "USE_CUDA AND LINUX AND BUILD_PYTHON" OFF)
-cmake_dependent_option(
-    BUILD_NVFUSER "Build NVFUSER" ON
-    "USE_CUDA OR USE_ROCM" OFF)
cmake_dependent_option(USE_ROCM "Use ROCm" ON "LINUX" OFF)
option(CAFFE2_STATIC_LINK_CUDA "Statically link CUDA libraries" OFF)
cmake_dependent_option(
@@ -1206,19 +1203,6 @@ if(BUILD_JNI)
  add_subdirectory(android/pytorch_android)
endif()
-if(NOT USE_CUDA AND NOT USE_ROCM)
-  set(BUILD_NVFUSER OFF CACHE BOOL "BUILD nvfuser" FORCE)
-endif()
-if(BUILD_NVFUSER)
-  if(DEFINED ENV{NVFUSER_SOURCE_DIR})
-    add_subdirectory($ENV{NVFUSER_SOURCE_DIR} nvfuser)
-  else()
-    add_subdirectory(third_party/nvfuser nvfuser)
-  endif()
-  add_compile_definitions(BUILD_NVFUSER)
-endif()
include(cmake/Summary.cmake)
caffe2_print_configuration_summary()
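
Note: the deleted add_compile_definitions(BUILD_NVFUSER) line is what activated the BUILD_NVFUSER preprocessor guards in C++ sources (see the interface.cpp hunk below). A minimal sketch of that gating pattern, using a hypothetical maybeWarn helper that is not part of PyTorch:

#include <iostream>

// Compiled in only when the build system passes -DBUILD_NVFUSER
// (or in debug builds, where NDEBUG is undefined).
void maybeWarn(const char* msg) {
#if defined(BUILD_NVFUSER) || !defined(NDEBUG)
  std::cerr << "Warning: " << msg << '\n';
#endif
}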


@@ -255,7 +255,6 @@ core_sources_full_mobile_no_backend_interface_xplat = [
    "torch/csrc/jit/passes/constant_propagation.cpp",
    "torch/csrc/jit/passes/restore_mutation.cpp",
    "torch/csrc/jit/passes/create_autodiff_subgraphs.cpp",
-    "torch/csrc/jit/passes/cuda_graph_fuser.cpp",
    "torch/csrc/jit/passes/dead_code_elimination.cpp",
    "torch/csrc/jit/passes/eliminate_no_ops.cpp",
    "torch/csrc/jit/passes/remove_redundant_profiles.cpp",


@@ -189,9 +189,6 @@
#   NCCL_INCLUDE_DIR
#     specify where nccl is installed
#
-#   NVFUSER_SOURCE_DIR
-#     specify nvfuser root directory
-#
#   NVTOOLSEXT_PATH (Windows only)
#     specify where nvtoolsext is installed
#
@@ -632,11 +629,6 @@ class build_ext(setuptools.command.build_ext.build_ext):
        else:
            report("-- Not using ITT")
-        if cmake_cache_vars["BUILD_NVFUSER"]:
-            report("-- Building nvfuser")
-        else:
-            report("-- Not Building nvfuser")
        # Do not use clang to compile extensions if `-fstack-clash-protection` is defined
        # in system CFLAGS
        c_flags = str(os.getenv("CFLAGS", ""))
@@ -736,22 +728,6 @@ class build_ext(setuptools.command.build_ext.build_ext):
                    os.makedirs(dst_dir)
                self.copy_file(src, dst)
-        # Copy nvfuser extension
-        for i, ext in enumerate(self.extensions):
-            if ext.name != "nvfuser._C":
-                continue
-            fullname = self.get_ext_fullname(ext.name)
-            filename = self.get_ext_filename(fullname)
-            fileext = os.path.splitext(filename)[1]
-            src = os.path.join(os.path.dirname(filename), "nvfuser" + fileext)
-            dst = os.path.join(os.path.realpath(self.build_lib), filename)
-            if os.path.exists(src):
-                report(f"Copying {ext.name} from {src} to {dst}")
-                dst_dir = os.path.dirname(dst)
-                if not os.path.exists(dst_dir):
-                    os.makedirs(dst_dir)
-                self.copy_file(src, dst)
        setuptools.command.build_ext.build_ext.build_extensions(self)

    def get_outputs(self):
@@ -1011,8 +987,6 @@ def configure_extension_build():
        excludes.extend(["caffe2", "caffe2.*"])
    if not cmake_cache_vars["BUILD_FUNCTORCH"]:
        excludes.extend(["functorch", "functorch.*"])
-    if not cmake_cache_vars["BUILD_NVFUSER"]:
-        excludes.extend(["nvfuser", "nvfuser.*"])
    packages = find_packages(exclude=excludes)
    C = Extension(
        "torch._C",
@@ -1046,10 +1020,6 @@ def configure_extension_build():
        extensions.append(
            Extension(name="functorch._C", sources=[]),
        )
-    if cmake_cache_vars["BUILD_NVFUSER"]:
-        extensions.append(
-            Extension(name="nvfuser._C", sources=[]),
-        )
    cmdclass = {
        "bdist_wheel": wheel_concatenate,
@@ -1312,8 +1282,6 @@ def main():
        "include/torch/csrc/jit/tensorexpr/*.h",
        "include/torch/csrc/jit/tensorexpr/operators/*.h",
        "include/torch/csrc/jit/codegen/cuda/*.h",
-        "include/torch/csrc/jit/codegen/cuda/ops/*.h",
-        "include/torch/csrc/jit/codegen/cuda/scheduler/*.h",
        "include/torch/csrc/onnx/*.h",
        "include/torch/csrc/profiler/*.h",
        "include/torch/csrc/profiler/orchestration/*.h",
@@ -1355,18 +1323,6 @@ def main():
        "utils/model_dump/code.js",
        "utils/model_dump/*.mjs",
    ]
-    if get_cmake_cache_vars()["BUILD_NVFUSER"]:
-        torch_package_data.extend(
-            [
-                "share/cmake/nvfuser/*.cmake",
-                "include/nvfuser/*.h",
-                "include/nvfuser/kernel_db/*.h",
-                "include/nvfuser/multidevice/*.h",
-                "include/nvfuser/ops/*.h",
-                "include/nvfuser/python_frontend/*.h",
-                "include/nvfuser/scheduler/*.h",
-            ]
-        )
    if get_cmake_cache_vars()["BUILD_CAFFE2"]:
        torch_package_data.extend(


@@ -14,43 +14,10 @@ namespace jit {
namespace fuser {
namespace cuda {
-class LoadingNvfuserLibrary {
- public:
-#ifdef USE_CUDA
-  LoadingNvfuserLibrary() {
-    std::string library_name;
-    if (const char* path = std::getenv("TORCH_NVFUSER_LIBRARY_PATH")) {
-      library_name = path;
-    }
-#if defined(_WIN32)
-    library_name += "nvfuser_codegen.dll";
-#elif defined(__APPLE__)
-    library_name += "libnvfuser_codegen.dylib";
-#else
-    library_name += "libnvfuser_codegen.so";
-#endif
-    try {
-      // NOTE: we need to refactor this to a lazy load instead. We could end up
-      // with double de-allocation with our python API loading the library.
-      // Leaking the handle should solve the problem for now
-      nvfuserLib_ = std::make_shared<at::DynamicLibrary>(
-          library_name.c_str(), nullptr, true);
-    } catch (const c10::DynamicLibraryError& e) {
-#if defined(BUILD_NVFUSER) || !defined(NDEBUG)
-      TORCH_WARN_ONCE("Loading nvfuser library failed with: ", e.msg());
-#endif
-    }
-  }
-#endif // USE_CUDA
-  std::shared_ptr<at::DynamicLibrary> nvfuserLib_;
-};
-static LoadingNvfuserLibrary loading_nvfuser_library_;
static std::atomic<bool> cuda_fusion_guard_mode{true};

bool isEnabled() {
  TORCH_WARN_ONCE("torch::jit::fuser::cuda::isEnabled() is deprecated");
  return false;
}
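
Note: what survives in interface.cpp is a deprecation stub. A minimal caller-side sketch (nvfuserBridgeAvailable is a hypothetical name, not PyTorch API) of what dependent code observes after this commit:

#include <torch/csrc/jit/codegen/cuda/interface.h>

// Hypothetical helper: the legacy entry point now emits a one-time
// deprecation warning and always reports the fuser as disabled.
bool nvfuserBridgeAvailable() {
  return torch::jit::fuser::cuda::isEnabled(); // always false after this commit
}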


@@ -7,7 +7,6 @@
#include <c10/util/Optional.h>
#include <torch/csrc/jit/ir/ir.h>
#include <torch/csrc/jit/jit_log.h>
-#include <torch/csrc/jit/passes/cuda_graph_fuser.h>
#include <torch/csrc/jit/passes/quantization/helper.h>
#include <stack>


@@ -1,21 +0,0 @@
-#include <torch/csrc/jit/passes/cuda_graph_fuser.h>
-#include <mutex>
-
-namespace torch {
-namespace jit {
-
-static CudaFuserComparisonCallback comparison_callback = {false, nullptr};
-static std::mutex comparison_callback_lock;
-
-CudaFuserComparisonCallback getCudaFuserComparisonCallback() {
-  std::lock_guard<std::mutex> guard(comparison_callback_lock);
-  return comparison_callback;
-}
-
-void setCudaFuserComparisonCallback(CudaFuserComparisonCallback callback) {
-  std::lock_guard<std::mutex> guard(comparison_callback_lock);
-  comparison_callback = callback;
-}
-
-} // namespace jit
-} // namespace torch
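
Note: the deleted file implemented a common pattern, a mutex-guarded, process-wide callback. A self-contained sketch of the same pattern, with illustrative names that are not PyTorch API:

#include <functional>
#include <mutex>
#include <string>
#include <utility>

// A process-wide callback read and written under a lock, mirroring the
// get/set pair in the deleted cuda_graph_fuser.cpp.
struct ComparisonCallback {
  bool run_fallback = false;
  std::function<void(const std::string&)> callback;
};

static ComparisonCallback g_callback;
static std::mutex g_callback_lock;

ComparisonCallback getComparisonCallback() {
  std::lock_guard<std::mutex> guard(g_callback_lock);
  return g_callback; // return a copy so callers never race with setters
}

void setComparisonCallback(ComparisonCallback cb) {
  std::lock_guard<std::mutex> guard(g_callback_lock);
  g_callback = std::move(cb);
}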


@@ -1,42 +0,0 @@
-#pragma once
-
-#include <ATen/Context.h>
-#include <torch/csrc/jit/codegen/cuda/interface.h>
-#include <torch/csrc/jit/ir/ir.h>
-#include <torch/csrc/jit/passes/pass_manager.h>
-#include <string>
-#include <utility>
-
-namespace torch {
-namespace jit {
-
-// Register CudaFuseGraph in custom passes
-struct TORCH_API RegisterCudaFuseGraph
-    : public PassManager<RegisterCudaFuseGraph> {
-  static bool registerPass(bool enabled) {
-    TORCH_WARN(
-        "RegisterCudaFuseGraph::registerPass() is deprecated. "
-        "Please use torch::jit::fuser::cuda::setEnabled().");
-    return fuser::cuda::setEnabled(enabled);
-  }
-
-  static bool isRegistered() {
-    TORCH_WARN(
-        "RegisterCudaFuseGraph::isRegistered() is deprecated. "
-        "Please use torch::jit::fuser::cuda::isEnabled().");
-    return fuser::cuda::isEnabled();
-  }
-};
-
-struct CudaFuserComparisonCallback {
-  using callback_type =
-      std::function<void(const Stack&, const Stack&, const std::string&)>;
-  bool run_fallback;
-  callback_type callback;
-};
-
-TORCH_API CudaFuserComparisonCallback getCudaFuserComparisonCallback();
-TORCH_API void setCudaFuserComparisonCallback(CudaFuserComparisonCallback);
-
-} // namespace jit
-} // namespace torch
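
Note: out-of-tree callers of the deleted shim must switch to the replacement named in its deprecation warnings. A minimal migration sketch (enableLegacyCudaFuser is a hypothetical caller):

#include <torch/csrc/jit/codegen/cuda/interface.h>

// Before (removed in this commit):
//   torch::jit::RegisterCudaFuseGraph::registerPass(true);
// After, per the deprecation messages above:
void enableLegacyCudaFuser() {
  torch::jit::fuser::cuda::setEnabled(true);
  // With nvfuser removed, isEnabled() warns and returns false.
  bool on = torch::jit::fuser::cuda::isEnabled();
  (void)on;
}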


@@ -857,11 +857,6 @@ class TensorExprFuser {
    if (device->is_cpu()) {
      return canFuseOnCPU();
    } else if (device->is_cuda()) {
-#ifndef C10_MOBILE
-      if (fuser::cuda::isEnabled()) {
-        return false;
-      }
-#endif
      return canFuseOnGPU();
    } else if (device->is_xpu()) {
      return false;
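
Note: with the nvfuser hand-off deleted, canFuseOnGPU() alone now decides the CUDA branch above. A sketch of toggling it in a test, assuming the overrideCanFuseOnGPU knob from the legacy fuser interface:

#include <torch/csrc/jit/codegen/fuser/interface.h>

// Sketch: this toggle is now the only switch on the CUDA branch of
// TensorExprFuser's device check shown above.
void runPassWithGpuFusionDisabled() {
  torch::jit::overrideCanFuseOnGPU(false);
  // ... run the fusion pass under test ...
  torch::jit::overrideCanFuseOnGPU(true);
}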


@@ -27,7 +27,6 @@
#include <torch/csrc/jit/passes/constant_propagation.h>
#include <torch/csrc/jit/passes/create_autodiff_subgraphs.h>
#include <torch/csrc/jit/passes/create_functional_graphs.h>
-#include <torch/csrc/jit/passes/cuda_graph_fuser.h>
#include <torch/csrc/jit/passes/dbr_quantization/remove_redundant_aliases.h>
#include <torch/csrc/jit/passes/dead_code_elimination.h>
#include <torch/csrc/jit/passes/decompose_ops.h>


@@ -14,7 +14,6 @@
#include <torch/csrc/jit/passes/constant_pooling.h>
#include <torch/csrc/jit/passes/constant_propagation.h>
#include <torch/csrc/jit/passes/create_autodiff_subgraphs.h>
-#include <torch/csrc/jit/passes/cuda_graph_fuser.h>
#include <torch/csrc/jit/passes/dead_code_elimination.h>
#include <torch/csrc/jit/passes/decompose_ops.h>
#include <torch/csrc/jit/passes/graph_fuser.h>
@@ -646,13 +645,6 @@ const ExecutionPlan& ProfilingGraphExecutorImpl::getOptimizedPlanFor(
    // before any other pass that could insert `prim::iprofile_value` node on
    // `aten::_grad_sum_to_size` input.
    InsertProfileNodesForSpecializeAutogradZero(pr_.get());
-    // `InsertProfileNodesForCUDAFuser` inserts profile node for non-tensor
-    // value
-#ifndef C10_MOBILE
-    if (torch::jit::fuser::cuda::isEnabled()) {
-      torch::jit::fuser::cuda::InsertProfileNodesForCUDAFuser(pr_.get());
-    }
-#endif
    GRAPH_DUMP("Profiled Graph: ", pr_->graph());
    profiling_plan_ = ExecutionPlan(pr_->graph(), function_name_);
    // fall-through


@@ -207,13 +207,7 @@ void ProfilingRecord::insertShapeProfile(
}

static bool needsProfiledInputs(Node* n) {
-  if (tensorexpr::isSupported(n) ||
-#ifndef C10_MOBILE
-      (fuser::cuda::isEnabled() && fuser::cuda::profileNode(n))
-#else
-      false
-#endif
-  ) {
+  if (tensorexpr::isSupported(n)) {
    return true;
  }
@@ -244,13 +238,7 @@ static bool needsProfiledInputs(Node* n) {
}

static bool needsProfiledOutput(Node* n) {
-  if (tensorexpr::isSupported(n) ||
-#ifndef C10_MOBILE
-      (fuser::cuda::isEnabled() && fuser::cuda::profileNode(n))
-#else
-      false
-#endif
-  ) {
+  if (tensorexpr::isSupported(n)) {
    return true;
  }