Split cuda: list cpp files that go in _cu library explicitly (#69082)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/69082

Test Plan: Imported from OSS

Reviewed By: dagitses, mruberry

Differential Revision: D32723669

Pulled By: malfet

fbshipit-source-id: e9a815b882089dcf1ba76194c728cd7c45377deb
(cherry picked from commit 456d1ebeb8)
Author:     Peter Bell
Date:       2022-01-31 18:40:49 -08:00
Committer:  PyTorch MergeBot
Parent:     e784808bc6
Commit:     bb456d2bf7

4 changed files with 103 additions and 78 deletions
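
Note on the approach: before this commit, caffe2/CMakeLists.txt decided which files belong in torch_cuda_cu by matching filenames against a regex (the BUILD_SPLIT_CUDA block removed below); afterwards, the .cu files are still collected by glob, while the .cpp files that must be compiled into torch_cuda_cu are listed explicitly in tools/build_variables.bzl. A minimal sketch of how the pieces compose, reusing the helper macros and variable names that appear in the diff (simplified, not the verbatim build code):

    # Sketch only: .cu sources still come from a glob...
    file(GLOB native_cuda_cu "native/cuda/*.cu")
    list(APPEND ATen_CUDA_CU_SRCS ${native_cuda_cu})
    # ...plus the explicitly listed .cpp files from tools/build_variables.bzl
    append_filelist("aten_cuda_cu_source_list" ATen_CUDA_CU_SRCS)

    # Everything else is destined for torch_cuda_cpp:
    file(GLOB native_cuda_cpp "native/cuda/*.cpp")
    list(APPEND ATen_CUDA_CPP_SRCS ${native_cuda_cpp})

    # A file assigned to the _cu library must not also be built into _cpp,
    # so overlaps are filtered out:
    exclude(ATen_CUDA_CPP_SRCS "${ATen_CUDA_CPP_SRCS}" ${ATen_CUDA_CU_SRCS})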

--- a/aten/CMakeLists.txt
+++ b/aten/CMakeLists.txt

@@ -21,7 +21,8 @@ set(ATen_CPU_SRCS)
 set(ATen_CPU_TEST_SRCS)
 set(ATen_CPU_INCLUDE)
 set(ATen_THIRD_PARTY_INCLUDE)
-set(ATen_CUDA_SRCS)
+set(ATen_CUDA_CPP_SRCS)
+set(ATen_CUDA_CU_SRCS)
 set(ATen_CUDA_SRCS_W_SORT_BY_KEY)
 set(ATen_CUDA_TEST_SRCS)
 set(ATen_CUDA_INCLUDE)
@@ -95,8 +96,10 @@ add_subdirectory(src/ATen)
 # Pass source, includes, and libs to parent
 set(ATen_CPU_SRCS ${ATen_CPU_SRCS} PARENT_SCOPE)
-set(ATen_CUDA_SRCS ${ATen_CUDA_SRCS} PARENT_SCOPE)
+set(ATen_CUDA_CU_SRCS ${ATen_CUDA_CU_SRCS} PARENT_SCOPE)
+set(ATen_CUDA_CPP_SRCS ${ATen_CUDA_CPP_SRCS} PARENT_SCOPE)
 set(ATen_CUDA_SRCS_W_SORT_BY_KEY ${ATen_CUDA_SRCS_W_SORT_BY_KEY} PARENT_SCOPE)
+set(ATen_CUDA_CU_SRCS_W_SORT_BY_KEY ${ATen_CUDA_CU_SRCS_W_SORT_BY_KEY} PARENT_SCOPE)
 set(ATen_HIP_SRCS ${ATen_HIP_SRCS} PARENT_SCOPE)
 set(ATen_HIP_SRCS_W_SORT_BY_KEY ${ATen_HIP_SRCS_W_SORT_BY_KEY} PARENT_SCOPE)
 set(ATen_NVRTC_STUB_SRCS ${ATen_NVRTC_STUB_SRCS} PARENT_SCOPE)

--- a/aten/src/ATen/CMakeLists.txt
+++ b/aten/src/ATen/CMakeLists.txt

@@ -113,9 +113,7 @@ file(GLOB native_ao_sparse_h
 file(GLOB native_quantized_h "native/quantized/*.h" "native/quantized/cpu/*.h")
 file(GLOB native_cpu_h "native/cpu/*.h")
-file(GLOB native_cuda_cu_sp "native/cuda/Unique.cu" "native/cuda/TensorFactories.cu")
 file(GLOB native_cuda_cu "native/cuda/*.cu")
-exclude(native_cuda_cu "${native_cuda_cu}" ${native_cuda_cu_sp})
 file(GLOB native_cuda_cpp "native/cuda/*.cpp")
 file(GLOB native_cuda_h "native/cuda/*.h" "native/cuda/*.cuh")
 file(GLOB native_hip_h "native/hip/*.h" "native/hip/*.cuh")
@@ -189,13 +187,35 @@ endif()
 if(USE_CUDA)
   list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/cuda)
-  set(ATen_CUDA_SRCS ${ATen_CUDA_SRCS} ${cuda_cu} ${native_cuda_cu} ${native_sparse_cuda_cu} ${native_quantized_cuda_cu})
-  set(ATen_CUDA_SRCS_W_SORT_BY_KEY ${ATen_CUDA_SRCS_W_SORT_BY_KEY} ${native_cuda_cu_sp})
-  set(all_cuda_cpp ${native_sparse_cuda_cpp} ${native_quantized_cuda_cpp} ${cuda_cpp} ${native_cuda_cpp} ${cuda_generated_sources} ${ATen_CUDA_SRCS})
-  set(all_cuda_cpp ${native_cudnn_cpp} ${native_miopen_cpp} ${all_cuda_cpp})
+  list(APPEND ATen_CUDA_CU_SRCS
+    ${cuda_cu}
+    ${native_cuda_cu}
+    ${native_sparse_cuda_cu}
+    ${native_quantized_cuda_cu}
+    ${cuda_generated_sources}
+  )
+  list(APPEND ATen_CUDA_CPP_SRCS
+    ${cuda_cpp}
+    ${native_cuda_cpp}
+    ${native_cudnn_cpp}
+    ${native_miopen_cpp}
+    ${native_quantized_cuda_cpp}
+    ${native_sparse_cuda_cpp}
+  )
   if(CAFFE2_USE_CUDNN)
-    set(all_cuda_cpp ${all_cuda_cpp} ${cudnn_cpp})
+    list(APPEND ATen_CUDA_CPP_SRCS ${cudnn_cpp})
   endif()
+  append_filelist("aten_cuda_cu_source_list" ATen_CUDA_CU_SRCS)
+  append_filelist("aten_cuda_with_sort_by_key_source_list" ATen_CUDA_SRCS_W_SORT_BY_KEY)
+  append_filelist("aten_cuda_cu_with_sort_by_key_source_list" ATen_CUDA_CU_SRCS_W_SORT_BY_KEY)
+  exclude(ATen_CUDA_CPP_SRCS "${ATen_CUDA_CPP_SRCS}"
+    ${ATen_CUDA_CU_SRCS}
+    ${ATen_CUDA_SRCS_W_SORT_BY_KEY} ${ATen_CUDA_CU_SRCS_W_SORT_BY_KEY})
+  exclude(ATen_CUDA_CU_SRCS "${ATen_CUDA_CU_SRCS}"
+    ${ATen_CUDA_SRCS_W_SORT_BY_KEY} ${ATen_CUDA_CU_SRCS_W_SORT_BY_KEY})
 endif()
 if(USE_ROCM)
@@ -428,7 +448,6 @@ set(ATen_CPU_SRCS ${all_cpu_cpp})
 list(APPEND ATen_CPU_DEPENDENCY_LIBS ATEN_CPU_FILES_GEN_LIB)
 if(USE_CUDA)
-  set(ATen_CUDA_SRCS ${all_cuda_cpp})
   set(ATen_NVRTC_STUB_SRCS ${cuda_nvrtc_stub_cpp})
   list(APPEND ATen_CUDA_DEPENDENCY_LIBS ATEN_CUDA_FILES_GEN_LIB)
 endif()
@@ -512,8 +531,10 @@ list(APPEND ATen_MOBILE_BENCHMARK_SRCS
 # Pass source, includes, and libs to parent
 set(ATen_CORE_SRCS ${ATen_CORE_SRCS} PARENT_SCOPE)
 set(ATen_CPU_SRCS ${ATen_CPU_SRCS} PARENT_SCOPE)
-set(ATen_CUDA_SRCS ${ATen_CUDA_SRCS} PARENT_SCOPE)
+set(ATen_CUDA_CU_SRCS ${ATen_CUDA_CU_SRCS} PARENT_SCOPE)
+set(ATen_CUDA_CPP_SRCS ${ATen_CUDA_CPP_SRCS} PARENT_SCOPE)
 set(ATen_CUDA_SRCS_W_SORT_BY_KEY ${ATen_CUDA_SRCS_W_SORT_BY_KEY} PARENT_SCOPE)
+set(ATen_CUDA_CU_SRCS_W_SORT_BY_KEY ${ATen_CUDA_CU_SRCS_W_SORT_BY_KEY} PARENT_SCOPE)
 set(ATen_NVRTC_STUB_SRCS ${ATen_NVRTC_STUB_SRCS} PARENT_SCOPE)
 set(ATen_HIP_SRCS ${ATen_HIP_SRCS} PARENT_SCOPE)
 set(ATen_QUANTIZED_SRCS ${ATen_QUANTIZED_SRCS} PARENT_SCOPE)

--- a/caffe2/CMakeLists.txt
+++ b/caffe2/CMakeLists.txt

@@ -84,8 +84,10 @@ if(INTERN_BUILD_ATEN_OPS)
   # Add source, includes, and libs to lists
   list(APPEND Caffe2_CPU_SRCS ${ATen_CPU_SRCS})
-  list(APPEND Caffe2_GPU_SRCS ${ATen_CUDA_SRCS})
+  list(APPEND Caffe2_GPU_SRCS ${ATen_CUDA_CPP_SRCS})
   list(APPEND Caffe2_GPU_SRCS_W_SORT_BY_KEY ${ATen_CUDA_SRCS_W_SORT_BY_KEY})
+  list(APPEND Caffe2_GPU_CU_SRCS ${ATen_CUDA_CU_SRCS})
+  list(APPEND Caffe2_GPU_CU_SRCS_W_SORT_BY_KEY ${ATen_CUDA_CU_SRCS_W_SORT_BY_KEY})
   list(APPEND Caffe2_HIP_SRCS ${ATen_HIP_SRCS})
   list(APPEND Caffe2_HIP_SRCS ${ATen_HIP_SRCS_W_SORT_BY_KEY})
   list(APPEND Caffe2_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS})
@@ -171,68 +173,37 @@ endif()
 if(CAFFE2_ALLOWLISTED_FILES)
   caffe2_do_allowlist(Caffe2_CPU_SRCS CAFFE2_ALLOWLISTED_FILES)
   caffe2_do_allowlist(Caffe2_GPU_SRCS CAFFE2_ALLOWLISTED_FILES)
   caffe2_do_allowlist(Caffe2_GPU_SRCS_W_SORT_BY_KEY CAFFE2_ALLOWLISTED_FILES)
+  caffe2_do_allowlist(Caffe2_GPU_CU_SRCS CAFFE2_ALLOWLISTED_FILES)
+  caffe2_do_allowlist(Caffe2_GPU_CU_SRCS_W_SORT_BY_KEY CAFFE2_ALLOWLISTED_FILES)
   caffe2_do_allowlist(Caffe2_HIP_SRCS CAFFE2_ALLOWLISTED_FILES)
 endif()
-if(BUILD_SPLIT_CUDA)
-  # Splitting the source files that'll be in torch_cuda between torch_cuda_cu and torch_cuda_cpp
-  foreach(tmp ${Caffe2_GPU_SRCS})
-    if("${tmp}" MATCHES "(.*aten.*\\.cu|.*(b|B)las.*|.*((s|S)olver|Register.*CUDA|Legacy|THC|TensorShapeCUDA|BatchLinearAlgebra|ReduceOps|Equal|Activation|ScanKernels|Sort|TensorTopK|TensorModeKernel|IndexKernel|jit_utils).*\\.cpp)" AND NOT "${tmp}" MATCHES ".*(THC((CachingHost)?Allocator|General)).*")
-      # Currently, torch_cuda_cu will have all the .cu files in aten, as well as some others that depend on those files
-      list(APPEND Caffe2_GPU_SRCS_CU ${tmp})
-    else()
-      list(APPEND Caffe2_GPU_SRCS_CPP ${tmp})
-    endif()
-  endforeach()
-  foreach(tmp ${Caffe2_GPU_SRCS_W_SORT_BY_KEY})
-    if("${tmp}" MATCHES ".*aten.*\\.cu" AND NOT "${tmp}" MATCHES ".*TensorFactories.*")
-      list(APPEND Caffe2_GPU_SRCS_W_SORT_BY_KEY_CU ${tmp})
-    else()
-      list(APPEND Caffe2_GPU_SRCS_W_SORT_BY_KEY_CPP ${tmp})
-    endif()
-  endforeach()
-endif()
 if(PRINT_CMAKE_DEBUG_INFO)
   message(STATUS "CPU sources: ")
   foreach(tmp ${Caffe2_CPU_SRCS})
     message(STATUS " " ${tmp})
   endforeach()
-  message(STATUS "GPU sources: ")
+  message(STATUS "GPU sources: (for torch_cuda_cpp)")
   foreach(tmp ${Caffe2_GPU_SRCS})
     message(STATUS " " ${tmp})
   endforeach()
-  if(BUILD_SPLIT_CUDA)
-    message(STATUS "GPU sources: (for torch_cuda_cpp)")
-    foreach(tmp ${Caffe2_GPU_SRCS_CPP})
-      message(STATUS " " ${tmp})
-    endforeach()
   message(STATUS "GPU sources: (for torch_cuda_cu)")
-    foreach(tmp ${Caffe2_GPU_SRCS_CU})
-      message(STATUS " " ${tmp})
-    endforeach()
-  endif()
-  message(STATUS "GPU sources (w/ sort by key): ")
-  foreach(tmp ${Caffe2_GPU_SRCS_W_SORT_BY_KEY})
+  foreach(tmp ${Caffe2_GPU_CU_SRCS})
     message(STATUS " " ${tmp})
   endforeach()
-  if(BUILD_SPLIT_CUDA)
   message(STATUS "torch_cuda_cu GPU sources (w/ sort by key): ")
-    foreach(tmp ${Caffe2_GPU_SRCS_W_SORT_BY_KEY_CU})
+  foreach(tmp ${Caffe2_GPU_CU_SRCS_W_SORT_BY_KEY})
     message(STATUS " " ${tmp})
   endforeach()
   message(STATUS "torch_cuda_cpp GPU sources (w/ sort by key): ")
-    foreach(tmp ${Caffe2_GPU_SRCS_W_SORT_BY_KEY_CPP})
+  foreach(tmp ${Caffe2_GPU_SRCS_W_SORT_BY_KEY})
     message(STATUS " " ${tmp})
   endforeach()
-  endif()
   message(STATUS "CPU include: ")
   foreach(tmp ${Caffe2_CPU_INCLUDE})
@@ -626,11 +597,7 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE)
   endif()
   if(USE_CUDA)
-    if(BUILD_SPLIT_CUDA)
-      list(APPEND Caffe2_GPU_SRCS_CU ${Caffe2_GPU_HIP_JIT_FUSERS_SRCS})
-    else()
-      list(APPEND Caffe2_GPU_SRCS ${Caffe2_GPU_HIP_JIT_FUSERS_SRCS})
-    endif()
+    list(APPEND Caffe2_GPU_CU_SRCS ${Caffe2_GPU_HIP_JIT_FUSERS_SRCS})
     add_library(caffe2_nvrtc SHARED ${ATen_NVRTC_STUB_SRCS})
     if(MSVC)
       # Delay load nvcuda.dll so we can import torch compiled with cuda on a CPU-only machine
@@ -641,22 +608,14 @@
     target_link_libraries(caffe2_nvrtc ${CUDA_NVRTC} ${CUDA_CUDA_LIB} ${CUDA_NVRTC_LIB} ${DELAY_LOAD_FLAGS})
     target_include_directories(caffe2_nvrtc PRIVATE ${CUDA_INCLUDE_DIRS})
     install(TARGETS caffe2_nvrtc DESTINATION "${TORCH_INSTALL_LIB_DIR}")
-    if(USE_NCCL AND BUILD_SPLIT_CUDA)
-      list(APPEND Caffe2_GPU_SRCS_CPP
-        ${TORCH_SRC_DIR}/csrc/cuda/nccl.cpp)
-    elseif(USE_NCCL)
+    if(USE_NCCL)
       list(APPEND Caffe2_GPU_SRCS
         ${TORCH_SRC_DIR}/csrc/cuda/nccl.cpp)
     endif()
     if(USE_DISTRIBUTED)
-      if(BUILD_SPLIT_CUDA)
-        set(_target "Caffe2_GPU_SRCS_CPP")
-      else()
-        set(_target "Caffe2_GPU_SRCS")
-      endif()
-      append_filelist("libtorch_cuda_distributed_base_sources" ${_target})
+      append_filelist("libtorch_cuda_distributed_base_sources" Caffe2_GPU_SRCS)
       if(NOT WIN32)
-        append_filelist("libtorch_cuda_distributed_extra_sources" ${_target})
+        append_filelist("libtorch_cuda_distributed_extra_sources" Caffe2_GPU_SRCS)
       endif()
     endif()
     set_source_files_properties(
@@ -904,17 +863,21 @@ elseif(USE_CUDA)
   if(CUDA_SEPARABLE_COMPILATION)
     # Separate compilation fails when kernels using `thrust::sort_by_key`
     # are linked with the rest of CUDA code. Workaround by linking them separately.
-    add_library(torch_cuda ${Caffe2_GPU_SRCS})
+    add_library(torch_cuda ${Caffe2_GPU_SRCS} ${Caffe2_GPU_CU_SRCS})
     set_property(TARGET torch_cuda PROPERTY CUDA_SEPARABLE_COMPILATION ON)
-    add_library(torch_cuda_w_sort_by_key OBJECT ${Caffe2_GPU_SRCS_W_SORT_BY_KEY})
+    add_library(torch_cuda_w_sort_by_key OBJECT
+      ${Caffe2_GPU_SRCS_W_SORT_BY_KEY}
+      ${Caffe2_GPU_CU_SRCS_W_SORT_BY_KEY})
     set_property(TARGET torch_cuda_w_sort_by_key PROPERTY CUDA_SEPARABLE_COMPILATION OFF)
     target_link_libraries(torch_cuda PRIVATE torch_cuda_w_sort_by_key)
   elseif(BUILD_SPLIT_CUDA)
-    add_library(torch_cuda_cpp ${Caffe2_GPU_SRCS_CPP} ${Caffe2_GPU_SRCS_W_SORT_BY_KEY_CPP})
-    add_library(torch_cuda_cu ${Caffe2_GPU_SRCS_CU} ${Caffe2_GPU_SRCS_W_SORT_BY_KEY_CU})
+    add_library(torch_cuda_cpp ${Caffe2_GPU_SRCS} ${Caffe2_GPU_SRCS_W_SORT_BY_KEY})
+    add_library(torch_cuda_cu ${Caffe2_GPU_CU_SRCS} ${Caffe2_GPU_CU_SRCS_W_SORT_BY_KEY})
   else()
-    add_library(torch_cuda ${Caffe2_GPU_SRCS} ${Caffe2_GPU_SRCS_W_SORT_BY_KEY})
+    add_library(torch_cuda
+      ${Caffe2_GPU_SRCS} ${Caffe2_GPU_SRCS_W_SORT_BY_KEY}
+      ${Caffe2_GPU_CU_SRCS} ${Caffe2_GPU_CU_SRCS_W_SORT_BY_KEY})
   endif()
   set(CUDA_LINK_LIBRARIES_KEYWORD)
   if(BUILD_SPLIT_CUDA)
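
The separable-compilation branch above keeps the pre-existing thrust::sort_by_key workaround intact; reduced to its essentials, it looks like this (a sketch using the diff's variable names, not the verbatim build code). Kernels that use thrust::sort_by_key fail when linked under CUDA separable compilation, so they are compiled into an OBJECT library with separable compilation forced off and linked into torch_cuda afterwards:

    add_library(torch_cuda ${Caffe2_GPU_SRCS} ${Caffe2_GPU_CU_SRCS})
    set_property(TARGET torch_cuda PROPERTY CUDA_SEPARABLE_COMPILATION ON)

    # The sort-by-key kernels are isolated in an OBJECT library that is
    # built without separable compilation, then linked into the main target.
    add_library(torch_cuda_w_sort_by_key OBJECT
      ${Caffe2_GPU_SRCS_W_SORT_BY_KEY}
      ${Caffe2_GPU_CU_SRCS_W_SORT_BY_KEY})
    set_property(TARGET torch_cuda_w_sort_by_key PROPERTY CUDA_SEPARABLE_COMPILATION OFF)
    target_link_libraries(torch_cuda PRIVATE torch_cuda_w_sort_by_key)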

--- a/tools/build_variables.bzl
+++ b/tools/build_variables.bzl

@@ -1314,3 +1314,41 @@ aten_native_source_non_codegen_list = [
 # TODO: move the exceptions to proper locations
 # 2. The whole aten native source list includes the list with and without aten codegen process.
 aten_native_source_list = sorted(aten_native_source_non_codegen_list + aten_native_source_codegen_list)
+
+# These are cpp files which need to go in the torch_cuda_cu library
+# .cu files can be found via glob
+aten_cuda_cu_source_list = [
+    "aten/src/ATen/cuda/CUDABlas.cpp",
+    "aten/src/ATen/cuda/CUDASolver.cpp",
+    "aten/src/ATen/cuda/CUDASparseBlas.cpp",
+    "aten/src/ATen/cuda/CublasHandlePool.cpp",
+    "aten/src/ATen/cuda/CusolverDnHandlePool.cpp",
+    "aten/src/ATen/native/cuda/Activation.cpp",
+    "aten/src/ATen/native/cuda/BatchLinearAlgebra.cpp",
+    "aten/src/ATen/native/cuda/BatchLinearAlgebraLib.cpp",
+    "aten/src/ATen/native/cuda/Blas.cpp",
+    "aten/src/ATen/native/cuda/Equal.cpp",
+    "aten/src/ATen/native/cuda/IndexKernel.cpp",
+    "aten/src/ATen/native/cuda/ReduceOps.cpp",
+    "aten/src/ATen/native/cuda/ScanKernels.cpp",
+    "aten/src/ATen/native/cuda/Sort.cpp",
+    "aten/src/ATen/native/cuda/Sorting.cpp",
+    "aten/src/ATen/native/cuda/TensorModeKernel.cpp",
+    "aten/src/ATen/native/cuda/TensorShapeCUDA.cpp",
+    "aten/src/ATen/native/cuda/TensorTopK.cpp",
+    "aten/src/ATen/native/cuda/jit_utils.cpp",
+    "aten/src/ATen/native/sparse/cuda/SparseBlas.cpp",
+    "aten/src/ATen/native/sparse/cuda/SparseBlasImpl.cpp",
+    "aten/src/ATen/native/sparse/cuda/SparseBlasLegacy.cpp",
+    "aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cpp",
+]
+
+# Files using thrust::sort_by_key need to be linked last
+aten_cuda_with_sort_by_key_source_list = [
+    # empty_cuda is needed by torch_cuda_cpp
+    "aten/src/ATen/native/cuda/TensorFactories.cu",
+]
+
+aten_cuda_cu_with_sort_by_key_source_list = [
+    "aten/src/ATen/native/cuda/Unique.cu",
+]
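
With these lists in place, a .cpp file that depends on the .cu objects (a new BLAS or solver wrapper, say) opts into torch_cuda_cu by being added to aten_cuda_cu_source_list here, rather than by matching the filename regex this commit removes from caffe2/CMakeLists.txt.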