diff --git a/aten/CMakeLists.txt b/aten/CMakeLists.txt index d0553b71b16..44167283484 100644 --- a/aten/CMakeLists.txt +++ b/aten/CMakeLists.txt @@ -21,7 +21,8 @@ set(ATen_CPU_SRCS) set(ATen_CPU_TEST_SRCS) set(ATen_CPU_INCLUDE) set(ATen_THIRD_PARTY_INCLUDE) -set(ATen_CUDA_SRCS) +set(ATen_CUDA_CPP_SRCS) +set(ATen_CUDA_CU_SRCS) set(ATen_CUDA_SRCS_W_SORT_BY_KEY) set(ATen_CUDA_TEST_SRCS) set(ATen_CUDA_INCLUDE) @@ -95,8 +96,10 @@ add_subdirectory(src/ATen) # Pass source, includes, and libs to parent set(ATen_CPU_SRCS ${ATen_CPU_SRCS} PARENT_SCOPE) -set(ATen_CUDA_SRCS ${ATen_CUDA_SRCS} PARENT_SCOPE) +set(ATen_CUDA_CU_SRCS ${ATen_CUDA_CU_SRCS} PARENT_SCOPE) +set(ATen_CUDA_CPP_SRCS ${ATen_CUDA_CPP_SRCS} PARENT_SCOPE) set(ATen_CUDA_SRCS_W_SORT_BY_KEY ${ATen_CUDA_SRCS_W_SORT_BY_KEY} PARENT_SCOPE) +set(ATen_CUDA_CU_SRCS_W_SORT_BY_KEY ${ATen_CUDA_CU_SRCS_W_SORT_BY_KEY} PARENT_SCOPE) set(ATen_HIP_SRCS ${ATen_HIP_SRCS} PARENT_SCOPE) set(ATen_HIP_SRCS_W_SORT_BY_KEY ${ATen_HIP_SRCS_W_SORT_BY_KEY} PARENT_SCOPE) set(ATen_NVRTC_STUB_SRCS ${ATen_NVRTC_STUB_SRCS} PARENT_SCOPE) diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt index 607336d63a9..926a4e41d5a 100644 --- a/aten/src/ATen/CMakeLists.txt +++ b/aten/src/ATen/CMakeLists.txt @@ -113,9 +113,7 @@ file(GLOB native_ao_sparse_h file(GLOB native_quantized_h "native/quantized/*.h" "native/quantized/cpu/*.h") file(GLOB native_cpu_h "native/cpu/*.h") -file(GLOB native_cuda_cu_sp "native/cuda/Unique.cu" "native/cuda/TensorFactories.cu") file(GLOB native_cuda_cu "native/cuda/*.cu") -exclude(native_cuda_cu "${native_cuda_cu}" ${native_cuda_cu_sp}) file(GLOB native_cuda_cpp "native/cuda/*.cpp") file(GLOB native_cuda_h "native/cuda/*.h" "native/cuda/*.cuh") file(GLOB native_hip_h "native/hip/*.h" "native/hip/*.cuh") @@ -189,13 +187,35 @@ endif() if(USE_CUDA) list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/cuda) - set(ATen_CUDA_SRCS ${ATen_CUDA_SRCS} ${cuda_cu} ${native_cuda_cu} ${native_sparse_cuda_cu} 
${native_quantized_cuda_cu}) - set(ATen_CUDA_SRCS_W_SORT_BY_KEY ${ATen_CUDA_SRCS_W_SORT_BY_KEY} ${native_cuda_cu_sp}) - set(all_cuda_cpp ${native_sparse_cuda_cpp} ${native_quantized_cuda_cpp} ${cuda_cpp} ${native_cuda_cpp} ${cuda_generated_sources} ${ATen_CUDA_SRCS}) - set(all_cuda_cpp ${native_cudnn_cpp} ${native_miopen_cpp} ${all_cuda_cpp}) + + list(APPEND ATen_CUDA_CU_SRCS + ${cuda_cu} + ${native_cuda_cu} + ${native_sparse_cuda_cu} + ${native_quantized_cuda_cu} + ${cuda_generated_sources} + ) + list(APPEND ATen_CUDA_CPP_SRCS + ${cuda_cpp} + ${native_cuda_cpp} + ${native_cudnn_cpp} + ${native_miopen_cpp} + ${native_quantized_cuda_cpp} + ${native_sparse_cuda_cpp} + ) if(CAFFE2_USE_CUDNN) - set(all_cuda_cpp ${all_cuda_cpp} ${cudnn_cpp}) + list(APPEND ATen_CUDA_CPP_SRCS ${cudnn_cpp}) endif() + + append_filelist("aten_cuda_cu_source_list" ATen_CUDA_CU_SRCS) + append_filelist("aten_cuda_with_sort_by_key_source_list" ATen_CUDA_SRCS_W_SORT_BY_KEY) + append_filelist("aten_cuda_cu_with_sort_by_key_source_list" ATen_CUDA_CU_SRCS_W_SORT_BY_KEY) + + exclude(ATen_CUDA_CPP_SRCS "${ATen_CUDA_CPP_SRCS}" + ${ATen_CUDA_CU_SRCS} + ${ATen_CUDA_SRCS_W_SORT_BY_KEY} ${ATen_CUDA_CU_SRCS_W_SORT_BY_KEY}) + exclude(ATen_CUDA_CU_SRCS "${ATen_CUDA_CU_SRCS}" + ${ATen_CUDA_SRCS_W_SORT_BY_KEY} ${ATen_CUDA_CU_SRCS_W_SORT_BY_KEY}) endif() if(USE_ROCM) @@ -428,7 +448,6 @@ set(ATen_CPU_SRCS ${all_cpu_cpp}) list(APPEND ATen_CPU_DEPENDENCY_LIBS ATEN_CPU_FILES_GEN_LIB) if(USE_CUDA) - set(ATen_CUDA_SRCS ${all_cuda_cpp}) set(ATen_NVRTC_STUB_SRCS ${cuda_nvrtc_stub_cpp}) list(APPEND ATen_CUDA_DEPENDENCY_LIBS ATEN_CUDA_FILES_GEN_LIB) endif() @@ -512,8 +531,10 @@ list(APPEND ATen_MOBILE_BENCHMARK_SRCS # Pass source, includes, and libs to parent set(ATen_CORE_SRCS ${ATen_CORE_SRCS} PARENT_SCOPE) set(ATen_CPU_SRCS ${ATen_CPU_SRCS} PARENT_SCOPE) -set(ATen_CUDA_SRCS ${ATen_CUDA_SRCS} PARENT_SCOPE) +set(ATen_CUDA_CU_SRCS ${ATen_CUDA_CU_SRCS} PARENT_SCOPE) +set(ATen_CUDA_CPP_SRCS ${ATen_CUDA_CPP_SRCS} 
PARENT_SCOPE) set(ATen_CUDA_SRCS_W_SORT_BY_KEY ${ATen_CUDA_SRCS_W_SORT_BY_KEY} PARENT_SCOPE) +set(ATen_CUDA_CU_SRCS_W_SORT_BY_KEY ${ATen_CUDA_CU_SRCS_W_SORT_BY_KEY} PARENT_SCOPE) set(ATen_NVRTC_STUB_SRCS ${ATen_NVRTC_STUB_SRCS} PARENT_SCOPE) set(ATen_HIP_SRCS ${ATen_HIP_SRCS} PARENT_SCOPE) set(ATen_QUANTIZED_SRCS ${ATen_QUANTIZED_SRCS} PARENT_SCOPE) diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 42315a89237..1afab9f446e 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -84,8 +84,10 @@ if(INTERN_BUILD_ATEN_OPS) # Add source, includes, and libs to lists list(APPEND Caffe2_CPU_SRCS ${ATen_CPU_SRCS}) - list(APPEND Caffe2_GPU_SRCS ${ATen_CUDA_SRCS}) + list(APPEND Caffe2_GPU_SRCS ${ATen_CUDA_CPP_SRCS}) list(APPEND Caffe2_GPU_SRCS_W_SORT_BY_KEY ${ATen_CUDA_SRCS_W_SORT_BY_KEY}) + list(APPEND Caffe2_GPU_CU_SRCS ${ATen_CUDA_CU_SRCS}) + list(APPEND Caffe2_GPU_CU_SRCS_W_SORT_BY_KEY ${ATen_CUDA_CU_SRCS_W_SORT_BY_KEY}) list(APPEND Caffe2_HIP_SRCS ${ATen_HIP_SRCS}) list(APPEND Caffe2_HIP_SRCS ${ATen_HIP_SRCS_W_SORT_BY_KEY}) list(APPEND Caffe2_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS}) @@ -171,68 +173,37 @@ endif() if(CAFFE2_ALLOWLISTED_FILES) caffe2_do_allowlist(Caffe2_CPU_SRCS CAFFE2_ALLOWLISTED_FILES) caffe2_do_allowlist(Caffe2_GPU_SRCS CAFFE2_ALLOWLISTED_FILES) + caffe2_do_allowlist(Caffe2_GPU_SRCS_W_SORT_BY_KEY CAFFE2_ALLOWLISTED_FILES) + caffe2_do_allowlist(Caffe2_GPU_CU_SRCS CAFFE2_ALLOWLISTED_FILES) + caffe2_do_allowlist(Caffe2_GPU_CU_SRCS_W_SORT_BY_KEY CAFFE2_ALLOWLISTED_FILES) caffe2_do_allowlist(Caffe2_HIP_SRCS CAFFE2_ALLOWLISTED_FILES) endif() -if(BUILD_SPLIT_CUDA) - # Splitting the source files that'll be in torch_cuda between torch_cuda_cu and torch_cuda_cpp - foreach(tmp ${Caffe2_GPU_SRCS}) - if("${tmp}" MATCHES "(.*aten.*\\.cu|.*(b|B)las.*|.*((s|S)olver|Register.*CUDA|Legacy|THC|TensorShapeCUDA|BatchLinearAlgebra|ReduceOps|Equal|Activation|ScanKernels|Sort|TensorTopK|TensorModeKernel|IndexKernel|jit_utils).*\\.cpp)" AND NOT "${tmp}" 
MATCHES ".*(THC((CachingHost)?Allocator|General)).*") - # Currently, torch_cuda_cu will have all the .cu files in aten, as well as some others that depend on those files - list(APPEND Caffe2_GPU_SRCS_CU ${tmp}) - else() - list(APPEND Caffe2_GPU_SRCS_CPP ${tmp}) - endif() - endforeach() - - foreach(tmp ${Caffe2_GPU_SRCS_W_SORT_BY_KEY}) - if("${tmp}" MATCHES ".*aten.*\\.cu" AND NOT "${tmp}" MATCHES ".*TensorFactories.*") - list(APPEND Caffe2_GPU_SRCS_W_SORT_BY_KEY_CU ${tmp}) - else() - list(APPEND Caffe2_GPU_SRCS_W_SORT_BY_KEY_CPP ${tmp}) - endif() - endforeach() -endif() - if(PRINT_CMAKE_DEBUG_INFO) message(STATUS "CPU sources: ") foreach(tmp ${Caffe2_CPU_SRCS}) message(STATUS " " ${tmp}) endforeach() - message(STATUS "GPU sources: ") + message(STATUS "GPU sources: (for torch_cuda_cpp)") foreach(tmp ${Caffe2_GPU_SRCS}) message(STATUS " " ${tmp}) endforeach() - if(BUILD_SPLIT_CUDA) - message(STATUS "GPU sources: (for torch_cuda_cpp)") - foreach(tmp ${Caffe2_GPU_SRCS_CPP}) - message(STATUS " " ${tmp}) - endforeach() - - message(STATUS "GPU sources: (for torch_cuda_cu)") - foreach(tmp ${Caffe2_GPU_SRCS_CU}) - message(STATUS " " ${tmp}) - endforeach() - endif() - - message(STATUS "GPU sources (w/ sort by key): ") - foreach(tmp ${Caffe2_GPU_SRCS_W_SORT_BY_KEY}) + message(STATUS "GPU sources: (for torch_cuda_cu)") + foreach(tmp ${Caffe2_GPU_CU_SRCS}) message(STATUS " " ${tmp}) endforeach() - if(BUILD_SPLIT_CUDA) - message(STATUS "torch_cuda_cu GPU sources (w/ sort by key): ") - foreach(tmp ${Caffe2_GPU_SRCS_W_SORT_BY_KEY_CU}) - message(STATUS " " ${tmp}) - endforeach() + message(STATUS "torch_cuda_cu GPU sources (w/ sort by key): ") + foreach(tmp ${Caffe2_GPU_CU_SRCS_W_SORT_BY_KEY}) + message(STATUS " " ${tmp}) + endforeach() - message(STATUS "torch_cuda_cpp GPU sources (w/ sort by key): ") - foreach(tmp ${Caffe2_GPU_SRCS_W_SORT_BY_KEY_CPP}) - message(STATUS " " ${tmp}) - endforeach() - endif() + message(STATUS "torch_cuda_cpp GPU sources (w/ sort by key): ") + 
foreach(tmp ${Caffe2_GPU_SRCS_W_SORT_BY_KEY}) + message(STATUS " " ${tmp}) + endforeach() message(STATUS "CPU include: ") foreach(tmp ${Caffe2_CPU_INCLUDE}) @@ -626,11 +597,7 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) endif() if(USE_CUDA) - if(BUILD_SPLIT_CUDA) - list(APPEND Caffe2_GPU_SRCS_CU ${Caffe2_GPU_HIP_JIT_FUSERS_SRCS}) - else() - list(APPEND Caffe2_GPU_SRCS ${Caffe2_GPU_HIP_JIT_FUSERS_SRCS}) - endif() + list(APPEND Caffe2_GPU_CU_SRCS ${Caffe2_GPU_HIP_JIT_FUSERS_SRCS}) add_library(caffe2_nvrtc SHARED ${ATen_NVRTC_STUB_SRCS}) if(MSVC) # Delay load nvcuda.dll so we can import torch compiled with cuda on a CPU-only machine @@ -641,22 +608,14 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) target_link_libraries(caffe2_nvrtc ${CUDA_NVRTC} ${CUDA_CUDA_LIB} ${CUDA_NVRTC_LIB} ${DELAY_LOAD_FLAGS}) target_include_directories(caffe2_nvrtc PRIVATE ${CUDA_INCLUDE_DIRS}) install(TARGETS caffe2_nvrtc DESTINATION "${TORCH_INSTALL_LIB_DIR}") - if(USE_NCCL AND BUILD_SPLIT_CUDA) - list(APPEND Caffe2_GPU_SRCS_CPP - ${TORCH_SRC_DIR}/csrc/cuda/nccl.cpp) - elseif(USE_NCCL) + if(USE_NCCL) list(APPEND Caffe2_GPU_SRCS ${TORCH_SRC_DIR}/csrc/cuda/nccl.cpp) endif() if(USE_DISTRIBUTED) - if(BUILD_SPLIT_CUDA) - set(_target "Caffe2_GPU_SRCS_CPP") - else() - set(_target "Caffe2_GPU_SRCS") - endif() - append_filelist("libtorch_cuda_distributed_base_sources" ${_target}) + append_filelist("libtorch_cuda_distributed_base_sources" Caffe2_GPU_SRCS) if(NOT WIN32) - append_filelist("libtorch_cuda_distributed_extra_sources" ${_target}) + append_filelist("libtorch_cuda_distributed_extra_sources" Caffe2_GPU_SRCS) endif() endif() set_source_files_properties( @@ -904,17 +863,21 @@ elseif(USE_CUDA) if(CUDA_SEPARABLE_COMPILATION) # Separate compilation fails when kernels using `thrust::sort_by_key` # are linked with the rest of CUDA code. Workaround by linking them separately. 
- add_library(torch_cuda ${Caffe2_GPU_SRCS}) + add_library(torch_cuda ${Caffe2_GPU_SRCS} ${Caffe2_GPU_CU_SRCS}) set_property(TARGET torch_cuda PROPERTY CUDA_SEPARABLE_COMPILATION ON) - add_library(torch_cuda_w_sort_by_key OBJECT ${Caffe2_GPU_SRCS_W_SORT_BY_KEY}) + add_library(torch_cuda_w_sort_by_key OBJECT + ${Caffe2_GPU_SRCS_W_SORT_BY_KEY} + ${Caffe2_GPU_CU_SRCS_W_SORT_BY_KEY}) set_property(TARGET torch_cuda_w_sort_by_key PROPERTY CUDA_SEPARABLE_COMPILATION OFF) target_link_libraries(torch_cuda PRIVATE torch_cuda_w_sort_by_key) elseif(BUILD_SPLIT_CUDA) - add_library(torch_cuda_cpp ${Caffe2_GPU_SRCS_CPP} ${Caffe2_GPU_SRCS_W_SORT_BY_KEY_CPP}) - add_library(torch_cuda_cu ${Caffe2_GPU_SRCS_CU} ${Caffe2_GPU_SRCS_W_SORT_BY_KEY_CU}) + add_library(torch_cuda_cpp ${Caffe2_GPU_SRCS} ${Caffe2_GPU_SRCS_W_SORT_BY_KEY}) + add_library(torch_cuda_cu ${Caffe2_GPU_CU_SRCS} ${Caffe2_GPU_CU_SRCS_W_SORT_BY_KEY}) else() - add_library(torch_cuda ${Caffe2_GPU_SRCS} ${Caffe2_GPU_SRCS_W_SORT_BY_KEY}) + add_library(torch_cuda + ${Caffe2_GPU_SRCS} ${Caffe2_GPU_SRCS_W_SORT_BY_KEY} + ${Caffe2_GPU_CU_SRCS} ${Caffe2_GPU_CU_SRCS_W_SORT_BY_KEY}) endif() set(CUDA_LINK_LIBRARIES_KEYWORD) if(BUILD_SPLIT_CUDA) diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index 88d6eeb18a6..8328391e4d3 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -1314,3 +1314,41 @@ aten_native_source_non_codegen_list = [ # TODO: move the exceptions to proper locations # 2. The whole aten native source list includes the list with and without aten codegen process. 
 aten_native_source_list = sorted(aten_native_source_non_codegen_list + aten_native_source_codegen_list)
+
+# These are .cpp files which need to go in the torch_cuda_cu library
+# (the .cu files for torch_cuda_cu are picked up via glob in aten/src/ATen/CMakeLists.txt)
+aten_cuda_cu_source_list = [
+    "aten/src/ATen/cuda/CUDABlas.cpp",
+    "aten/src/ATen/cuda/CUDASolver.cpp",
+    "aten/src/ATen/cuda/CUDASparseBlas.cpp",
+    "aten/src/ATen/cuda/CublasHandlePool.cpp",
+    "aten/src/ATen/cuda/CusolverDnHandlePool.cpp",
+    "aten/src/ATen/native/cuda/Activation.cpp",
+    "aten/src/ATen/native/cuda/BatchLinearAlgebra.cpp",
+    "aten/src/ATen/native/cuda/BatchLinearAlgebraLib.cpp",
+    "aten/src/ATen/native/cuda/Blas.cpp",
+    "aten/src/ATen/native/cuda/Equal.cpp",
+    "aten/src/ATen/native/cuda/IndexKernel.cpp",
+    "aten/src/ATen/native/cuda/ReduceOps.cpp",
+    "aten/src/ATen/native/cuda/ScanKernels.cpp",
+    "aten/src/ATen/native/cuda/Sort.cpp",
+    "aten/src/ATen/native/cuda/Sorting.cpp",
+    "aten/src/ATen/native/cuda/TensorModeKernel.cpp",
+    "aten/src/ATen/native/cuda/TensorShapeCUDA.cpp",
+    "aten/src/ATen/native/cuda/TensorTopK.cpp",
+    "aten/src/ATen/native/cuda/jit_utils.cpp",
+    "aten/src/ATen/native/sparse/cuda/SparseBlas.cpp",
+    "aten/src/ATen/native/sparse/cuda/SparseBlasImpl.cpp",
+    "aten/src/ATen/native/sparse/cuda/SparseBlasLegacy.cpp",
+    "aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cpp",
+]
+
+# Files using thrust::sort_by_key need to be linked last
+aten_cuda_with_sort_by_key_source_list = [
+    # empty_cuda is needed by torch_cuda_cpp
+    "aten/src/ATen/native/cuda/TensorFactories.cu",
+]
+
+aten_cuda_cu_with_sort_by_key_source_list = [
+    "aten/src/ATen/native/cuda/Unique.cu",
+]