Update CMake and use native CUDA language support (#62445)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/62445

PyTorch currently uses the old style of compiling CUDA in CMake, which is just a
bunch of scripts in `FindCUDA.cmake`. Newer versions of CMake support CUDA natively
as a first-class language, just like C++ or C.

Test Plan: Imported from OSS

Reviewed By: ejguan

Differential Revision: D31503350

fbshipit-source-id: 2ee817edc9698531ae1b87eda3ad271ee459fd55
This commit is contained in:
Nikita Shulga 2021-10-11 09:04:07 -07:00 committed by Facebook GitHub Bot
parent d3b29afbb6
commit c373387709
23 changed files with 264 additions and 161 deletions

View File

@ -46,7 +46,7 @@ steps:
curl -k https://s3.amazonaws.com/ossci-windows/sccache.exe --output .\tmp_bin\sccache.exe
curl -k https://s3.amazonaws.com/ossci-windows/sccache-cl.exe --output .\tmp_bin\sccache-cl.exe
copy .\tmp_bin\sccache.exe .\tmp_bin\nvcc.exe
curl -kL https://github.com/peterjc123/randomtemp-rust/releases/download/v0.3/randomtemp.exe --output .\tmp_bin\randomtemp.exe
curl -kL https://github.com/peterjc123/randomtemp-rust/releases/download/v0.4/randomtemp.exe --output .\tmp_bin\randomtemp.exe
displayName: Install sccache and randomtemp
condition: not(eq(variables.CUDA_VERSION, ''))

View File

@ -120,9 +120,7 @@ steps:
Write-Host "##vso[task.setvariable variable=CMAKE_LIBRARY_PATH;]$(Build.SourcesDirectory)\mkl\lib;$env:CMAKE_LIBRARY_PATH"
Write-Host "##vso[task.setvariable variable=ADDITIONAL_PATH;]$(Build.SourcesDirectory)\tmp_bin"
Write-Host "##vso[task.setvariable variable=SCCACHE_IDLE_TIMEOUT;]1500"
Write-Host "##vso[task.setvariable variable=RANDOMTEMP_EXECUTABLE;]$(Build.SourcesDirectory)\tmp_bin\nvcc.exe"
Write-Host "##vso[task.setvariable variable=CUDA_NVCC_EXECUTABLE;]$(Build.SourcesDirectory)\tmp_bin\randomtemp.exe"
Write-Host "##vso[task.setvariable variable=RANDOMTEMP_BASEDIR;]$(Build.SourcesDirectory)\tmp_bin"
Write-Host "##vso[task.setvariable variable=CMAKE_CUDA_COMPILER_LAUNCHER;]$(Build.SourcesDirectory)/tmp_bin/randomtemp.exe;$(Build.SourcesDirectory)/tmp_bin/sccache.exe"
displayName: Set MKL, sccache and randomtemp environment variables
# View current environment variables

View File

@ -75,7 +75,7 @@ RUN rm install_cmake.sh
ADD ./common/install_cache.sh install_cache.sh
ENV PATH /opt/cache/bin:$PATH
RUN bash ./install_cache.sh && rm install_cache.sh
ENV CUDA_NVCC_EXECUTABLE=/opt/cache/lib/nvcc
ENV CMAKE_CUDA_COMPILER_LAUNCHER=/opt/cache/bin/sccache
# Add jni.h for java host build
ADD ./common/install_jni.sh install_jni.sh
@ -94,6 +94,7 @@ ENV BUILD_ENVIRONMENT ${BUILD_ENVIRONMENT}
# AWS specific CUDA build guidance
ENV TORCH_CUDA_ARCH_LIST Maxwell
ENV TORCH_NVCC_FLAGS "-Xfatbin -compress-all"
ENV CUDA_PATH /usr/local/cuda
# Install LLVM dev version (Defined in the pytorch/builder github repository)
COPY --from=pytorch/llvm:9.0.1 /opt/llvm /opt/llvm

View File

@ -55,8 +55,8 @@ env:
CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
{%- if cuda_version != "cpu" %}
TORCH_CUDA_ARCH_LIST: "7.0"
USE_CUDA: 1
{%- endif %}
USE_CUDA: !{{ 1 if cuda_version != "cpu" else 0 }}
!{{ common.concurrency(build_environment) }}

View File

@ -31,6 +31,7 @@ env:
AWS_DEFAULT_REGION: us-east-1
CIRCLE_PR_NUMBER: ${{ github.event.pull_request.number }}
CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
USE_CUDA: 0
concurrency:
group: win-vs2019-cpu-py3-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}

View File

@ -29,7 +29,8 @@ if [ -z "${SCCACHE}" ] && which ccache > /dev/null; then
ln -sf "$(which ccache)" ./ccache/g++
ln -sf "$(which ccache)" ./ccache/x86_64-linux-gnu-gcc
if [[ "${BUILD_ENVIRONMENT}" == *-cuda* ]]; then
ln -sf "$(which ccache)" ./ccache/nvcc
mkdir -p ./ccache/cuda
ln -sf "$(which ccache)" ./ccache/cuda/nvcc
fi
export CACHE_WRAPPER_DIR="$PWD/ccache"
export PATH="$CACHE_WRAPPER_DIR:$PATH"
@ -93,7 +94,8 @@ if [[ $BUILD_ENVIRONMENT == *cuda* ]]; then
# Explicitly set path to NVCC such that the symlink to ccache or sccache is used
if [ -n "${CACHE_WRAPPER_DIR}" ]; then
build_args+=("CUDA_NVCC_EXECUTABLE=${CACHE_WRAPPER_DIR}/nvcc")
build_args+=("CUDA_NVCC_EXECUTABLE=${CACHE_WRAPPER_DIR}/cuda/nvcc")
build_args+=("CMAKE_CUDA_COMPILER_LAUNCHER=${CACHE_WRAPPER_DIR}/ccache")
fi
# Ensure FindCUDA.cmake can infer the right path to the CUDA toolkit.

View File

@ -97,23 +97,20 @@ set CXX=sccache-cl
set CMAKE_GENERATOR=Ninja
if "%USE_CUDA%"=="1" (
copy %TMP_DIR_WIN%\bin\sccache.exe %TMP_DIR_WIN%\bin\nvcc.exe
:: randomtemp is used to resolve the intermittent build error related to CUDA.
:: code: https://github.com/peterjc123/randomtemp-rust
:: issue: https://github.com/pytorch/pytorch/issues/25393
::
:: Previously, CMake uses CUDA_NVCC_EXECUTABLE for finding nvcc and then
:: the calls are redirected to sccache. sccache looks for the actual nvcc
:: in PATH, and then pass the arguments to it.
:: Currently, randomtemp is placed before sccache (%TMP_DIR_WIN%\bin\nvcc)
:: so we are actually pretending sccache instead of nvcc itself.
curl -kL https://github.com/peterjc123/randomtemp-rust/releases/download/v0.3/randomtemp.exe --output %TMP_DIR_WIN%\bin\randomtemp.exe
:: CMake requires a single command as CUDA_NVCC_EXECUTABLE, so we push the wrappers
:: randomtemp.exe and sccache.exe into a batch file which CMake invokes.
curl -kL https://github.com/peterjc123/randomtemp-rust/releases/download/v0.4/randomtemp.exe --output %TMP_DIR_WIN%\bin\randomtemp.exe
if errorlevel 1 exit /b
if not errorlevel 0 exit /b
set RANDOMTEMP_EXECUTABLE=%TMP_DIR_WIN%\bin\nvcc.exe
set CUDA_NVCC_EXECUTABLE=%TMP_DIR_WIN%\bin\randomtemp.exe
set RANDOMTEMP_BASEDIR=%TMP_DIR_WIN%\bin
echo @"%TMP_DIR_WIN%\bin\randomtemp.exe" "%TMP_DIR_WIN%\bin\sccache.exe" "%CUDA_PATH%\bin\nvcc.exe" %%* > "%TMP_DIR%/bin/nvcc.bat"
cat %TMP_DIR%/bin/nvcc.bat
set CUDA_NVCC_EXECUTABLE=%TMP_DIR%/bin/nvcc.bat
for /F "usebackq delims=" %%n in (`cygpath -m "%CUDA_PATH%\bin\nvcc.exe"`) do set CMAKE_CUDA_COMPILER=%%n
set CMAKE_CUDA_COMPILER_LAUNCHER=%TMP_DIR%/bin/randomtemp.exe;%TMP_DIR%\bin\sccache.exe
)
@echo off

View File

@ -517,16 +517,14 @@ if(MSVC)
endforeach(flag_var)
# Try harder
list(APPEND CUDA_NVCC_FLAGS "-Xcompiler" "/w" "-w")
string(APPEND CMAKE_CUDA_FLAGS " -Xcompiler /w -w")
endif(MSVC)
list(APPEND CUDA_NVCC_FLAGS "-Xfatbin" "-compress-all")
list(APPEND CUDA_NVCC_FLAGS_DEBUG "-Xfatbin" "-compress-all")
list(APPEND CUDA_NVCC_FLAGS_RELWITHDEBINFO "-Xfatbin" "-compress-all")
string(APPEND CMAKE_CUDA_FLAGS " -Xfatbin -compress-all")
if(NOT MSVC)
list(APPEND CUDA_NVCC_FLAGS_DEBUG "-g" "-lineinfo" "--source-in-ptx")
list(APPEND CUDA_NVCC_FLAGS_RELWITHDEBINFO "-g" "-lineinfo" "--source-in-ptx")
string(APPEND CMAKE_CUDA_FLAGS_DEBUG " -g -lineinfo --source-in-ptx")
string(APPEND CMAKE_CUDA_FLAGS_RELWITHDEBINFO " -g -lineinfo --source-in-ptx")
endif(NOT MSVC)
# Set INTERN_BUILD_MOBILE for all mobile builds. Components that are not
@ -667,6 +665,16 @@ endif()
include(cmake/Dependencies.cmake)
if((CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 10.2) AND (CMAKE_HOST_SYSTEM_NAME MATCHES "Windows"))
# CUDA < 10.2 doesn't support compiling and extracting header dependencies in
# one call, so instead CMake calls nvcc twice with && in between.
# However, on Windows, cmd.exe has an 8191-character limit per command, which
# the doubled nvcc invocation starts hitting. Response files move most of the
# arguments into a file so the command line stays under the limit.
set(CMAKE_CUDA_USE_RESPONSE_FILE_FOR_OBJECTS ON)
# CACHE INTERNAL implies FORCE, so this also overrides any previously cached
# value when building with the Ninja generator.
set(CMAKE_NINJA_FORCE_RESPONSE_FILE ON CACHE INTERNAL "")
endif()
if(USE_FBGEMM)
string(APPEND CMAKE_CXX_FLAGS " -DUSE_FBGEMM")
endif()

View File

@ -69,12 +69,6 @@ if(USE_CUDA AND USE_ROCM)
message(FATAL_ERROR "Both CUDA and ROCm are enabled and found. PyTorch can only be built with either of them. Please turn one off by using either USE_CUDA=OFF or USE_ROCM=OFF.")
endif()
if(MSVC)
# we want to respect the standard, and we are bored of those **** .
add_definitions(-D_CRT_SECURE_NO_DEPRECATE=1)
list(APPEND CUDA_NVCC_FLAGS "-Xcompiler" "/wd4819" "-Xcompiler" "/wd4503" "-Xcompiler" "/wd4190" "-Xcompiler" "/wd4244" "-Xcompiler" "/wd4251" "-Xcompiler" "/wd4275" "-Xcompiler" "/wd4522")
endif(MSVC)
if(USE_ROCM)
# TODO: AT_HIP_ENABLED (change this once we represent HIP as HIP in
# ATen proper)

View File

@ -49,9 +49,7 @@ if(${COMPILER_SUPPORTS_HIDDEN_VISIBILITY})
endif()
# ---[ Dependency of c10_cuda
target_link_libraries(c10_cuda PUBLIC c10)
target_link_libraries(c10_cuda INTERFACE torch::cudart)
target_link_libraries(c10_cuda PUBLIC c10 torch::cudart)
target_include_directories(
c10_cuda PUBLIC

View File

@ -895,19 +895,18 @@ elseif(USE_CUDA)
set(CUDA_LINK_LIBRARIES_KEYWORD PRIVATE)
if(CUDA_SEPARABLE_COMPILATION)
# Separate compilation fails when kernels using `thrust::sort_by_key`
# are linked with the rest of CUDA code. Workaround by linking them separately
set(_generated_name "torch_cuda_w_sort_by_key_intermediate_link${CMAKE_C_OUTPUT_EXTENSION}")
set(torch_cuda_w_sort_by_key_link_file "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/torch_cuda.dir/${CMAKE_CFG_INTDIR}/${_generated_name}")
cuda_wrap_srcs(torch_cuda OBJ Caffe2_GPU_W_SORT_BY_KEY_OBJ ${Caffe2_GPU_SRCS_W_SORT_BY_KEY})
CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS("${torch_cuda_w_sort_by_key_link_file}" torch_cpu "${_options}" "${torch_cuda_SEPARABLE_COMPILATION_OBJECTS}")
set( torch_cuda_SEPARABLE_COMPILATION_OBJECTS )
# Pass compiled sort-by-key object + device-linked fatbin as extra dependencies of torch_cuda
cuda_add_library(torch_cuda ${Caffe2_GPU_SRCS} ${torch_cuda_w_sort_by_key_link_file} ${Caffe2_GPU_W_SORT_BY_KEY_OBJ})
# are linked with the rest of CUDA code. Workaround by linking them separately.
add_library(torch_cuda ${Caffe2_GPU_SRCS})
set_property(TARGET torch_cuda PROPERTY CUDA_SEPARABLE_COMPILATION ON)
add_library(torch_cuda_w_sort_by_key OBJECT ${Caffe2_GPU_SRCS_W_SORT_BY_KEY})
set_property(TARGET torch_cuda_w_sort_by_key PROPERTY CUDA_SEPARABLE_COMPILATION OFF)
target_link_libraries(torch_cuda PRIVATE torch_cuda_w_sort_by_key)
elseif(BUILD_SPLIT_CUDA)
cuda_add_library(torch_cuda_cpp ${Caffe2_GPU_SRCS_CPP} ${Caffe2_GPU_SRCS_W_SORT_BY_KEY_CPP})
cuda_add_library(torch_cuda_cu ${Caffe2_GPU_SRCS_CU} ${Caffe2_GPU_SRCS_W_SORT_BY_KEY_CU})
add_library(torch_cuda_cpp ${Caffe2_GPU_SRCS_CPP} ${Caffe2_GPU_SRCS_W_SORT_BY_KEY_CPP})
add_library(torch_cuda_cu ${Caffe2_GPU_SRCS_CU} ${Caffe2_GPU_SRCS_W_SORT_BY_KEY_CU})
else()
cuda_add_library(torch_cuda ${Caffe2_GPU_SRCS} ${Caffe2_GPU_SRCS_W_SORT_BY_KEY})
add_library(torch_cuda ${Caffe2_GPU_SRCS} ${Caffe2_GPU_SRCS_W_SORT_BY_KEY})
endif()
set(CUDA_LINK_LIBRARIES_KEYWORD)
if(BUILD_SPLIT_CUDA)
@ -1803,7 +1802,7 @@ if(BUILD_TEST)
if(USE_CUDA)
foreach(test_src ${Caffe2_GPU_TEST_SRCS})
get_filename_component(test_name ${test_src} NAME_WE)
cuda_add_executable(${test_name} "${test_src}")
add_executable(${test_name} "${test_src}")
target_link_libraries(${test_name} torch_library gtest_main)
target_include_directories(${test_name} PRIVATE $<INSTALL_INTERFACE:include>)
target_include_directories(${test_name} PRIVATE ${Caffe2_CPU_INCLUDE})

View File

@ -33,6 +33,50 @@ macro(enable_ubsan)
endif()
endmacro()
# ---[ CUDA
if(USE_CUDA)
# public/*.cmake reads and updates the CAFFE2_USE_* copies of these options,
# so mirror the user-facing USE_* values into them before including it.
set(CAFFE2_USE_CUDA ${USE_CUDA})
set(CAFFE2_USE_CUDNN ${USE_CUDNN})
set(CAFFE2_USE_NVRTC ${USE_NVRTC})
set(CAFFE2_USE_TENSORRT ${USE_TENSORRT})
include(${CMAKE_CURRENT_LIST_DIR}/public/cuda.cmake)
if(CAFFE2_USE_CUDA)
# A helper variable recording the list of Caffe2 dependent libraries.
# torch::cudart is handled separately because the legacy CUDA_ADD_LIBRARY
# macro already adds CUDA_LIBRARIES on its own.
set(Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS
caffe2::cufft caffe2::curand caffe2::cublas)
if(CAFFE2_USE_NVRTC)
list(APPEND Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS caffe2::cuda caffe2::nvrtc)
else()
caffe2_update_option(USE_NVRTC OFF)
endif()
if(CAFFE2_USE_CUDNN)
list(APPEND Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS caffe2::cudnn-public)
else()
caffe2_update_option(USE_CUDNN OFF)
endif()
if(CAFFE2_USE_TENSORRT)
list(APPEND Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS caffe2::tensorrt)
else()
caffe2_update_option(USE_TENSORRT OFF)
endif()
else()
# public/cuda.cmake turned CAFFE2_USE_CUDA off (toolkit or a required
# component was not found); disable every CUDA-dependent option so the
# USE_* and CAFFE2_USE_* variables stay consistent with each other.
message(WARNING
"Not compiling with CUDA. Suppress this warning with "
"-DUSE_CUDA=OFF.")
caffe2_update_option(USE_CUDA OFF)
caffe2_update_option(USE_CUDNN OFF)
caffe2_update_option(USE_NVRTC OFF)
caffe2_update_option(USE_TENSORRT OFF)
set(CAFFE2_USE_CUDA OFF)
set(CAFFE2_USE_CUDNN OFF)
set(CAFFE2_USE_NVRTC OFF)
set(CAFFE2_USE_TENSORRT OFF)
endif()
endif()
# ---[ Custom Protobuf
if(CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO AND (NOT INTERN_BUILD_MOBILE OR BUILD_CAFFE2_MOBILE))
disable_ubsan()
@ -77,8 +121,8 @@ endif(MSVC)
# ---[ Threads
include(${CMAKE_CURRENT_LIST_DIR}/public/threads.cmake)
if(TARGET Threads::Threads)
list(APPEND Caffe2_PUBLIC_DEPENDENCY_LIBS Threads::Threads)
if(TARGET caffe2::Threads)
list(APPEND Caffe2_PUBLIC_DEPENDENCY_LIBS caffe2::Threads)
else()
message(FATAL_ERROR
"Cannot find threading library. Caffe2 requires Threads to compile.")
@ -661,7 +705,7 @@ if(BUILD_TEST OR BUILD_MOBILE_BENCHMARK OR BUILD_MOBILE_TEST)
# We need to replace googletest cmake scripts too.
# Otherwise, it will sometimes break the build.
# To make the git clean after the build, we make a backup first.
if(MSVC AND MSVC_Z7_OVERRIDE)
if((MSVC AND MSVC_Z7_OVERRIDE) OR USE_CUDA)
execute_process(
COMMAND ${CMAKE_COMMAND}
"-DFILENAME=${CMAKE_CURRENT_LIST_DIR}/../third_party/googletest/googletest/cmake/internal_utils.cmake"
@ -1181,50 +1225,6 @@ if(USE_LLVM)
endif(LLVM_FOUND)
endif(USE_LLVM)
# ---[ CUDA
if(USE_CUDA)
# public/*.cmake uses CAFFE2_USE_*
set(CAFFE2_USE_CUDA ${USE_CUDA})
set(CAFFE2_USE_CUDNN ${USE_CUDNN})
set(CAFFE2_USE_NVRTC ${USE_NVRTC})
set(CAFFE2_USE_TENSORRT ${USE_TENSORRT})
include(${CMAKE_CURRENT_LIST_DIR}/public/cuda.cmake)
if(CAFFE2_USE_CUDA)
# A helper variable recording the list of Caffe2 dependent libraries
# torch::cudart is dealt with separately, due to CUDA_ADD_LIBRARY
# design reason (it adds CUDA_LIBRARIES itself).
set(Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS
caffe2::cufft caffe2::curand caffe2::cublas)
if(CAFFE2_USE_NVRTC)
list(APPEND Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS caffe2::cuda caffe2::nvrtc)
else()
caffe2_update_option(USE_NVRTC OFF)
endif()
if(CAFFE2_USE_CUDNN)
list(APPEND Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS caffe2::cudnn-public)
else()
caffe2_update_option(USE_CUDNN OFF)
endif()
if(CAFFE2_USE_TENSORRT)
list(APPEND Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS caffe2::tensorrt)
else()
caffe2_update_option(USE_TENSORRT OFF)
endif()
else()
message(WARNING
"Not compiling with CUDA. Suppress this warning with "
"-DUSE_CUDA=OFF.")
caffe2_update_option(USE_CUDA OFF)
caffe2_update_option(USE_CUDNN OFF)
caffe2_update_option(USE_NVRTC OFF)
caffe2_update_option(USE_TENSORRT OFF)
set(CAFFE2_USE_CUDA OFF)
set(CAFFE2_USE_CUDNN OFF)
set(CAFFE2_USE_NVRTC OFF)
set(CAFFE2_USE_TENSORRT OFF)
endif()
endif()
# ---[ cuDNN
if(USE_CUDNN)
set(CUDNN_FRONTEND_INCLUDE_DIR ${CMAKE_CURRENT_LIST_DIR}/../third_party/cudnn_frontend/include)
@ -1371,6 +1371,8 @@ if(USE_GLOO)
set(ENV{GLOO_ROCM_ARCH} "${PYTORCH_ROCM_ARCH}")
endif()
if(NOT USE_SYSTEM_GLOO)
# gloo uses cuda_add_library
torch_update_find_cuda_flags()
add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/../third_party/gloo)
else()
add_library(gloo SHARED IMPORTED)
@ -1417,6 +1419,8 @@ if(USE_DISTRIBUTED AND USE_TENSORPIPE)
set(TP_BUILD_LIBUV ON CACHE BOOL "" FORCE)
set(TP_STATIC_OR_SHARED STATIC CACHE STRING "" FORCE)
# Tensorpipe uses cuda_add_library
torch_update_find_cuda_flags()
add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/tensorpipe)
list(APPEND Caffe2_DEPENDENCY_LIBS tensorpipe)
@ -1560,7 +1564,6 @@ function(add_onnx_tensorrt_subdir)
endfunction()
if(CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO)
if(USE_TENSORRT)
set(CMAKE_CUDA_COMPILER ${CUDA_NVCC_EXECUTABLE})
add_onnx_tensorrt_subdir()
include_directories("${CMAKE_CURRENT_LIST_DIR}/../third_party/onnx-tensorrt")
caffe2_interface_library(nvonnxparser_static onnx_trt_library)
@ -1579,8 +1582,7 @@ endif()
if(NOT INTERN_BUILD_MOBILE)
set(TORCH_CUDA_ARCH_LIST $ENV{TORCH_CUDA_ARCH_LIST})
set(TORCH_NVCC_FLAGS $ENV{TORCH_NVCC_FLAGS})
separate_arguments(TORCH_NVCC_FLAGS)
string(APPEND CMAKE_CUDA_FLAGS " $ENV{TORCH_NVCC_FLAGS}")
set(CMAKE_POSITION_INDEPENDENT_CODE TRUE)
# Top-level build config
@ -1599,7 +1601,7 @@ if(NOT INTERN_BUILD_MOBILE)
if(MSVC)
# we want to respect the standard, and we are bored of those **** .
add_definitions(-D_CRT_SECURE_NO_DEPRECATE=1)
list(APPEND CUDA_NVCC_FLAGS "-Xcompiler=/wd4819,/wd4503,/wd4190,/wd4244,/wd4251,/wd4275,/wd4522")
string(APPEND CMAKE_CUDA_FLAGS " -Xcompiler=/wd4819,/wd4503,/wd4190,/wd4244,/wd4251,/wd4275,/wd4522")
endif()
if(NOT MSVC)
@ -1610,22 +1612,19 @@ if(NOT INTERN_BUILD_MOBILE)
endif()
endif()
list(APPEND CUDA_NVCC_FLAGS -Wno-deprecated-gpu-targets)
list(APPEND CUDA_NVCC_FLAGS --expt-extended-lambda)
string(APPEND CMAKE_CUDA_FLAGS " -Wno-deprecated-gpu-targets --expt-extended-lambda")
if(NOT CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
set(CMAKE_CXX_STANDARD 14)
endif()
list(APPEND CUDA_NVCC_FLAGS ${TORCH_NVCC_FLAGS})
if(CMAKE_POSITION_INDEPENDENT_CODE AND NOT MSVC)
list(APPEND CUDA_NVCC_FLAGS "-Xcompiler" "-fPIC")
endif()
if(CUDA_HAS_FP16 OR NOT ${CUDA_VERSION} LESS 7.5)
message(STATUS "Found CUDA with FP16 support, compiling with torch.cuda.HalfTensor")
list(APPEND CUDA_NVCC_FLAGS "-DCUDA_HAS_FP16=1" "-D__CUDA_NO_HALF_OPERATORS__" "-D__CUDA_NO_HALF_CONVERSIONS__"
"-D__CUDA_NO_BFLOAT16_CONVERSIONS__" "-D__CUDA_NO_HALF2_OPERATORS__")
string(APPEND CMAKE_CUDA_FLAGS " -DCUDA_HAS_FP16=1"
" -D__CUDA_NO_HALF_OPERATORS__"
" -D__CUDA_NO_HALF_CONVERSIONS__"
" -D__CUDA_NO_HALF2_OPERATORS__"
" -D__CUDA_NO_BFLOAT16_CONVERSIONS__")
add_compile_options(-DCUDA_HAS_FP16=1)
else()
message(STATUS "Could not find CUDA with FP16 support, compiling without torch.CudaHalfTensor")

View File

@ -20,5 +20,6 @@ else(REVERT)
file(READ ${FILENAME} content)
file(WRITE ${BACKUP} "${content}")
string(REGEX REPLACE "[-/]Z[iI]" "/Z7" content "${content}")
string(REGEX REPLACE "Threads::Threads" "caffe2::Threads" content "${content}")
file(WRITE ${FILENAME} "${content}")
endif(REVERT)

View File

@ -3,6 +3,7 @@
# CUB_INCLUDE_DIRS - the CUB include directory
find_path(CUB_INCLUDE_DIR
HINTS "${CUDA_TOOLKIT_INCLUDE}"
NAMES cub/cub.cuh
DOC "The directory where CUB includes reside"
)

View File

@ -103,9 +103,10 @@ function(caffe2_print_configuration_summary)
message(STATUS " nvrtc : ${__tmp}")
message(STATUS " CUDA include path : ${CUDA_INCLUDE_DIRS}")
message(STATUS " NVCC executable : ${CUDA_NVCC_EXECUTABLE}")
message(STATUS " NVCC flags : ${CUDA_NVCC_FLAGS}")
message(STATUS " CUDA host compiler : ${CUDA_HOST_COMPILER}")
message(STATUS " NVCC --device-c : ${CUDA_SEPARABLE_COMPILATION}")
message(STATUS " CUDA compiler : ${CMAKE_CUDA_COMPILER}")
message(STATUS " CUDA flags : ${CMAKE_CUDA_FLAGS}")
message(STATUS " CUDA host compiler : ${CMAKE_CUDA_HOST_COMPILER}")
message(STATUS " CUDA --device-c : ${CUDA_SEPARABLE_COMPILATION}")
message(STATUS " USE_TENSORRT : ${USE_TENSORRT}")
if(${USE_TENSORRT})
message(STATUS " TensorRT runtime library: ${TENSORRT_LIBRARY}")

View File

@ -35,6 +35,13 @@ if(NOT CUDA_FOUND)
set(CAFFE2_USE_CUDA OFF)
return()
endif()
# Enable CUDA language support
set(CUDAToolkit_ROOT "${CUDA_TOOLKIT_ROOT_DIR}")
enable_language(CUDA)
set(CMAKE_CUDA_STANDARD ${CMAKE_CXX_STANDARD})
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
message(STATUS "Caffe2: CUDA detected: " ${CUDA_VERSION})
message(STATUS "Caffe2: CUDA nvcc is: " ${CUDA_NVCC_EXECUTABLE})
message(STATUS "Caffe2: CUDA toolkit directory: " ${CUDA_TOOLKIT_ROOT_DIR})
@ -435,6 +442,8 @@ endif()
# setting nvcc arch flags
torch_cuda_get_nvcc_gencode_flag(NVCC_FLAGS_EXTRA)
# CMake 3.18 adds integrated support for architecture selection, but we can't rely on it
set(CMAKE_CUDA_ARCHITECTURES OFF)
list(APPEND CUDA_NVCC_FLAGS ${NVCC_FLAGS_EXTRA})
message(STATUS "Added CUDA NVCC flags for: ${NVCC_FLAGS_EXTRA}")
@ -453,14 +462,10 @@ endforeach()
string(REPLACE ";" "," SUPPRESS_WARNING_FLAGS "${SUPPRESS_WARNING_FLAGS}")
list(APPEND CUDA_NVCC_FLAGS -Xcudafe ${SUPPRESS_WARNING_FLAGS})
# Set C++14 support
set(CUDA_PROPAGATE_HOST_FLAGS_BLOCKLIST "-Werror")
if(MSVC)
list(APPEND CUDA_NVCC_FLAGS "--Werror" "cross-execution-space-call")
list(APPEND CUDA_NVCC_FLAGS "--no-host-device-move-forward")
else()
list(APPEND CUDA_NVCC_FLAGS "-std=c++14")
list(APPEND CUDA_NVCC_FLAGS "-Xcompiler" "-fPIC")
endif()
# OpenMP flags for NVCC with Clang-cl
@ -477,9 +482,15 @@ endif()
# Debug and Release symbol support
if(MSVC)
if(${CAFFE2_USE_MSVC_STATIC_RUNTIME})
list(APPEND CUDA_NVCC_FLAGS "-Xcompiler" "-MT$<$<CONFIG:Debug>:d>")
string(APPEND CMAKE_CUDA_FLAGS_DEBUG " -Xcompiler /MTd")
string(APPEND CMAKE_CUDA_FLAGS_MINSIZEREL " -Xcompiler /MT")
string(APPEND CMAKE_CUDA_FLAGS_RELEASE " -Xcompiler /MT")
string(APPEND CMAKE_CUDA_FLAGS_RELWITHDEBINFO " -Xcompiler /MT")
else()
list(APPEND CUDA_NVCC_FLAGS "-Xcompiler" "-MD$<$<CONFIG:Debug>:d>")
string(APPEND CMAKE_CUDA_FLAGS_DEBUG " -Xcompiler /MDd")
string(APPEND CMAKE_CUDA_FLAGS_MINSIZEREL " -Xcompiler /MD")
string(APPEND CMAKE_CUDA_FLAGS_RELEASE " -Xcompiler /MD")
string(APPEND CMAKE_CUDA_FLAGS_RELWITHDEBINFO " -Xcompiler /MD")
endif()
if(CUDA_NVCC_FLAGS MATCHES "Zi")
list(APPEND CUDA_NVCC_FLAGS "-Xcompiler" "-FS")
@ -493,3 +504,11 @@ list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr")
# Set expt-extended-lambda to support lambda on device
list(APPEND CUDA_NVCC_FLAGS "--expt-extended-lambda")
foreach(FLAG ${CUDA_NVCC_FLAGS})
string(FIND "${FLAG}" " " flag_space_position)
if(NOT flag_space_position EQUAL -1)
message(FATAL_ERROR "Found spaces in CUDA_NVCC_FLAGS entry '${FLAG}'")
endif()
string(APPEND CMAKE_CUDA_FLAGS " ${FLAG}")
endforeach()

View File

@ -1,16 +1,29 @@
if(TARGET caffe2::Threads)
return()
endif()
find_package(Threads REQUIRED)
# For newer CMake, Threads::Threads is already defined. Otherwise, we will
# provide a backward compatible wrapper for Threads::Threads.
if(THREADS_FOUND AND NOT TARGET Threads::Threads)
add_library(Threads::Threads INTERFACE IMPORTED)
# Threads::Threads doesn't work if the target has CUDA code
if(THREADS_FOUND)
add_library(caffe2::Threads INTERFACE IMPORTED)
if(THREADS_HAVE_PTHREAD_ARG)
set_property(TARGET Threads::Threads
PROPERTY INTERFACE_COMPILE_OPTIONS "-pthread")
set(compile_options
$<$<COMPILE_LANGUAGE:C>:-pthread>
$<$<COMPILE_LANGUAGE:CXX>:-pthread>)
if(USE_CUDA)
list(APPEND compile_options
$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler -pthread>)
endif()
set_property(TARGET caffe2::Threads
PROPERTY INTERFACE_COMPILE_OPTIONS
${compile_options})
endif()
if(CMAKE_THREAD_LIBS_INIT)
set_property(TARGET Threads::Threads
set_property(TARGET caffe2::Threads
PROPERTY INTERFACE_LINK_LIBRARIES "${CMAKE_THREAD_LIBS_INIT}")
endif()
endif()

View File

@ -348,7 +348,7 @@ macro(torch_cuda_based_add_library cuda_target)
if(USE_ROCM)
hip_add_library(${cuda_target} ${ARGN})
elseif(USE_CUDA)
cuda_add_library(${cuda_target} ${ARGN})
add_library(${cuda_target} ${ARGN})
else()
endif()
endmacro()
@ -388,10 +388,11 @@ endmacro()
# torch_compile_options(lib_name)
function(torch_compile_options libname)
set_property(TARGET ${libname} PROPERTY CXX_STANDARD 14)
set(private_compile_options "")
# ---[ Check if warnings should be errors.
if(WERROR)
target_compile_options(${libname} PRIVATE -Werror)
list(APPEND private_compile_options -Werror)
endif()
if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE)
@ -405,38 +406,50 @@ function(torch_compile_options libname)
endif()
target_compile_options(${libname} PUBLIC
${MSVC_RUNTIME_LIBRARY_OPTION}
$<$<OR:$<CONFIG:Debug>,$<CONFIG:RelWithDebInfo>>:${MSVC_DEBINFO_OPTION}>
/EHsc
/DNOMINMAX
/wd4267
/wd4251
/wd4522
/wd4522
/wd4838
/wd4305
/wd4244
/wd4190
/wd4101
/wd4996
/wd4275
/bigobj
$<$<COMPILE_LANGUAGE:CXX>:
${MSVC_RUNTIME_LIBRARY_OPTION}
$<$<OR:$<CONFIG:Debug>,$<CONFIG:RelWithDebInfo>>:${MSVC_DEBINFO_OPTION}>
/EHsc
/DNOMINMAX
/wd4267
/wd4251
/wd4522
/wd4522
/wd4838
/wd4305
/wd4244
/wd4190
/wd4101
/wd4996
/wd4275
/bigobj>
)
else()
target_compile_options(${libname} PRIVATE
list(APPEND private_compile_options
-Wall
-Wextra
-Wno-unused-parameter
-Wno-unused-variable
-Wno-unused-function
-Wno-unused-result
-Wno-unused-local-typedefs
-Wno-missing-field-initializers
-Wno-write-strings
-Wno-unknown-pragmas
-Wno-type-limits
-Wno-array-bounds
-Wno-unknown-pragmas
-Wno-sign-compare
-Wno-strict-overflow
-Wno-strict-aliasing
-Wno-error=deprecated-declarations
# Clang has an unfixed bug leading to spurious missing braces
# warnings, see https://bugs.llvm.org/show_bug.cgi?id=21629
-Wno-missing-braces
)
if(NOT APPLE)
target_compile_options(${libname} PRIVATE
list(APPEND private_compile_options
# Considered to be flaky. See the discussion at
# https://github.com/pytorch/pytorch/pull/9608
-Wno-maybe-uninitialized)
@ -446,10 +459,23 @@ function(torch_compile_options libname)
if(MSVC)
elseif(WERROR)
target_compile_options(${libname} PRIVATE -Wno-strict-overflow)
list(APPEND private_compile_options -Wno-strict-overflow)
endif()
endif()
target_compile_options(${libname} PRIVATE
$<$<COMPILE_LANGUAGE:CXX>:${private_compile_options}>)
if(USE_CUDA)
string(FIND "${private_compile_options}" " " space_position)
if(NOT space_position EQUAL -1)
message(FATAL_ERROR "Found spaces in private_compile_options='${private_compile_options}'")
endif()
# Convert CMake list to comma-separated list
string(REPLACE ";" "," private_compile_options "${private_compile_options}")
target_compile_options(${libname} PRIVATE
$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=${private_compile_options}>)
endif()
if(NOT WIN32 AND NOT USE_ASAN)
# Enable hidden visibility by default to make it easier to debug issues with
# TORCH_API annotations. Hidden visibility with selective default visibility
@ -458,11 +484,13 @@ function(torch_compile_options libname)
# Unfortunately, hidden visibility messes up some ubsan warnings because
# templated classes crossing library boundary get duplicated (but identical)
# definitions. It's easier to just disable it.
target_compile_options(${libname} PRIVATE "-fvisibility=hidden")
target_compile_options(${libname} PRIVATE
$<$<COMPILE_LANGUAGE:CXX>: -fvisibility=hidden>)
endif()
# Use -O2 for release builds (-O3 doesn't improve perf, and -Os results in perf regression)
target_compile_options(${libname} PRIVATE "$<$<OR:$<CONFIG:Release>,$<CONFIG:RelWithDebInfo>>:-O2>")
target_compile_options(${libname} PRIVATE
$<$<AND:$<COMPILE_LANGUAGE:CXX>,$<OR:$<CONFIG:Release>,$<CONFIG:RelWithDebInfo>>>:-O2>)
endfunction()
@ -484,3 +512,40 @@ function(torch_set_target_props libname)
set_target_properties(${libname} PROPERTIES STATIC_LIBRARY_FLAGS_DEBUG "/NODEFAULTLIB:${VCOMP_LIB}d")
endif()
endfunction()
##############################################################################
# Set old-style FindCUDA.cmake compile flags from the modern CMake CUDA flags,
# for third-party subprojects that still build CUDA code via cuda_add_library().
# Converts e.g. `-O2 -Xcompiler="-O2 -Wall"` to `-O2;-Xcompiler=-O2,-Wall` for
# CUDA_NVCC_FLAGS and each per-configuration variant, exporting the results to
# the caller's scope. No-op when USE_CUDA is off.
# Usage:
#   torch_update_find_cuda_flags()
function(torch_update_find_cuda_flags)
  if(NOT USE_CUDA)
    return()
  endif()
  set(summary "Converting CMAKE_CUDA_FLAGS to CUDA_NVCC_FLAGS:\n")
  # Base (configuration-independent) flags.
  # UNIX_COMMAND splitting turns the flag string into a ;-list; spaces that
  # came from quoted sub-arguments are then rewritten to nvcc's comma syntax.
  separate_arguments(flags UNIX_COMMAND "${CMAKE_CUDA_FLAGS}")
  string(REPLACE " " "," flags "${flags}")
  # Quoted so an empty flag string sets an empty variable instead of unsetting it.
  set(CUDA_NVC C_FLAGS "${flags}" PARENT_SCOPE)
  string(APPEND summary "  CUDA_NVCC_FLAGS = ${flags}\n")
  # Same conversion for every per-configuration flag variable.
  foreach(config DEBUG RELEASE RELWITHDEBINFO MINSIZEREL)
    separate_arguments(flags UNIX_COMMAND "${CMAKE_CUDA_FLAGS_${config}}")
    string(REPLACE " " "," flags "${flags}")
    set(CUDA_NVCC_FLAGS_${config} "${flags}" PARENT_SCOPE)
    string(APPEND summary "  CUDA_NVCC_FLAGS_${config} = ${flags}\n")
  endforeach()
  message(STATUS "${summary}")
endfunction()

View File

@ -10,7 +10,7 @@ if(BUILD_CAFFE2_OPS)
# Note(ilijar): Since Detectron ops currently have no
# CPU implementation, we only build GPU ops for now.
if(USE_CUDA)
CUDA_ADD_LIBRARY(
add_library(
caffe2_detectron_ops_gpu SHARED
${Detectron_CPU_SRCS}
${Detectron_GPU_SRCS})

View File

@ -1,5 +1,5 @@
if(USE_CUDA)
cuda_add_library(c10d_cuda_test CUDATest.cu)
add_library(c10d_cuda_test CUDATest.cu)
target_include_directories(c10d_cuda_test PRIVATE $<BUILD_INTERFACE:${TORCH_SRC_DIR}/csrc/distributed>)
target_link_libraries(c10d_cuda_test torch_cuda)
add_dependencies(c10d_cuda_test torch_cuda)

View File

@ -243,7 +243,7 @@ class CMake:
var: var for var in
('BLAS',
'BUILDING_WITH_TORCH_LIBS',
'CUDA_HOST_COMPILER',
'CUDA_HOST_COMILER',
'CUDA_NVCC_EXECUTABLE',
'CUDA_SEPARABLE_COMPILATION',
'CUDNN_LIBRARY',
@ -267,6 +267,15 @@ class CMake:
'OPENSSL_ROOT_DIR')
})
# Aliases which are lower priority than their canonical option
low_priority_aliases = {
'CUDA_HOST_COMPILER': 'CMAKE_CUDA_HOST_COMPILER',
'CUDAHOSTCXX': 'CUDA_HOST_COMPILER',
'CMAKE_CUDA_HOST_COMPILER': 'CUDA_HOST_COMPILER',
'CMAKE_CUDA_COMPILER': 'CUDA_NVCC_EXECUTABLE',
'CUDACXX': 'CUDA_NVCC_EXECUTABLE'
}
for var, val in my_env.items():
# We currently pass over all environment variables that start with "BUILD_", "USE_", and "CMAKE_". This is
# because we currently have no reliable way to get the list of all build options we have specified in
@ -279,6 +288,11 @@ class CMake:
elif var.startswith(('BUILD_', 'USE_', 'CMAKE_')) or var.endswith(('EXITCODE', 'EXITCODE__TRYRUN_OUTPUT')):
build_options[var] = val
if var in low_priority_aliases:
key = low_priority_aliases[var]
if key not in build_options:
build_options[key] = val
# The default value cannot be easily obtained in CMakeLists.txt. We set it here.
py_lib_path = sysconfig.get_path('purelib')
cmake_prefix_path = build_options.get('CMAKE_PREFIX_PATH', None)

View File

@ -111,6 +111,7 @@ else()
endif()
if(USE_CUDA)
include(${TORCH_ROOT}/cmake/public/cuda.cmake)
append_filelist("libtorch_python_cuda_core_sources" TORCH_PYTHON_SRCS)
list(APPEND TORCH_PYTHON_SRCS ${GENERATED_THNN_CXX_CUDA})
@ -119,16 +120,7 @@ if(USE_CUDA)
list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_CUDNN)
endif()
if(MSVC)
list(APPEND TORCH_PYTHON_LINK_LIBRARIES ${NVTOOLEXT_HOME}/lib/x64/nvToolsExt64_1.lib)
list(APPEND TORCH_PYTHON_INCLUDE_DIRECTORIES "${NVTOOLEXT_HOME}/include")
elseif(APPLE)
list(APPEND TORCH_PYTHON_LINK_LIBRARIES ${CUDA_TOOLKIT_ROOT_DIR}/lib/libnvToolsExt.dylib)
else()
find_library(LIBNVTOOLSEXT libnvToolsExt.so PATHS ${CUDA_TOOLKIT_ROOT_DIR}/lib64/)
list(APPEND TORCH_PYTHON_LINK_LIBRARIES ${LIBNVTOOLSEXT})
endif()
list(APPEND TORCH_PYTHON_LINK_LIBRARIES torch::nvtoolsext)
endif()
if(USE_ROCM)

View File

@ -67,13 +67,13 @@ if(UNIX AND NOT APPLE)
# site above though in case there was a reason we were testing
# against clock_gettime. In principle, the choice of symbol you
# test for shouldn't matter.
set(CMAKE_REQUIRED_LIBRARIES Threads::Threads)
set(CMAKE_REQUIRED_LIBRARIES caffe2::Threads)
check_library_exists(rt shm_open "sys/mman.h" NEED_RT_AND_PTHREAD)
unset(CMAKE_REQUIRED_LIBRARIES)
if(NEED_RT_AND_PTHREAD)
message(STATUS "Needs it, linking against pthread and rt")
target_link_libraries(shm rt Threads::Threads)
target_link_libraries(torch_shm_manager rt Threads::Threads)
target_link_libraries(shm rt caffe2::Threads)
target_link_libraries(torch_shm_manager rt caffe2::Threads)
endif()
endif()
endif()