Summary: This PR contains the initial version of the Vulkan (GPU) backend integration. The primary target environment is Android, but the desktop build is also supported.

## CMake

Introducing three CMake options:

- `USE_VULKAN`: The main switch; if it is off, the other options have no effect.
- `USE_VULKAN_WRAPPER`: ON - Vulkan is loaded at runtime as "libvulkan.so" using libdl, with every function call wrapped in vulkan_wrapper.h. OFF - link against libvulkan.so directly.
- `USE_VULKAN_SHADERC_RUNTIME`: ON - the shader compilation library is linked and shaders are compiled at runtime. OFF - shaders are precompiled and the shader compilation library is not included.

## Codegen

Shader codegen starts in cmake/VulkanCodegen.cmake, which calls `aten/src/ATen/native/vulkan/gen_glsl.py` or `aten/src/ATen/native/vulkan/gen_spv.py` to embed the shader sources or SPIR-V bytecode in the binary as uint32_t arrays.

If `USE_VULKAN_SHADERC_RUNTIME` is ON: the shader sources are included as `glsl.h`, `glsl.cpp` and compiled at runtime.
If `USE_VULKAN_SHADERC_RUNTIME` is OFF: shaders are precompiled and the SPIR-V bytecode is included as `spv.h`, `spv.cpp`.

All codegen results land in the build directory.

## Build dependencies

cmake/Dependencies.cmake: If the target platform is Android, the Vulkan library, headers, and Vulkan wrapper are taken from the ANDROID_NDK. The desktop build requires the VULKAN_SDK environment variable and takes all Vulkan dependencies from it. (The desktop build was tested only on Linux.)

## PyTorch integration

Adding `Vulkan` as a new Backend, DispatchKey, and DeviceType. We use the Strided layout without supporting strides at the moment, but we plan to support them in the future. Using OpaqueTensorImpl, where OpaqueHandle is a copyable VulkanTensor; more details in the comments in `aten/src/ATen/native/vulkan/Vulkan.h`.

Main code location: `aten/src/ATen/native/vulkan`

- `aten/src/ATen/native/vulkan/VulkanAten.cpp` - the connection between ATen and the Vulkan API (Vulkan.h); converts at::Tensor to VulkanTensor.
- `aten/src/ATen/native/vulkan/Vulkan.h` - the Vulkan API that contains the VulkanTensor representation and functions to work with it. We plan to expose it so clients can write their own Vulkan ops.
- `aten/src/ATen/native/vulkan/VulkanOps.cpp` - Vulkan operation implementations that use the Vulkan.h API.

## GLSL shaders

Located in `aten/src/ATen/native/vulkan/glsl` as *.glsl files. All shaders use Vulkan specialization constants (ids 1, 2, 3) for workgroup sizes.

## Supported operations

- conv2d (no groups)
- conv2d (depthwise)
- addmm
- upsample nearest 2d
- clamp
- hardtanh

## Testing

`aten/src/ATen/test/vulkan_test.cpp` contains tests for:

- copy from CPU to Vulkan and back
- all supported operations

Desktop builds are supported, so testing can be done on a desktop that has a Vulkan-capable GPU or an installed software implementation of Vulkan, such as https://github.com/google/swiftshader.

## Vulkan execution

The initial implementation is trivial and waits for every operator's execution to finish.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/36491

Differential Revision: D21696709

Pulled By: IvanKobzarev

fbshipit-source-id: da3e5a770b1a1995e9465d7e81963e7de56217fa
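A minimal sketch of how the three switches could be declared so that the sub-options are inert when `USE_VULKAN` is off (illustrative only, not the PR's actual wiring; the defaults shown are assumptions):

```cmake
include(CMakeDependentOption)

# Main switch: when OFF, the two dependent options below are forced OFF.
option(USE_VULKAN "Enable the Vulkan (GPU) backend" OFF)

# ON: load libvulkan.so at runtime through vulkan_wrapper.h (via libdl).
# OFF: link against libvulkan.so directly.
cmake_dependent_option(USE_VULKAN_WRAPPER
    "Load Vulkan at runtime via vulkan_wrapper.h" ON
    "USE_VULKAN" OFF)

# ON: link the shader compiler and compile GLSL shaders at runtime.
# OFF: precompile shaders to SPIR-V at build time.
cmake_dependent_option(USE_VULKAN_SHADERC_RUNTIME
    "Compile Vulkan shaders at runtime" OFF
    "USE_VULKAN" OFF)
```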
251 lines · 10 KiB · CMake
# This ill-named file does a number of things:
# - Installs Caffe2 header files (this has nothing to do with code generation)
# - Configures caffe2/core/macros.h
# - Creates an ATen target for its generated C++ files and adds it
#   as a dependency
# - Reads build lists defined in build_variables.bzl

################################################################################
# Helper functions
################################################################################

function(filter_list output input)
  unset(result)
  foreach(filename ${${input}})
    foreach(pattern ${ARGN})
      if("${filename}" MATCHES "${pattern}")
        list(APPEND result "${filename}")
      endif()
    endforeach()
  endforeach()
  set(${output} ${result} PARENT_SCOPE)
endfunction()

function(filter_list_exclude output input)
  unset(result)
  foreach(filename ${${input}})
    # A file is kept only if it matches none of the exclusion patterns.
    set(_excluded FALSE)
    foreach(pattern ${ARGN})
      if("${filename}" MATCHES "${pattern}")
        set(_excluded TRUE)
      endif()
    endforeach()
    if(NOT _excluded)
      list(APPEND result "${filename}")
    endif()
  endforeach()
  set(${output} ${result} PARENT_SCOPE)
endfunction()

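# Usage sketch (hypothetical variable names and patterns, for illustration):
#   filter_list(avx_srcs all_srcs "AVX\\.cpp$")          # keep only AVX sources
#   filter_list_exclude(non_cuda_srcs all_srcs "cuda")   # drop anything matching "cuda"
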
################################################################################

# ---[ Write the macros file
configure_file(
    ${CMAKE_CURRENT_LIST_DIR}/../caffe2/core/macros.h.in
    ${CMAKE_BINARY_DIR}/caffe2/core/macros.h)

# ---[ Installing the header files
install(DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/../caffe2
        DESTINATION include
        FILES_MATCHING PATTERN "*.h")
if(NOT INTERN_BUILD_ATEN_OPS)
  install(DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/core
          DESTINATION include/ATen
          FILES_MATCHING PATTERN "*.h")
endif()
install(FILES ${CMAKE_BINARY_DIR}/caffe2/core/macros.h
        DESTINATION include/caffe2/core)

# ---[ ATen specific
if(INTERN_BUILD_ATEN_OPS)
  if(MSVC)
    set(OPT_FLAG "/fp:strict ")
  else()
    set(OPT_FLAG "-O3 ")
    if("${CMAKE_BUILD_TYPE}" MATCHES "Debug")
      set(OPT_FLAG " ")
    endif()
  endif()

  if(C_AVX_FOUND)
    if(MSVC)
      set_source_files_properties(${CMAKE_CURRENT_LIST_DIR}/../aten/src/TH/vector/AVX.cpp PROPERTIES COMPILE_FLAGS "${OPT_FLAG}/arch:AVX ${CXX_AVX_FLAGS}")
    else()
      set_source_files_properties(${CMAKE_CURRENT_LIST_DIR}/../aten/src/TH/vector/AVX.cpp PROPERTIES COMPILE_FLAGS "${OPT_FLAG} ${CXX_AVX_FLAGS}")
    endif()
  endif()

  if(NOT MSVC AND NOT "${CMAKE_C_COMPILER_ID}" MATCHES "Clang")
    set_source_files_properties(${CMAKE_CURRENT_LIST_DIR}/../aten/src/TH/THAllocator.cpp PROPERTIES COMPILE_FLAGS "-fno-openmp")
  endif()

  file(GLOB cpu_kernel_cpp_in "${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/native/cpu/*.cpp" "${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/native/quantized/cpu/kernels/*.cpp")

  list(APPEND CPU_CAPABILITY_NAMES "DEFAULT")
  list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG}")

  if(CXX_AVX_FOUND)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_AVX_CPU_DEFINITION")
    list(APPEND CPU_CAPABILITY_NAMES "AVX")
    if(MSVC)
      list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG}/arch:AVX")
    else()
      list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG} -mavx")
    endif()
  endif()

  if(CXX_AVX2_FOUND)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_AVX2_CPU_DEFINITION")

    # Some versions of GCC pessimistically split unaligned load and store
    # instructions when using the default tuning. This is a bad choice on
    # new Intel and AMD processors so we disable it when compiling with AVX2.
    # See https://stackoverflow.com/questions/52626726/why-doesnt-gcc-resolve-mm256-loadu-pd-as-single-vmovupd#tab-top
    include(CheckCXXCompilerFlag)  # provides check_cxx_compiler_flag
    check_cxx_compiler_flag("-mno-avx256-split-unaligned-load -mno-avx256-split-unaligned-store" COMPILER_SUPPORTS_NO_AVX256_SPLIT)
    if(COMPILER_SUPPORTS_NO_AVX256_SPLIT)
      set(CPU_NO_AVX256_SPLIT_FLAGS "-mno-avx256-split-unaligned-load -mno-avx256-split-unaligned-store")
    endif()

    list(APPEND CPU_CAPABILITY_NAMES "AVX2")
    if(MSVC)
      list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG}/arch:AVX2")
    else()
      list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG} -mavx2 -mfma ${CPU_NO_AVX256_SPLIT_FLAGS}")
    endif()
  endif()

  list(LENGTH CPU_CAPABILITY_NAMES NUM_CPU_CAPABILITY_NAMES)
  math(EXPR NUM_CPU_CAPABILITY_NAMES "${NUM_CPU_CAPABILITY_NAMES}-1")

  # Copy each CPU kernel source into the build tree once per capability and
  # compile every copy with that capability's flags plus CPU_CAPABILITY defines.
  foreach(i RANGE ${NUM_CPU_CAPABILITY_NAMES})
    foreach(IMPL ${cpu_kernel_cpp_in})
      string(REPLACE "${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/" "" NAME ${IMPL})
      list(GET CPU_CAPABILITY_NAMES ${i} CPU_CAPABILITY)
      set(NEW_IMPL ${CMAKE_BINARY_DIR}/aten/src/ATen/${NAME}.${CPU_CAPABILITY}.cpp)
      configure_file(${IMPL} ${NEW_IMPL} COPYONLY)
      set(cpu_kernel_cpp ${NEW_IMPL} ${cpu_kernel_cpp}) # Create list of copies
      list(GET CPU_CAPABILITY_FLAGS ${i} FLAGS)
      if(MSVC)
        set(EXTRA_FLAGS "/DCPU_CAPABILITY=${CPU_CAPABILITY} /DCPU_CAPABILITY_${CPU_CAPABILITY}")
      else()
        set(EXTRA_FLAGS "-DCPU_CAPABILITY=${CPU_CAPABILITY} -DCPU_CAPABILITY_${CPU_CAPABILITY}")
      endif()
      # Disable certain warnings for GCC-9.X
      if(CMAKE_COMPILER_IS_GNUCXX AND (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 9.0.0))
        if(("${NAME}" STREQUAL "native/cpu/GridSamplerKernel.cpp") AND ("${CPU_CAPABILITY}" STREQUAL "DEFAULT"))
          # See https://github.com/pytorch/pytorch/issues/38855
          set(EXTRA_FLAGS "${EXTRA_FLAGS} -Wno-uninitialized")
        endif()
        if("${NAME}" STREQUAL "native/quantized/cpu/kernels/QuantizedOpKernels.cpp")
          # See https://github.com/pytorch/pytorch/issues/38854
          set(EXTRA_FLAGS "${EXTRA_FLAGS} -Wno-deprecated-copy")
        endif()
      endif()
      set_source_files_properties(${NEW_IMPL} PROPERTIES COMPILE_FLAGS "${FLAGS} ${EXTRA_FLAGS}")
    endforeach()
  endforeach()
  list(APPEND ATen_CPU_SRCS ${cpu_kernel_cpp})

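  # For example (illustrative file name, assuming AVX2 is available):
  # native/cpu/SoftMaxKernel.cpp is copied to
  #   ${CMAKE_BINARY_DIR}/aten/src/ATen/native/cpu/SoftMaxKernel.cpp.DEFAULT.cpp and
  #   ${CMAKE_BINARY_DIR}/aten/src/ATen/native/cpu/SoftMaxKernel.cpp.AVX2.cpp,
  # each built with the matching entry from CPU_CAPABILITY_FLAGS.
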
  set(cwrap_files
    ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/Declarations.cwrap
    ${CMAKE_CURRENT_LIST_DIR}/../aten/src/THCUNN/generic/THCUNN.h
    ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/nn.yaml
    ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/native/native_functions.yaml)

  file(GLOB all_python "${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/*.py")

  set(GEN_ROCM_FLAG)
  if(USE_ROCM)
    set(GEN_ROCM_FLAG --rocm)
  endif()

  set(CUSTOM_BUILD_FLAGS)
  if(INTERN_BUILD_MOBILE)
    if(USE_VULKAN)
      list(APPEND CUSTOM_BUILD_FLAGS --backend_whitelist CPU QuantizedCPU Vulkan)
    else()
      list(APPEND CUSTOM_BUILD_FLAGS --backend_whitelist CPU QuantizedCPU)
    endif()
  endif()

  if(SELECTED_OP_LIST)
    if(NOT USE_STATIC_DISPATCH AND NOT OP_DEPENDENCY)
      message(FATAL_ERROR "Must provide op dependency graph .yaml file for custom build with dynamic dispatch!")
    endif()
    execute_process(
      COMMAND
        "${PYTHON_EXECUTABLE}" ${CMAKE_CURRENT_LIST_DIR}/../tools/code_analyzer/gen_op_registration_whitelist.py
        --op-dependency "${OP_DEPENDENCY}"
        --root-ops "${SELECTED_OP_LIST}"
      OUTPUT_VARIABLE OP_REGISTRATION_WHITELIST
    )
    separate_arguments(OP_REGISTRATION_WHITELIST)
    message(STATUS "Custom build with op registration whitelist: ${OP_REGISTRATION_WHITELIST}")
    list(APPEND CUSTOM_BUILD_FLAGS
      --force_schema_registration
      --op_registration_whitelist ${OP_REGISTRATION_WHITELIST})
  endif()

  if(USE_VULKAN)
    set(GEN_VULKAN_FLAGS --vulkan)
  endif()

  set(GEN_COMMAND
    "${PYTHON_EXECUTABLE}" ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/gen.py
    --source-path ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen
    --install_dir ${CMAKE_BINARY_DIR}/aten/src/ATen
    ${GEN_ROCM_FLAG}
    ${cwrap_files}
    ${CUSTOM_BUILD_FLAGS}
    ${GEN_VULKAN_FLAGS}
  )

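  # For illustration, with USE_ROCM=OFF, USE_VULKAN=ON, and no custom build
  # flags, the command expands to roughly (paths abbreviated):
  #   python aten/src/ATen/gen.py --source-path aten/src/ATen \
  #     --install_dir <build>/aten/src/ATen <cwrap/yaml files> --vulkan
  # The exact expansion depends on the variables assembled above.
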
  execute_process(
    COMMAND ${GEN_COMMAND}
      --output-dependencies ${CMAKE_BINARY_DIR}/aten/src/ATen/generated_cpp.txt
    RESULT_VARIABLE RETURN_VALUE
  )
  if(NOT RETURN_VALUE EQUAL 0)
    # Report the exit code; generated_cpp has not been populated at this point.
    message(STATUS "gen.py returned: ${RETURN_VALUE}")
    message(FATAL_ERROR "Failed to get generated_cpp list")
  endif()
  # FIXME: the file/variable name says cpp, but these lists contain both .cpp and .h files
  file(READ ${CMAKE_BINARY_DIR}/aten/src/ATen/generated_cpp.txt generated_cpp)
  file(READ ${CMAKE_BINARY_DIR}/aten/src/ATen/generated_cpp.txt-cuda cuda_generated_cpp)
  file(READ ${CMAKE_BINARY_DIR}/aten/src/ATen/generated_cpp.txt-core core_generated_cpp)

  file(GLOB_RECURSE all_templates "${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/templates/*")

  file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/aten/src/ATen)
  file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/aten/src/ATen/core)

  add_custom_command(OUTPUT ${generated_cpp} ${cuda_generated_cpp} ${core_generated_cpp}
    COMMAND ${GEN_COMMAND}
    DEPENDS ${all_python} ${all_templates} ${cwrap_files})

  # Generated headers used from a CUDA (.cu) file are
  # not tracked correctly in CMake. We make libATen.so depend explicitly
  # on building the generated ATen files as a workaround.
  add_custom_target(ATEN_CPU_FILES_GEN_TARGET DEPENDS ${generated_cpp} ${core_generated_cpp})
  add_custom_target(ATEN_CUDA_FILES_GEN_TARGET DEPENDS ${cuda_generated_cpp})
  add_library(ATEN_CPU_FILES_GEN_LIB INTERFACE)
  add_library(ATEN_CUDA_FILES_GEN_LIB INTERFACE)
  add_dependencies(ATEN_CPU_FILES_GEN_LIB ATEN_CPU_FILES_GEN_TARGET)
  add_dependencies(ATEN_CUDA_FILES_GEN_LIB ATEN_CUDA_FILES_GEN_TARGET)
endif()

function(append_filelist name outputvar)
  set(_rootdir "${${CMAKE_PROJECT_NAME}_SOURCE_DIR}/")
  # configure_file adds its input to the list of CMAKE_RERUN dependencies
  configure_file(
    ${CMAKE_SOURCE_DIR}/tools/build_variables.bzl
    ${CMAKE_BINARY_DIR}/caffe2/build_variables.bzl)
  execute_process(
    COMMAND "${PYTHON_EXECUTABLE}" -c
      "exec(open('tools/build_variables.bzl').read());print(';'.join(['${_rootdir}' + x for x in ${name}]))"
    WORKING_DIRECTORY "${_rootdir}"
    RESULT_VARIABLE _retval
    OUTPUT_VARIABLE _tempvar)
  if(NOT _retval EQUAL 0)
    message(FATAL_ERROR "Failed to fetch filelist ${name} from build_variables.bzl")
  endif()
  string(REPLACE "\n" "" _tempvar "${_tempvar}")
  list(APPEND ${outputvar} ${_tempvar})
  set(${outputvar} "${${outputvar}}" PARENT_SCOPE)
endfunction()
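
# Usage sketch: pull a list defined in tools/build_variables.bzl into a CMake
# variable ("core_sources_common" is an assumed list name, for illustration):
#   append_filelist("core_sources_common" ATen_CORE_SRCS)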