pytorch/cmake/Modules/FindNCCL.cmake
suffian khan 92b95e5243 Fix NCCL version check when nccl.h in non-standard location. (#40982)
Summary:
The NCCL discovery process fails to compile detect_nccl_version.cc when nccl.h resides in a non-standard location.
Pass __NCCL_INCLUDE_DIRS__ to _try_run(... detect_nccl_version.cc)_ to fix this.

Can reproduce with Dockerfile ..
```Dockerfile
FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04 as build
WORKDIR /stage

# install conda
ARG CONDA_VERSION=4.7.10
ARG CONDA_URL=https://repo.anaconda.com/miniconda/Miniconda3-${CONDA_VERSION}-Linux-x86_64.sh
RUN cd /stage && curl -fSsL --insecure ${CONDA_URL} -o install-conda.sh &&\
    /bin/bash ./install-conda.sh -b -p /opt/conda &&\
    /opt/conda/bin/conda clean -ya
ENV PATH=/opt/conda/bin:${PATH}

# install prerequisites
RUN conda install numpy ninja pyyaml mkl mkl-include setuptools cmake cffi

# attempt compile
ENV CUDA_HOME="/usr/local/cuda" \
    CUDNN_LIBRARY="/usr/lib/x86_64-linux-gnu" \
    NCCL_INCLUDE_DIR="/usr/local/cuda/include" \
    NCCL_LIB_DIR="/usr/local/cuda/lib64" \
    USE_SYSTEM_NCCL=1
RUN apt-get -y update &&\
    apt-get -y install git &&\
    cd /stage && git clone https://github.com/pytorch/pytorch.git &&\
    cd pytorch &&\
    git submodule update --init --recursive &&\
    python setup.py bdist_wheel
```

This generates the following error ..
```
-- Found NCCL: /usr/local/cuda/include
-- Determining NCCL version from /usr/local/cuda/include/nccl.h...
-- Looking for NCCL_VERSION_CODE
-- Looking for NCCL_VERSION_CODE - found
CMake Error at cmake/Modules/FindNCCL.cmake:78 (message):
  Found NCCL header version and library version do not match! (include:
  /usr/local/cuda/include, library: /usr/local/cuda/lib64/libnccl.so) Please
  set NCCL_INCLUDE_DIR and NCCL_LIB_DIR manually.
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/40982

Reviewed By: zou3519

Differential Revision: D22603911

Pulled By: malfet

fbshipit-source-id: 084870375a270fb9c7daf3c2e731992a03614ad6
2020-07-17 13:54:17 -07:00

91 lines
3.6 KiB
CMake

# Find the nccl libraries
#
# The following variables are optionally searched for defaults
# NCCL_ROOT: Base directory where all NCCL components are found
# NCCL_INCLUDE_DIR: Directory where NCCL header is found
# NCCL_LIB_DIR: Directory where NCCL library is found
#
# The following are set after configuration is done:
# NCCL_FOUND
# NCCL_INCLUDE_DIRS
# NCCL_LIBRARIES
#
# The path hints include CUDA_TOOLKIT_ROOT_DIR seeing as some folks
# install NCCL in the same location as the CUDA toolkit.
# See https://github.com/caffe2/caffe2/issues/1601
set(NCCL_INCLUDE_DIR $ENV{NCCL_INCLUDE_DIR} CACHE PATH "Folder contains NVIDIA NCCL headers")
set(NCCL_LIB_DIR $ENV{NCCL_LIB_DIR} CACHE PATH "Folder contains NVIDIA NCCL libraries")
set(NCCL_VERSION $ENV{NCCL_VERSION} CACHE STRING "Version of NCCL to build with")
if ($ENV{NCCL_ROOT_DIR})
message(WARNING "NCCL_ROOT_DIR is deprecated. Please set NCCL_ROOT instead.")
endif()
list(APPEND NCCL_ROOT $ENV{NCCL_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR})
# Compatible layer for CMake <3.12. NCCL_ROOT will be accounted in for searching paths and libraries for CMake >=3.12.
list(APPEND CMAKE_PREFIX_PATH ${NCCL_ROOT})
find_path(NCCL_INCLUDE_DIRS
NAMES nccl.h
HINTS ${NCCL_INCLUDE_DIR})
if (USE_STATIC_NCCL)
MESSAGE(STATUS "USE_STATIC_NCCL is set. Linking with static NCCL library.")
SET(NCCL_LIBNAME "nccl_static")
if (NCCL_VERSION) # Prefer the versioned library if a specific NCCL version is specified
set(CMAKE_FIND_LIBRARY_SUFFIXES ".a.${NCCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES})
endif()
else()
SET(NCCL_LIBNAME "nccl")
if (NCCL_VERSION) # Prefer the versioned library if a specific NCCL version is specified
set(CMAKE_FIND_LIBRARY_SUFFIXES ".so.${NCCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES})
endif()
endif()
find_library(NCCL_LIBRARIES
NAMES ${NCCL_LIBNAME}
HINTS ${NCCL_LIB_DIR})
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(NCCL DEFAULT_MSG NCCL_INCLUDE_DIRS NCCL_LIBRARIES)
if(NCCL_FOUND) # obtaining NCCL version and some sanity checks
set (NCCL_HEADER_FILE "${NCCL_INCLUDE_DIRS}/nccl.h")
message (STATUS "Determining NCCL version from ${NCCL_HEADER_FILE}...")
set (OLD_CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES})
list (APPEND CMAKE_REQUIRED_INCLUDES ${NCCL_INCLUDE_DIRS})
include(CheckCXXSymbolExists)
check_cxx_symbol_exists(NCCL_VERSION_CODE nccl.h NCCL_VERSION_DEFINED)
if (NCCL_VERSION_DEFINED)
set(file "${PROJECT_BINARY_DIR}/detect_nccl_version.cc")
file(WRITE ${file} "
#include <iostream>
#include <nccl.h>
int main()
{
std::cout << NCCL_MAJOR << '.' << NCCL_MINOR << '.' << NCCL_PATCH << std::endl;
int x;
ncclGetVersion(&x);
return x == NCCL_VERSION_CODE;
}
")
try_run(NCCL_VERSION_MATCHED compile_result ${PROJECT_BINARY_DIR} ${file}
RUN_OUTPUT_VARIABLE NCCL_VERSION_FROM_HEADER
CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${NCCL_INCLUDE_DIRS}"
LINK_LIBRARIES ${NCCL_LIBRARIES})
if (NOT NCCL_VERSION_MATCHED)
message(FATAL_ERROR "Found NCCL header version and library version do not match! \
(include: ${NCCL_INCLUDE_DIRS}, library: ${NCCL_LIBRARIES}) Please set NCCL_INCLUDE_DIR and NCCL_LIB_DIR manually.")
endif()
message(STATUS "NCCL version: ${NCCL_VERSION_FROM_HEADER}")
else()
message(STATUS "NCCL version < 2.3.5-5")
endif ()
set (CMAKE_REQUIRED_INCLUDES ${OLD_CMAKE_REQUIRED_INCLUDES})
message(STATUS "Found NCCL (include: ${NCCL_INCLUDE_DIRS}, library: ${NCCL_LIBRARIES})")
mark_as_advanced(NCCL_ROOT_DIR NCCL_INCLUDE_DIRS NCCL_LIBRARIES)
endif()