pytorch/cmake/Modules/FindAPL.cmake
Stefan-Alin Pahontu 0674ab7e33 solve apl dependency issue (#145215)
According to the [APL documentation](https://developer.arm.com/documentation/101004/2404/General-information/Arm-Performance-Libraries-example-programs), libraries ending with _mp are OpenMP multi-threaded libraries.

When a project is compiled with MSVC and the -openmp flag, the vcomp library (Visual C++ implementation of OpenMP) is used for runtime calls.

However, the current APL implementation uses the libomp.dll (LLVM) variant.

As a result, there are unexpected behaviors at runtime.

---

For Example:

```python
import torch

# Create a sparse tensor
# Input (Sparse Tensor):
# [[0, 1],
#  [1, 0]]
indices = torch.tensor([[0, 1], [1, 0]])
values = torch.tensor([1, 1], dtype=torch.float32)
size = torch.Size([2, 2])

sparse_tensor = torch.sparse_coo_tensor(indices, values, size)

# Convert sparse tensor to dense tensor
dense_tensor = sparse_tensor.to_dense()

# Expected Output (Dense Tensor):
# [[0, 1],
#  [1, 0]]
print("\nDense Tensor:")
print(dense_tensor)
```

However, it prints unexpected outputs such as:

```python
# [[0, 11],
#  [10, 0]]
```

The issue arises because the following code does not function as expected at runtime:

https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/ParallelOpenMP.h#L30

```c++
// returns 1 , however since OpenMP is enabled it should return total number of threads
int64_t num_threads = omp_get_num_threads();
```

---

At runtime, loading multiple OpenMP libraries (in this case `libomp` and `vcomp`) into the same process causes unexpected behavior.

So we switched from the `_mp` libraries to the non-`_mp` versions, and we use `vcomp` for OpenMP calls.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/145215
Approved by: https://github.com/ozanMSFT, https://github.com/malfet

Co-authored-by: Ozan Aydin <148207261+ozanMSFT@users.noreply.github.com>
2025-01-27 13:02:16 +00:00

74 lines
2.4 KiB
CMake

# - Find APL (Arm Performance Libraries)
#
# This module sets the following variables:
# APL_INCLUDE_SEARCH_PATHS - list of paths to search for APL include files
# APL_LIB_SEARCH_PATHS - list of paths to search for APL libraries
# APL_FOUND - set to true if APL is found
# APL_INCLUDE_DIR - path to include dir.
# APL_LIB_DIR - path to lib dir.
# APL_LIBRARIES - list of libraries for base APL
# Candidate directories inside the Arm Performance Libraries installation.
# ARMPL_DIR is read from the environment at configure time.
set(APL_INCLUDE_SEARCH_PATHS "$ENV{ARMPL_DIR}/include")
set(APL_LIB_SEARCH_PATHS "$ENV{ARMPL_DIR}/lib")
set(APL_BIN_SEARCH_PATHS "$ENV{ARMPL_DIR}/bin")

# Start optimistic; every failed check below switches APL_FOUND to OFF.
set(APL_FOUND ON)

# Check include file
find_path(APL_INCLUDE_DIR NAMES armpl.h PATHS ${APL_INCLUDE_SEARCH_PATHS})
if(NOT APL_INCLUDE_DIR)
  set(APL_FOUND OFF)
  message(STATUS "Could not verify APL include directory. Turning APL_FOUND off")
endif()

# Check lib file (import library on Windows, static archive elsewhere)
find_path(APL_LIB_DIR NAMES armpl_lp64.dll.lib libarmpl_lp64.a PATHS ${APL_LIB_SEARCH_PATHS})
if(NOT APL_LIB_DIR)
  set(APL_FOUND OFF)
  message(STATUS "Could not verify APL lib directory. Turning APL_FOUND off")
endif()

# Check bin file
# APL_BIN_DIR is only consumed by the Windows branch of this module, which
# copies armpl_lp64.dll from it. The static archive is expected under lib,
# not bin, so on other platforms a missing bin directory must not veto APL
# (otherwise the UNIX branch could never be reached).
find_path(APL_BIN_DIR NAMES armpl_lp64.dll libarmpl_lp64.a PATHS ${APL_BIN_SEARCH_PATHS})
if(WIN32 AND NOT APL_BIN_DIR)
  set(APL_FOUND OFF)
  message(STATUS "Could not verify APL bin directory. Turning APL_FOUND off")
endif()
# When all directory checks passed: pick the platform library, register the
# DLL copy rule on Windows, and run a small program against cblas_sdot,
# recording the outcome in BLAS_USE_CBLAS_DOT.
if(APL_FOUND)
  if(WIN32)
    set(APL_LIBRARIES
      "${APL_LIB_DIR}/armpl_lp64.dll.lib"
    )
    set(APL_DLLS
      "${CMAKE_INSTALL_PREFIX}/lib/armpl_lp64.dll"
    )
    # Guard so that including this module more than once does not fail with
    # a duplicate-target error.
    if(NOT TARGET copy_apl_dlls)
      add_custom_command(
        OUTPUT "${APL_DLLS}"
        COMMAND ${CMAKE_COMMAND} -E make_directory "${CMAKE_INSTALL_PREFIX}/lib"
        COMMAND ${CMAKE_COMMAND} -E copy_if_different "${APL_BIN_DIR}/armpl_lp64.dll" "${CMAKE_INSTALL_PREFIX}/lib/armpl_lp64.dll"
        # Re-run the copy when the source DLL changes (e.g. APL upgrade).
        DEPENDS "${APL_BIN_DIR}/armpl_lp64.dll"
        VERBATIM
      )
      add_custom_target(copy_apl_dlls ALL DEPENDS "${APL_DLLS}")
    endif()
  elseif(UNIX)
    set(APL_LIBRARIES
      "${APL_LIB_DIR}/libarmpl_lp64.a"
    )
  endif()

  message(STATUS "Found APL header: ${APL_INCLUDE_DIR}")
  message(STATUS "Found APL library: ${APL_LIB_DIR}")
  message(STATUS "APL_LIBRARIES: ${APL_LIBRARIES}")

  include(CheckCSourceRuns)
  include(CMakePushCheckState)

  # Save and restore the CMAKE_REQUIRED_* state so the APL libraries do not
  # leak into unrelated check_* calls made later by the including project.
  cmake_push_check_state()
  set(CMAKE_REQUIRED_LIBRARIES ${APL_LIBRARIES})
  check_c_source_runs("
#include <stdlib.h>
#include <stdio.h>
float x[4] = { 1, 2, 3, 4 };
float y[4] = { .1, .01, .001, .0001 };
extern float cblas_sdot();
int main() {
int i;
double r = cblas_sdot(4, x, 1, y, 1);
exit((float)r != (float).1234);
}" BLAS_USE_CBLAS_DOT )
  cmake_pop_check_state()

  message(STATUS "BLAS_USE_CBLAS_DOT: ${BLAS_USE_CBLAS_DOT}")
endif()