mirror of
https://github.com/zebrajr/pytorch.git
synced 2025-12-06 12:20:52 +01:00
Revert "Add cuda-11.3+clang9 build workflow"
This reverts commit 709fcc862e.
Reverted https://github.com/pytorch/pytorch/pull/75293 on behalf of https://github.com/janeyx99
This commit is contained in:
parent
1a85699c03
commit
8fe43d76d5
|
|
@ -134,18 +134,6 @@ case "$image" in
|
|||
VISION=yes
|
||||
KATEX=yes
|
||||
;;
|
||||
pytorch-linux-bionic-cuda11.3-cudnn8-py3-clang9)
|
||||
CUDA_VERSION=11.3.0 # Deviating from major.minor to conform to nvidia's Docker image names
|
||||
CUDNN_VERSION=8
|
||||
TENSORRT_VERSION=8.0.1.6
|
||||
ANACONDA_PYTHON_VERSION=3.7
|
||||
CMAKE_VERSION=3.10.3
|
||||
CLANG_VERSION=9
|
||||
PROTOBUF=yes
|
||||
DB=yes
|
||||
VISION=yes
|
||||
KATEX=yes
|
||||
;;
|
||||
pytorch-linux-bionic-cuda11.5-cudnn8-py3-gcc7)
|
||||
CUDA_VERSION=11.5.0
|
||||
CUDNN_VERSION=8
|
||||
|
|
|
|||
1
.github/workflows/docker-builds.yml
vendored
1
.github/workflows/docker-builds.yml
vendored
|
|
@ -25,7 +25,6 @@ jobs:
|
|||
matrix:
|
||||
include:
|
||||
- docker-image-name: pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7
|
||||
- docker-image-name: pytorch-linux-bionic-cuda11.3-cudnn8-py3-clang9
|
||||
- docker-image-name: pytorch-linux-bionic-cuda11.5-cudnn8-py3-gcc7
|
||||
- docker-image-name: pytorch-linux-bionic-py3.7-clang9
|
||||
- docker-image-name: pytorch-linux-bionic-rocm4.5-py3.7
|
||||
|
|
|
|||
7
.github/workflows/pull.yml
vendored
7
.github/workflows/pull.yml
vendored
|
|
@ -135,13 +135,6 @@ jobs:
|
|||
{ config: "noarch", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
|
||||
]}
|
||||
|
||||
linux-bionic-cuda11_3-py3_7-clang9-build:
|
||||
name: linux-bionic-cuda11.3-py3.7-clang9
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
with:
|
||||
build-environment: linux-bionic-cuda11.3-py3.7-clang9
|
||||
docker-image-name: pytorch-linux-bionic-cuda11.3-cudnn8-py3-clang9
|
||||
|
||||
linux-vulkan-bionic-py3_7-clang9-build:
|
||||
name: linux-vulkan-bionic-py3.7-clang9
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
|
|
|
|||
|
|
@ -79,7 +79,7 @@ void binomial_cuda_kernel(
|
|||
using accscalar_t = at::acc_type<scalar_t, true>;
|
||||
|
||||
at::native::distribution_binary_kernel(iter, philox_args,
|
||||
[] GPU_LAMBDA (curandStatePhilox4_32_10_t& state, scalar_t count, scalar_t prob) {
|
||||
[philox_args] GPU_LAMBDA (curandStatePhilox4_32_10_t& state, scalar_t count, scalar_t prob) {
|
||||
#if defined(__CUDA_ARCH__) || defined(USE_ROCM)
|
||||
auto uniform_lambda = curand_uniform_wrapper(state);
|
||||
BaseSampler<accscalar_t, decltype(uniform_lambda)> standard_uniform(uniform_lambda);
|
||||
|
|
|
|||
|
|
@ -90,9 +90,9 @@ static void Baseline_LayerNorm(
|
|||
|
||||
std::vector<int64_t> input_shape{
|
||||
benchmark_state.range(0), benchmark_state.range(1)};
|
||||
const size_t kReductionAxis = 1;
|
||||
const int kReductionAxis = 1;
|
||||
std::vector<int64_t> norm_shape;
|
||||
for (auto idx = kReductionAxis; idx < input_shape.size(); ++idx) {
|
||||
for (int idx = kReductionAxis; idx < input_shape.size(); ++idx) {
|
||||
norm_shape.push_back(input_shape[idx]);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -115,9 +115,9 @@ static void Baseline_LayerNorm_BWD(
|
|||
|
||||
std::vector<int64_t> input_shape{
|
||||
benchmark_state.range(0), benchmark_state.range(1)};
|
||||
const size_t kReductionAxis = 1;
|
||||
const int kReductionAxis = 1;
|
||||
std::vector<int64_t> norm_shape;
|
||||
for (auto idx = kReductionAxis; idx < input_shape.size(); ++idx) {
|
||||
for (int idx = kReductionAxis; idx < input_shape.size(); ++idx) {
|
||||
norm_shape.push_back(input_shape[idx]);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -38,12 +38,6 @@ endif()
|
|||
|
||||
# Enable CUDA language support
|
||||
set(CUDAToolkit_ROOT "${CUDA_TOOLKIT_ROOT_DIR}")
|
||||
# Pass clang as host compiler, which according to the docs
|
||||
# must be done before the CUDA language is enabled;
|
||||
# see https://cmake.org/cmake/help/v3.15/variable/CMAKE_CUDA_HOST_COMPILER.html
|
||||
if("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
|
||||
set(CMAKE_CUDA_HOST_COMPILER "${CMAKE_C_COMPILER}")
|
||||
endif()
|
||||
enable_language(CUDA)
|
||||
set(CMAKE_CUDA_STANDARD ${CMAKE_CXX_STANDARD})
|
||||
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
|
||||
|
|
|
|||
|
|
@ -470,7 +470,8 @@ function(torch_compile_options libname)
|
|||
if("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
|
||||
list(APPEND private_compile_options
|
||||
-Wno-range-loop-analysis)
|
||||
else()
|
||||
endif()
|
||||
if(NOT APPLE)
|
||||
list(APPEND private_compile_options
|
||||
# Considered to be flaky. See the discussion at
|
||||
# https://github.com/pytorch/pytorch/pull/9608
|
||||
|
|
|
|||
|
|
@ -508,6 +508,7 @@ void testReduceScatter(const std::string& path, int rank, int size) {
|
|||
void testProcessGroupNCCLHealthCheckFailHelper(const std::string& path, bool timeout) {
|
||||
// simulate world_size > 1 here via threads.
|
||||
const int worldSize = 4;
|
||||
std::mutex m;
|
||||
std::unordered_set<uint64_t> nums;
|
||||
auto runTest = [&](int i) {
|
||||
NCCLTest test(path, worldSize, std::chrono::milliseconds(3000));
|
||||
|
|
|
|||
|
|
@ -143,7 +143,6 @@ if(USE_CUDA)
|
|||
${TORCH_CUDA_LIBRARIES})
|
||||
|
||||
target_compile_definitions(test_jit PRIVATE USE_CUDA)
|
||||
target_compile_options(test_jit PRIVATE -Wno-sign-compare)
|
||||
elseif(USE_ROCM)
|
||||
target_link_libraries(test_jit PRIVATE
|
||||
${ROCM_HIPRTC_LIB}
|
||||
|
|
|
|||
|
|
@ -697,7 +697,7 @@ c10::intrusive_ptr<ProcessGroup::Work> ProcessGroupMPI::alltoall_base(
|
|||
"Tensor's dim 0 does not divide equally across group size");
|
||||
|
||||
std::function<void(std::unique_ptr<WorkEntry>&)> runFunc =
|
||||
[this](std::unique_ptr<WorkEntry>& entry) {
|
||||
[opts, this](std::unique_ptr<WorkEntry>& entry) {
|
||||
auto srcdata = (entry->src)[0];
|
||||
auto dstdata = (entry->dst)[0];
|
||||
c10::DeviceGuard guard(srcdata.device());
|
||||
|
|
@ -724,7 +724,7 @@ c10::intrusive_ptr<ProcessGroup::Work> ProcessGroupMPI::alltoall_base(
|
|||
c10d::checkSplitSizes(inputSplitSizes, inputTensor, size_);
|
||||
c10d::checkSplitSizes(outputSplitSizes, outputTensor, size_);
|
||||
std::function<void(std::unique_ptr<WorkEntry>&)> runFunc =
|
||||
[this, inputSplitSizes, outputSplitSizes](
|
||||
[opts, this, inputSplitSizes, outputSplitSizes](
|
||||
std::unique_ptr<WorkEntry>& entry) {
|
||||
auto srcdata = (entry->src)[0];
|
||||
auto dstdata = (entry->dst)[0];
|
||||
|
|
@ -771,7 +771,7 @@ c10::intrusive_ptr<ProcessGroup::Work> ProcessGroupMPI::alltoall(
|
|||
outputTensors.size() == size_,
|
||||
"Number of output tensors are not equal to group size");
|
||||
std::function<void(std::unique_ptr<WorkEntry>&)> runFunc =
|
||||
[this](std::unique_ptr<WorkEntry>& entry) {
|
||||
[opts, this](std::unique_ptr<WorkEntry>& entry) {
|
||||
std::vector<int> send_lengths(size_);
|
||||
std::vector<int> recv_lengths(size_);
|
||||
std::vector<int> send_offsets(size_);
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user