Revert "Add cuda-11.3+clang9 build workflow"

This reverts commit 709fcc862e. Reverted https://github.com/pytorch/pytorch/pull/75293 on behalf of https://github.com/janeyx99
2025-12-06 12:20:52 +01:00 · 2022-04-11 15:24:59 +00:00 · 2022-04-11 15:24:59 +00:00 · 8fe43d76d5
commit 8fe43d76d5
parent 1a85699c03
11 changed files with 11 additions and 36 deletions
--- a/.circleci/docker/build.sh
+++ b/.circleci/docker/build.sh
@ -134,18 +134,6 @@ case "$image" in
    VISION=yes
    KATEX=yes
    ;;
-  pytorch-linux-bionic-cuda11.3-cudnn8-py3-clang9)
-    CUDA_VERSION=11.3.0 # Deviating from major.minor to conform to nvidia's Docker image names
-    CUDNN_VERSION=8
-    TENSORRT_VERSION=8.0.1.6
-    ANACONDA_PYTHON_VERSION=3.7
-    CMAKE_VERSION=3.10.3
-    CLANG_VERSION=9
-    PROTOBUF=yes
-    DB=yes
-    VISION=yes
-    KATEX=yes
-    ;;
  pytorch-linux-bionic-cuda11.5-cudnn8-py3-gcc7)
    CUDA_VERSION=11.5.0
    CUDNN_VERSION=8
--- a/.github/workflows/docker-builds.yml
+++ b/.github/workflows/docker-builds.yml
@ -25,7 +25,6 @@ jobs:
      matrix:
        include:
          - docker-image-name: pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7
-          - docker-image-name: pytorch-linux-bionic-cuda11.3-cudnn8-py3-clang9
          - docker-image-name: pytorch-linux-bionic-cuda11.5-cudnn8-py3-gcc7
          - docker-image-name: pytorch-linux-bionic-py3.7-clang9
          - docker-image-name: pytorch-linux-bionic-rocm4.5-py3.7
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@ -135,13 +135,6 @@ jobs:
          { config: "noarch", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
        ]}

-  linux-bionic-cuda11_3-py3_7-clang9-build:
-    name: linux-bionic-cuda11.3-py3.7-clang9
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: linux-bionic-cuda11.3-py3.7-clang9
-      docker-image-name: pytorch-linux-bionic-cuda11.3-cudnn8-py3-clang9
-
  linux-vulkan-bionic-py3_7-clang9-build:
    name: linux-vulkan-bionic-py3.7-clang9
    uses: ./.github/workflows/_linux-build.yml
--- a/aten/src/ATen/native/cuda/Distributions.cu
+++ b/aten/src/ATen/native/cuda/Distributions.cu
@ -79,7 +79,7 @@ void binomial_cuda_kernel(
  using accscalar_t = at::acc_type<scalar_t, true>;

  at::native::distribution_binary_kernel(iter, philox_args,
-      [] GPU_LAMBDA (curandStatePhilox4_32_10_t& state, scalar_t count, scalar_t prob) {
+      [philox_args] GPU_LAMBDA (curandStatePhilox4_32_10_t& state, scalar_t count, scalar_t prob) {
        #if defined(__CUDA_ARCH__) || defined(USE_ROCM)
        auto uniform_lambda = curand_uniform_wrapper(state);
        BaseSampler<accscalar_t, decltype(uniform_lambda)> standard_uniform(uniform_lambda);
--- a/benchmarks/cpp/nvfuser/layer_norm.cpp
+++ b/benchmarks/cpp/nvfuser/layer_norm.cpp
@ -90,9 +90,9 @@ static void Baseline_LayerNorm(

  std::vector<int64_t> input_shape{
      benchmark_state.range(0), benchmark_state.range(1)};
-  const size_t kReductionAxis = 1;
+  const int kReductionAxis = 1;
  std::vector<int64_t> norm_shape;
-  for (auto idx = kReductionAxis; idx < input_shape.size(); ++idx) {
+  for (int idx = kReductionAxis; idx < input_shape.size(); ++idx) {
    norm_shape.push_back(input_shape[idx]);
  }

--- a/benchmarks/cpp/nvfuser/layer_norm_backward.cpp
+++ b/benchmarks/cpp/nvfuser/layer_norm_backward.cpp
@ -115,9 +115,9 @@ static void Baseline_LayerNorm_BWD(

  std::vector<int64_t> input_shape{
      benchmark_state.range(0), benchmark_state.range(1)};
-  const size_t kReductionAxis = 1;
+  const int kReductionAxis = 1;
  std::vector<int64_t> norm_shape;
-  for (auto idx = kReductionAxis; idx < input_shape.size(); ++idx) {
+  for (int idx = kReductionAxis; idx < input_shape.size(); ++idx) {
    norm_shape.push_back(input_shape[idx]);
  }

--- a/cmake/public/cuda.cmake
+++ b/cmake/public/cuda.cmake
@ -38,12 +38,6 @@ endif()

 # Enable CUDA language support
 set(CUDAToolkit_ROOT "${CUDA_TOOLKIT_ROOT_DIR}")
-# Pass clang as host compiler, which according to the docs
-# Must be done before CUDA language is enabled, see  mast be done before
-# see https://cmake.org/cmake/help/v3.15/variable/CMAKE_CUDA_HOST_COMPILER.html
-if("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
-  set(CMAKE_CUDA_HOST_COMPILER "${CMAKE_C_COMPILER}")
-endif()
 enable_language(CUDA)
 set(CMAKE_CUDA_STANDARD ${CMAKE_CXX_STANDARD})
 set(CMAKE_CUDA_STANDARD_REQUIRED ON)
--- a/cmake/public/utils.cmake
+++ b/cmake/public/utils.cmake
@ -470,7 +470,8 @@ function(torch_compile_options libname)
      if("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
        list(APPEND private_compile_options
          -Wno-range-loop-analysis)
-      else()
+      endif()
+      if(NOT APPLE)
        list(APPEND private_compile_options
          # Considered to be flaky.  See the discussion at
          # https://github.com/pytorch/pytorch/pull/9608
--- a/test/cpp/c10d/ProcessGroupNCCLTest.cpp
+++ b/test/cpp/c10d/ProcessGroupNCCLTest.cpp
@ -508,6 +508,7 @@ void testReduceScatter(const std::string& path, int rank, int size) {
 void testProcessGroupNCCLHealthCheckFailHelper(const std::string& path, bool timeout) {
  // simulate world_size > 1 here via threads.
  const int worldSize = 4;
+  std::mutex m;
  std::unordered_set<uint64_t> nums;
  auto runTest = [&](int i) {
    NCCLTest test(path, worldSize, std::chrono::milliseconds(3000));
--- a/test/cpp/jit/CMakeLists.txt
+++ b/test/cpp/jit/CMakeLists.txt
@ -143,7 +143,6 @@ if(USE_CUDA)
    ${TORCH_CUDA_LIBRARIES})

  target_compile_definitions(test_jit PRIVATE USE_CUDA)
-  target_compile_options(test_jit PRIVATE -Wno-sign-compare)
 elseif(USE_ROCM)
  target_link_libraries(test_jit PRIVATE
    ${ROCM_HIPRTC_LIB}
--- a/torch/csrc/distributed/c10d/ProcessGroupMPI.cpp
+++ b/torch/csrc/distributed/c10d/ProcessGroupMPI.cpp
@ -697,7 +697,7 @@ c10::intrusive_ptr<ProcessGroup::Work> ProcessGroupMPI::alltoall_base(
        "Tensor's dim 0 does not divide equally across group size");

    std::function<void(std::unique_ptr<WorkEntry>&)> runFunc =
-        [this](std::unique_ptr<WorkEntry>& entry) {
+        [opts, this](std::unique_ptr<WorkEntry>& entry) {
          auto srcdata = (entry->src)[0];
          auto dstdata = (entry->dst)[0];
          c10::DeviceGuard guard(srcdata.device());
@ -724,7 +724,7 @@ c10::intrusive_ptr<ProcessGroup::Work> ProcessGroupMPI::alltoall_base(
    c10d::checkSplitSizes(inputSplitSizes, inputTensor, size_);
    c10d::checkSplitSizes(outputSplitSizes, outputTensor, size_);
    std::function<void(std::unique_ptr<WorkEntry>&)> runFunc =
-        [this, inputSplitSizes, outputSplitSizes](
+        [opts, this, inputSplitSizes, outputSplitSizes](
            std::unique_ptr<WorkEntry>& entry) {
          auto srcdata = (entry->src)[0];
          auto dstdata = (entry->dst)[0];
@ -771,7 +771,7 @@ c10::intrusive_ptr<ProcessGroup::Work> ProcessGroupMPI::alltoall(
      outputTensors.size() == size_,
      "Number of output tensors are not equal to group size");
  std::function<void(std::unique_ptr<WorkEntry>&)> runFunc =
-      [this](std::unique_ptr<WorkEntry>& entry) {
+      [opts, this](std::unique_ptr<WorkEntry>& entry) {
        std::vector<int> send_lengths(size_);
        std::vector<int> recv_lengths(size_);
        std::vector<int> send_offsets(size_);