Merge pull request #48507 from ROCmSoftwarePlatform/google_upstream_r25_port_pr_48187

[r2.5 port][ROCm] Port PR 48187 to r2.5
2025-12-06 12:20:11 +01:00 · 2021-04-22 15:26:58 -07:00 · 2021-04-22 15:26:58 -07:00 · 8e0516e743
commit 8e0516e743
parent 95d88654e0 92bbb09e77
24 changed files with 218 additions and 98 deletions
--- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc
+++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc
@ -810,9 +810,9 @@ Status AMDGPUTargetModuleLinker(llvm::Module* module, GpuVersion gpu_version,
 // upstream commit), the following mapping will need to change
 std::string MapGCNArchNameTokenToFeatureStr(const std::string& token) {
  if (token == "sramecc+") {
-    return "+sram-ecc";
+    return "+sramecc";
  } else if (token == "sramecc-") {
-    return "-sram-ecc";
+    return "-sramecc";
  } else if (token == "xnack+") {
    return "+xnack";
  } else if (token == "xnack-") {
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@ -1196,6 +1196,7 @@ xla_test(
    ],
    shard_count = 50,
    tags = [
+        "no_rocm",
        "optonly",
    ],
    deps = CONVOLUTION_TEST_DEPS + [
@ -1261,6 +1262,7 @@ xla_test(
    backend_args = {"gpu": ["--xla_backend_extra_options=xla_gpu_experimental_conv_disable_layout_heuristic"]},
    backends = ["gpu"],
    shard_count = 25,
+    tags = ["no_rocm"],
    deps = CONVOLUTION_TEST_DEPS + [
        "@com_google_absl//absl/memory",
        "@com_google_absl//absl/strings",
--- a/tensorflow/python/distribute/BUILD
+++ b/tensorflow/python/distribute/BUILD
@ -1092,7 +1092,6 @@ cuda_py_test(
    tags = [
        "multi_and_single_gpu",
        "no_cuda_asan",  # times out
-        "no_rocm",
        "notsan",  # b/173031470
    ],
    deps = [
@ -1232,7 +1231,6 @@ distribute_py_test(
        "multi_and_single_gpu",
        "no_cuda_asan",  # times out
        "no_oss",  # b/178656226
-        "no_rocm",
        "noasan",  # b/175816710
        "notsan",  # b/168645872
    ],
@ -1290,7 +1288,6 @@ distribute_py_test(
    main = "distribute_utils_test.py",
    tags = [
        "multi_and_single_gpu",
-        "no_rocm",
    ],
    deps = [
        ":combinations",
@ -1318,7 +1315,6 @@ distribute_py_test(
    tags = [
        "multi_and_single_gpu",
        "no_cuda_asan",  # times out
-        "no_rocm",
    ],
    tpu_tags = [
        "no_oss",  # b/150954621 Target too big to run serially reliably.
@ -1769,7 +1765,6 @@ distribute_py_test(
    shard_count = 2,
    tags = [
        "multi_and_single_gpu",
-        "no_rocm",
        "notsan",  # TODO(b/160006974)
    ],
    xla_enable_strict_auto_jit = True,
@ -1802,7 +1797,6 @@ distribute_py_test(
    tags = [
        "multi_and_single_gpu",
        "no_cuda_asan",  # times out
-        "no_rocm",
        "notsan",  # TODO(b/160006974)
    ],
    xla_enable_strict_auto_jit = True,
@ -1876,7 +1870,6 @@ distribute_py_test(
    disable_mlir_bridge = False,
    tags = [
        "multi_and_single_gpu",
-        "no_rocm",
    ],
    deps = [
        ":combinations",
--- a/tensorflow/python/keras/distribute/BUILD
+++ b/tensorflow/python/keras/distribute/BUILD
@ -248,7 +248,6 @@ distribute_py_test(
    main = "custom_training_loop_metrics_test.py",
    tags = [
        "multi_and_single_gpu",
-        "no_rocm",
    ],
    deps = [
        ":strategy_combinations",
@ -270,7 +269,6 @@ distribute_py_test(
    tags = [
        "multi_and_single_gpu",
        "no_cuda_asan",  # times out
-        "no_rocm",
        "notsan",  # TODO(b/170954243)
    ],
    tpu_tags = [
@ -536,7 +534,7 @@ distribute_py_test(

 distribute_py_test(
    name = "keras_rnn_model_correctness_test",
-    size = "medium",
+    size = "large",
    srcs = ["keras_rnn_model_correctness_test.py"],
    full_precision = True,
    main = "keras_rnn_model_correctness_test.py",
@ -545,7 +543,7 @@ distribute_py_test(
    shard_count = 31,
    tags = [
        "multi_and_single_gpu",
-        "no_rocm",
+        "no_rocm",  # Would require size large, but that effectively disables the test for presubmits.
        "no_windows_gpu",
        "noasan",  # TODO(b/337374867) fails with -fsanitize=null
        "notpu",  # TODO(b/153672562)
@ -605,7 +603,6 @@ distribute_py_test(
    tags = [
        "multi_and_single_gpu",
        "no_cuda_asan",  # times out
-        "no_rocm",
        "no_windows_gpu",
        "notsan",
    ],
@ -825,6 +822,7 @@ distribute_py_test(
    tags = [
        "multi_and_single_gpu",
        "no_cuda_asan",  # times out
+        "no_rocm",
    ],
    xla_tags = [
        "no_cuda_asan",  # times out
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@ -1735,7 +1735,6 @@ cuda_py_test(
    name = "betainc_op_test",
    size = "small",
    srcs = ["betainc_op_test.py"],
-    tags = ["no_rocm"],  # ROCm 3.9 regression
    xla_tags = [
        "no_cuda_asan",  # times out
    ],
@ -3238,6 +3237,7 @@ cuda_py_test(
    srcs = ["extract_image_patches_grad_test.py"],
    shard_count = 15,
    tags = [
+        "no_rocm",
        "nomac",  # b/181799478
        "notap",  # b/31080670
    ],
@ -3600,6 +3600,7 @@ cuda_py_test(
    size = "medium",
    srcs = ["tensordot_op_test.py"],
    shard_count = 20,
+    tags = ["no_rocm"],
    xla_enable_strict_auto_jit = False,  # b/161856380
    deps = [
        "//tensorflow/python:array_ops",
--- a/tensorflow/python/kernel_tests/distributions/BUILD
+++ b/tensorflow/python/kernel_tests/distributions/BUILD
@ -61,7 +61,6 @@ cuda_py_test(
    size = "small",
    srcs = ["beta_test.py"],
    tags = [
-        "no_rocm",  # ROCm 3.9 regression
        "notsan",  # b/173653918
    ],
    xla_tags = [
--- a/tensorflow/python/kernel_tests/linalg/sparse/BUILD
+++ b/tensorflow/python/kernel_tests/linalg/sparse/BUILD
@ -41,7 +41,6 @@ cuda_py_test(
    main = "csr_sparse_matrix_ops_test.py",
    shard_count = 10,
    tags = [
-        "no_rocm",  # ROCm 3.8 regression
        "notsan",  # b/149115441
    ],
    deps = [
--- a/tensorflow/python/ops/math_ops_test.py
+++ b/tensorflow/python/ops/math_ops_test.py
@ -33,7 +33,6 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest
-from tensorflow.python.platform import test


@test_util.run_all_in_graph_and_eager_modes
@ -59,11 +58,11 @@ class ReduceTest(test_util.TensorFlowTestCase):
    x = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int32)
    with test_util.device(use_gpu=True):
      for axis in (0, -2):
-        self.assertAllEqual(self.evaluate(math_ops.reduce_sum(x, axis=axis)),
-                            [5, 7, 9])
+        self.assertAllEqual(
+            self.evaluate(math_ops.reduce_sum(x, axis=axis)), [5, 7, 9])
      for axis in (1, -1):
-        self.assertAllEqual(self.evaluate(math_ops.reduce_sum(x, axis=axis)),
-                            [6, 15])
+        self.assertAllEqual(
+            self.evaluate(math_ops.reduce_sum(x, axis=axis)), [6, 15])
      for axis in (None, (0, 1), (1, 0), (-1, 0), (0, -1), (-2, 1), (1, -2),
                   (-1, -2), (-2, -1)):
        self.assertEqual(self.evaluate(math_ops.reduce_sum(x, axis=axis)), 21)
@ -358,8 +357,8 @@ class ScalarMulTest(test_util.TensorFlowTestCase):
    indices = constant_op.constant([0, 2, 5])
    x = math_ops.scalar_mul(-3, ops.IndexedSlices(values, indices))
    with test_util.device(use_gpu=True):
-      self.assertAllEqual(self.evaluate(x.values),
-                          [[-6, -9], [-15, -21], [0, 3]])
+      self.assertAllEqual(
+          self.evaluate(x.values), [[-6, -9], [-15, -21], [0, 3]])
      self.assertAllEqual(self.evaluate(x.indices), [0, 2, 5])


@ -436,9 +435,11 @@ class AddNTest(test_util.TensorFlowTestCase):

  def test_iterable(self):
    """Test that add_n supports iterables (e.g. generators and dict values)."""
+
    def fn():
      yield 1
      yield 2
+
    values_dict = {"a": 1, "b": 2}
    with test_util.use_gpu():
      self.assertAllEqual(3, math_ops.add_n(fn()))
@ -483,8 +484,9 @@ class DivAndModTest(test_util.TensorFlowTestCase):

  def testFloorModBfloat16(self):
    nums, divs = self.floatTestData()
-    tf_result = math_ops.floormod(math_ops.cast(nums, dtypes.bfloat16),
-                                  math_ops.cast(divs, dtypes.bfloat16))
+    tf_result = math_ops.floormod(
+        math_ops.cast(nums, dtypes.bfloat16),
+        math_ops.cast(divs, dtypes.bfloat16))
    np_result = nums % divs
    self.assertAllEqual(tf_result, np_result)

@ -742,10 +744,8 @@ class NextAfterTest(test_util.TensorFlowTestCase):

      self.assertAllEqual(math_ops.nextafter(one, two) - one, eps)
      self.assertAllLess(math_ops.nextafter(one, zero) - one, 0)
-      self.assertAllEqual(
-          math_ops.is_nan(math_ops.nextafter(nan, one)), [True])
-      self.assertAllEqual(
-          math_ops.is_nan(math_ops.nextafter(one, nan)), [True])
+      self.assertAllEqual(math_ops.is_nan(math_ops.nextafter(nan, one)), [True])
+      self.assertAllEqual(math_ops.is_nan(math_ops.nextafter(one, nan)), [True])
      self.assertAllEqual(math_ops.nextafter(one, one), one)

  def testBroadcasting(self):
@ -786,13 +786,13 @@ class BinaryOpsTest(test_util.TensorFlowTestCase):
          r"Attempt to convert a value .* with an unsupported type")
    else:
      error = TypeError
-      error_message = (
-          r"Failed to convert object of type .* to Tensor")
+      error_message = (r"Failed to convert object of type .* to Tensor")

    class RHSReturnsTrue(object):

      def __radd__(self, other):
        return True
+
    a = array_ops.ones([1], dtype=dtypes.int32) + RHSReturnsTrue()
    self.assertEqual(a, True)

@ -889,12 +889,6 @@ class RangeTest(test_util.TensorFlowTestCase):
 class ErfcinvTest(test_util.TensorFlowTestCase):

  def testErfcinv(self):
-    if test.is_built_with_rocm():
-      # The implementation of erfcinv calls ndtri op,
-      # and the ROCm implementaion for ndtri op has a known bug in it
-      # whose fix will be in a forthcoming ROCm release (4.0 ?).
-      # Need to skip this unit-test until that ROCm release is out
-      self.skipTest("ndtri op implementation is buggy on ROCm")
    values = np.random.uniform(0.1, 1.9, size=int(1e4)).astype(np.float32)
    approx_id = math_ops.erfc(math_ops.erfcinv(values))
    self.assertAllClose(values, self.evaluate(approx_id))
--- a/tensorflow/python/ops/parallel_for/math_test.py
+++ b/tensorflow/python/ops/parallel_for/math_test.py
@ -81,14 +81,7 @@ class MathTest(PForTestCase, parameterized.TestCase):
    ]
    self._test_unary_cwise_ops(complex_ops, True)

-  @test.disable_with_predicate(
-      pred=test.is_built_with_rocm, skip_message="This fails on ROCm.")
  def test_unary_cwise_real_ops_1(self):
-    if test.is_built_with_rocm():
-      # TODO(rocm):
-      # This fails on ROCm...see JIRA ticket 236756
-      self.skipTest("Fails on ROCM")
-
    real_ops = [
        lambda x: math_ops.acosh(1 + math_ops.square(x)),
        math_ops.abs,
@ -691,15 +684,15 @@ class LinalgTest(PForTestCase):
      self._test_loop_fn(loop_fn, 3)

  def test_matrix_inverse(self):
-    x = (random_ops.random_uniform([3, 4, 2, 2]) +
-         10 * linalg_ops.eye(2))  # Ensure well-conditioned.
+    x = (random_ops.random_uniform([3, 4, 2, 2]) + 10 * linalg_ops.eye(2)
+        )  # Ensure well-conditioned.

    for adjoint in (True, False):

      # pylint: disable=cell-var-from-loop
      def loop_fn(i):
-        return linalg_ops.matrix_inverse(array_ops.gather(x, i),
-                                         adjoint=adjoint)
+        return linalg_ops.matrix_inverse(
+            array_ops.gather(x, i), adjoint=adjoint)

      # pylint: enable=cell-var-from-loop
      self._test_loop_fn(loop_fn, 2)
@ -710,8 +703,8 @@ class LinalgTest(PForTestCase):
        for stack_b in (True, False):
          shape_a = (2, 4, 3, 3) if stack_a else (4, 3, 3)
          shape_b = (2, 4, 3, 5) if stack_b else (4, 3, 5)
-          x = (random_ops.random_uniform(shape_a) +
-               10 * linalg_ops.eye(3))  # Ensure well-conditioned.
+          x = (random_ops.random_uniform(shape_a) + 10 * linalg_ops.eye(3)
+              )  # Ensure well-conditioned.
          y = random_ops.random_uniform(shape_b)

          # pylint: disable=cell-var-from-loop
--- a/tensorflow/python/ops/ragged/ragged_dispatch_test.py
+++ b/tensorflow/python/ops/ragged/ragged_dispatch_test.py
@ -139,11 +139,6 @@ class RaggedDispatchTest(test_util.TensorFlowTestCase, parameterized.TestCase):
      ]
      )  # pyformat: disable
  def testUnaryElementwiseOp(self, x, op=math_ops.abs, **extra_args):
-    if test_util.IsBuiltWithROCm():
-      # TODO(rocm):
-      # This fails on ROCm...see JIRA ticket 236756
-      self.skipTest('Fails on ROCM')
-
    result = op(x, **extra_args)

    # Run the wrapped op on the dense values, for comparison.
@ -319,7 +314,9 @@ class RaggedDispatchTest(test_util.TensorFlowTestCase, parameterized.TestCase):
               ragged_factory_ops.constant_value([['foo', 'bar'], ['baz']]),
               ragged_factory_ops.constant_value([['2', '9'], ['12']]))},
      ])  # pyformat: disable
-  def testListValuedElementwiseOp(self, inputs, op=math_ops.add_n,
+  def testListValuedElementwiseOp(self,
+                                  inputs,
+                                  op=math_ops.add_n,
                                  **extra_args):
    use_kwargs = extra_args.pop('use_kwargs', False)
    if use_kwargs:
@ -676,13 +673,20 @@ class RaggedDispatchTest(test_util.TensorFlowTestCase, parameterized.TestCase):
          expected=ragged_factory_ops.constant_value([[5, 4], [3, 2, 1]])),
      dict(
          op=string_ops.string_format,
-          kwargs={'template': 'Hi {}',
-                  'inputs': [ragged_factory_ops.constant_value([[1, 2], [3]])]},
+          kwargs={
+              'template': 'Hi {}',
+              'inputs': [ragged_factory_ops.constant_value([[1, 2], [3]])]
+          },
          expected='Hi [[1, 2], [3]]'),
  ])
-  def testRaggedDispatch(self, op, expected, args=(), result_is_list=False,
+  def testRaggedDispatch(self,
+                         op,
+                         expected,
+                         args=(),
+                         result_is_list=False,
                         kwargs=None):
-    if kwargs is None: kwargs = {}
+    if kwargs is None:
+      kwargs = {}
    result = op(*args, **kwargs)
    if result_is_list:
      self.assertLen(result, len(expected))
@ -694,15 +698,13 @@ class RaggedDispatchTest(test_util.TensorFlowTestCase, parameterized.TestCase):
  def testUnaryElementwiseOpsPreserveUniformRowLength(self):
    # Unary elementwise op
    rt = ragged_tensor.RaggedTensor.from_uniform_row_length(
-        ragged_factory_ops.constant([[1, 2], [3]]),
-        uniform_row_length=2)
+        ragged_factory_ops.constant([[1, 2], [3]]), uniform_row_length=2)
    self.assertAllEqual(rt.uniform_row_length,
                        array_ops.zeros_like(rt).uniform_row_length)

    # Unary-list elementwise op
    rt = ragged_tensor.RaggedTensor.from_uniform_row_length(
-        ragged_factory_ops.constant([[1, 2], [3]]),
-        uniform_row_length=2)
+        ragged_factory_ops.constant([[1, 2], [3]]), uniform_row_length=2)
    self.assertAllEqual(rt.uniform_row_length,
                        math_ops.add_n([rt, rt]).uniform_row_length)

--- a/tensorflow/stream_executor/platform/default/BUILD
+++ b/tensorflow/stream_executor/platform/default/BUILD
@ -31,6 +31,7 @@ cc_library(
        "//tensorflow/stream_executor/platform",
        "@com_google_absl//absl/strings",
        "@local_config_cuda//cuda:cuda_headers",
+        "@local_config_rocm//rocm:rocm_headers",
        "@local_config_tensorrt//:tensorrt_headers",
    ],
 )
--- a/tensorflow/stream_executor/platform/default/dlopen_checker.cc
+++ b/tensorflow/stream_executor/platform/default/dlopen_checker.cc
@ -45,7 +45,7 @@ port::Status TryDlopenCUDALibraries() {
 port::Status TryDlopenROCmLibraries() {
  auto rocblas_status = GetRocblasDsoHandle();
  auto miopen_status = GetMiopenDsoHandle();
-  auto rocfft_status = GetRocfftDsoHandle();
+  auto rocfft_status = GetHipfftDsoHandle();
  auto rocrand_status = GetRocrandDsoHandle();
  if (!rocblas_status.status().ok() || !miopen_status.status().ok() ||
      !rocfft_status.status().ok() || !rocrand_status.status().ok()) {
--- a/tensorflow/stream_executor/platform/default/dso_loader.cc
+++ b/tensorflow/stream_executor/platform/default/dso_loader.cc
@ -26,6 +26,10 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform/port.h"
 #include "third_party/tensorrt/tensorrt_config.h"

+#if TENSORFLOW_USE_ROCM
+#include "rocm/rocm_config.h"
+#endif
+
 namespace stream_executor {
 namespace internal {

@ -133,8 +137,12 @@ port::StatusOr<void*> GetMiopenDsoHandle() {
  return GetDsoHandle("MIOpen", "");
 }

-port::StatusOr<void*> GetRocfftDsoHandle() {
+port::StatusOr<void*> GetHipfftDsoHandle() {
+#if TF_ROCM_VERSION < 40100
  return GetDsoHandle("rocfft", "");
+#else
+  return GetDsoHandle("hipfft", "");
+#endif
 }

 port::StatusOr<void*> GetRocrandDsoHandle() {
@ -214,8 +222,8 @@ port::StatusOr<void*> GetMiopenDsoHandle() {
  return *result;
 }

-port::StatusOr<void*> GetRocfftDsoHandle() {
-  static auto result = new auto(DsoLoader::GetRocfftDsoHandle());
+port::StatusOr<void*> GetHipfftDsoHandle() {
+  static auto result = new auto(DsoLoader::GetHipfftDsoHandle());
  return *result;
 }

--- a/tensorflow/stream_executor/platform/default/dso_loader.h
+++ b/tensorflow/stream_executor/platform/default/dso_loader.h
@ -49,7 +49,7 @@ port::StatusOr<void*> GetNvInferPluginDsoHandle();

 port::StatusOr<void*> GetRocblasDsoHandle();
 port::StatusOr<void*> GetMiopenDsoHandle();
-port::StatusOr<void*> GetRocfftDsoHandle();
+port::StatusOr<void*> GetHipfftDsoHandle();
 port::StatusOr<void*> GetRocrandDsoHandle();
 port::StatusOr<void*> GetRoctracerDsoHandle();
 port::StatusOr<void*> GetHipsparseDsoHandle();
@ -84,7 +84,7 @@ port::StatusOr<void*> GetCudnnDsoHandle();

 port::StatusOr<void*> GetRocblasDsoHandle();
 port::StatusOr<void*> GetMiopenDsoHandle();
-port::StatusOr<void*> GetRocfftDsoHandle();
+port::StatusOr<void*> GetHipfftDsoHandle();
 port::StatusOr<void*> GetRocrandDsoHandle();
 port::StatusOr<void*> GetRoctracerDsoHandle();
 port::StatusOr<void*> GetHipsparseDsoHandle();
--- a/tensorflow/stream_executor/rocm/BUILD
+++ b/tensorflow/stream_executor/rocm/BUILD
@ -197,19 +197,19 @@ cc_library(
 )

 cc_library(
-    name = "rocfft_if_static",
+    name = "hipfft_if_static",
    deps = if_static([
-        "@local_config_rocm//rocm:rocfft",
+        "@local_config_rocm//rocm:hipfft",
    ]),
 )

 cc_library(
-    name = "rocfft_plugin",
+    name = "hipfft_plugin",
    srcs = if_rocm_is_configured(["rocm_fft.cc"]),
    hdrs = if_rocm_is_configured(["rocm_fft.h"]),
    visibility = ["//visibility:public"],
    deps = if_rocm_is_configured([
-        ":rocfft_if_static",
+        ":hipfft_if_static",
        ":rocm_platform_id",
        "//tensorflow/stream_executor:event",
        "//tensorflow/stream_executor:fft",
@ -356,7 +356,7 @@ cc_library(
    visibility = ["//visibility:public"],
    deps = if_rocm_is_configured([
        ":miopen_plugin",
-        ":rocfft_plugin",
+        ":hipfft_plugin",
        ":rocblas_plugin",
        ":rocrand_plugin",
        ":rocm_driver",
--- a/tensorflow/stream_executor/rocm/rocm_fft.cc
+++ b/tensorflow/stream_executor/rocm/rocm_fft.cc
@ -61,7 +61,7 @@ namespace wrap {
    static const char *kName;                                             \
    using FuncPtrT = std::add_pointer<decltype(::__name)>::type;          \
    static void *GetDsoHandle() {                                         \
-      auto s = internal::CachedDsoLoader::GetRocfftDsoHandle();           \
+      auto s = internal::CachedDsoLoader::GetHipfftDsoHandle();           \
      return s.ValueOrDie();                                              \
    }                                                                     \
    static FuncPtrT LoadOrDie() {                                         \
--- a/tensorflow/stream_executor/rocm/rocm_fft.h
+++ b/tensorflow/stream_executor/rocm/rocm_fft.h
@ -20,7 +20,18 @@ limitations under the License.
 #ifndef TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_FFT_H_
 #define TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_FFT_H_

+#if TENSORFLOW_USE_ROCM
+
+#include "rocm/rocm_config.h"
+
+#if TF_ROCM_VERSION < 40100
 #include "rocm/include/rocfft/hipfft.h"
+#else
+#include "rocm/include/hipfft/hipfft.h"
+#endif
+
+#endif
+
 #include "tensorflow/stream_executor/fft.h"
 #include "tensorflow/stream_executor/platform/port.h"
 #include "tensorflow/stream_executor/plugin_registry.h"
--- a/tensorflow/tools/ci_build/Dockerfile.rocm
+++ b/tensorflow/tools/ci_build/Dockerfile.rocm
@ -3,10 +3,10 @@
 FROM ubuntu:bionic
 MAINTAINER Jeff Poznanovic <jeffrey.poznanovic@amd.com>

-ARG ROCM_DEB_REPO=http://repo.radeon.com/rocm/apt/4.0.1/
+ARG ROCM_DEB_REPO=http://repo.radeon.com/rocm/apt/4.1/
 ARG ROCM_BUILD_NAME=xenial
 ARG ROCM_BUILD_NUM=main
-ARG ROCM_PATH=/opt/rocm-4.0.1
+ARG ROCM_PATH=/opt/rocm-4.1.0

 ENV DEBIAN_FRONTEND noninteractive
 ENV TF_NEED_ROCM 1
--- a/tensorflow/tools/ci_build/linux/rocm/run_gpu_multi.sh
+++ b/tensorflow/tools/ci_build/linux/rocm/run_gpu_multi.sh
@ -18,13 +18,14 @@ set -e
 set -x

 N_BUILD_JOBS=$(grep -c ^processor /proc/cpuinfo)
+N_TEST_JOBS=1 # run tests serially

 echo ""
 echo "Bazel will use ${N_BUILD_JOBS} concurrent build job(s) and ${N_TEST_JOBS} concurrent test job(s)."
 echo ""

 # First positional argument (if any) specifies the ROCM_INSTALL_DIR
-ROCM_INSTALL_DIR=/opt/rocm-4.0.1
+ROCM_INSTALL_DIR=/opt/rocm-4.1.0
 if [[ -n $1 ]]; then
    ROCM_INSTALL_DIR=$1
 fi
@ -41,13 +42,100 @@ yes "" | $PYTHON_BIN_PATH configure.py
 bazel test \
      --config=rocm \
      -k \
-      --test_tag_filters=multi_gpu \
+      --test_tag_filters=-no_gpu,-no_rocm \
      --jobs=${N_BUILD_JOBS} \
-      --local_test_jobs=1 \
+      --local_test_jobs=${N_TEST_JOBS} \
      --test_timeout 600,900,2400,7200 \
      --build_tests_only \
      --test_output=errors \
      --test_sharding_strategy=disabled \
      --test_size_filters=small,medium,large \
+      --cache_test_results=no \
+      --test_env=TF_PER_DEVICE_MEMORY_LIMIT_MB=2048 \
      -- \
-      //tensorflow/core/nccl:nccl_manager_test
+//tensorflow/core/common_runtime/gpu:gpu_device_unified_memory_test_2gpu \
+//tensorflow/core/kernels:collective_nccl_test_2gpu \
+//tensorflow/core/nccl:nccl_manager_test_2gpu \
+//tensorflow/python/distribute/integration_test:mwms_peer_failure_test_2gpu \
+//tensorflow/python/distribute:checkpoint_utils_test_2gpu \
+//tensorflow/python/distribute:checkpointing_test_2gpu \
+//tensorflow/python/distribute:collective_all_reduce_strategy_test_xla_2gpu \
+//tensorflow/python/distribute:custom_training_loop_gradient_test_2gpu \
+//tensorflow/python/distribute:custom_training_loop_input_test_2gpu \
+//tensorflow/python/distribute:distribute_utils_test_2gpu \
+//tensorflow/python/distribute:input_lib_test_2gpu \
+//tensorflow/python/distribute:input_lib_type_spec_test_2gpu \
+//tensorflow/python/distribute:metrics_v1_test_2gpu \
+//tensorflow/python/distribute:mirrored_variable_test_2gpu \
+//tensorflow/python/distribute:parameter_server_strategy_test_2gpu \
+//tensorflow/python/distribute:ps_values_test_2gpu \
+//tensorflow/python/distribute:random_generator_test_2gpu \
+//tensorflow/python/distribute:test_util_test_2gpu \
+//tensorflow/python/distribute:tf_function_test_2gpu \
+//tensorflow/python/distribute:vars_test_2gpu \
+//tensorflow/python/distribute:warm_starting_util_test_2gpu \
+//tensorflow/python/keras/distribute:collective_all_reduce_strategy_test_2gpu \
+//tensorflow/python/keras/distribute:collective_all_reduce_strategy_test_xla_2gpu \
+//tensorflow/python/keras/distribute:ctl_correctness_test_2gpu \
+//tensorflow/python/keras/distribute:custom_training_loop_optimizer_test_2gpu \
+//tensorflow/python/keras/distribute:keras_metrics_test_2gpu \
+//tensorflow/python/keras/distribute:keras_models_test_2gpu \
+//tensorflow/python/keras/distribute:keras_optimizer_v2_test_2gpu \
+//tensorflow/python/keras/distribute:keras_stateful_lstm_model_correctness_test_2gpu \
+//tensorflow/python/keras/distribute:mirrored_strategy_test_2gpu \
+//tensorflow/python/keras/distribute:mirrored_variable_test_2gpu \
+//tensorflow/python/keras/distribute:multi_worker_test_2gpu \
+//tensorflow/python/keras/layers/preprocessing:category_crossing_distribution_test_2gpu \
+//tensorflow/python/keras/layers/preprocessing:category_encoding_distribution_test_2gpu \
+//tensorflow/python/keras/layers/preprocessing:discretization_distribution_test_2gpu \
+//tensorflow/python/keras/layers/preprocessing:hashing_distribution_test_2gpu \
+//tensorflow/python/keras/layers/preprocessing:image_preprocessing_distribution_test_2gpu \
+//tensorflow/python/keras/layers/preprocessing:index_lookup_distribution_test_2gpu \
+//tensorflow/python/keras/layers/preprocessing:text_vectorization_distribution_test_2gpu \
+//tensorflow/python/keras/utils:multi_gpu_utils_test_2gpu \
+//tensorflow/python/keras/utils:multi_gpu_utils_test_xla_2gpu \
+//tensorflow/python/kernel_tests:dynamic_partition_op_test_2gpu \
+//tensorflow/python/training:saver_test_2gpu \
+
+
+
+
+# no_rocm : //tensorflow/python/keras/distribute:keras_dnn_correctness_test_2gpu \
+# no_rocm : //tensorflow/python/keras/distribute:keras_embedding_model_correctness_test_2gpu \
+      
+# TIMEOUT : //tensorflow/python/distribute:values_test_2gpu \
+# TIMEOUT : //tensorflow/python/keras/distribute:keras_image_model_correctness_test_2gpu \
+# TIMEOUT : //tensorflow/python/keras/distribute:keras_rnn_model_correctness_test_2gpu \
+# TIMEOUT : //tensorflow/python/keras/distribute:saved_model_mixed_api_test_2gpu \
+# TIMEOUT : //tensorflow/python/keras/distribute:saved_model_save_load_test_2gpu \
+
+# Started timing-out with ROCm 4.1
+# TIMEOUT : //tensorflow/python/keras/distribute:keras_premade_models_test_2gpu \
+
+# Became FLAKY with ROCm 4.1
+# FLAKY : //tensorflow/python/distribute:strategy_common_test_2gpu \
+# FLAKY : //tensorflow/python/distribute:strategy_common_test_xla_2gpu \
+# FLAKY : //tensorflow/python/distribute:strategy_gather_test_2gpu \
+# FLAKY : //tensorflow/python/distribute:strategy_gather_test_xla_2gpu \
+# FLAKY : //tensorflow/python/keras/distribute:custom_training_loop_metrics_test_2gpu \
+# FLAKY : //tensorflow/python/keras/distribute:custom_training_loop_models_test_2gpu \
+
+# FAILED : //tensorflow/python/distribute/v1:cross_device_ops_test_2gpu \
+# FAILED : //tensorflow/python/distribute:cross_device_ops_test_2gpu \
+# FAILED : //tensorflow/python/distribute:mirrored_strategy_test_2gpu \
+# FAILED : //tensorflow/python/keras/distribute:distribute_strategy_test_2gpu \
+# FAILED : //tensorflow/python/kernel_tests:collective_ops_test_2gpu \
+# FAILED : //tensorflow/python:collective_ops_gpu_test_2gpu \
+# FAILED : //tensorflow/python:nccl_ops_test_2gpu \
+
+# FAILED ON CI Node only : //tensorflow/python/distribute:collective_all_reduce_strategy_test_2gpu \
+# See run : http://ml-ci.amd.com:21096/job/tensorflow/job/github-prs-rocmfork-develop-upstream/job/rocm-latest-ubuntu-gpu-multi/216/console
+
+# FAILED ON CI Node only : //tensorflow/python/keras/distribute:keras_save_load_test_2gpu \
+# Starting with ROCm 4.1, see run : http://ml-ci.amd.com:21096/job/tensorflow/job/github-prs-rocmfork-develop-upstream/job/rocm-latest-ubuntu-gpu-multi/241/console
+
+# FAILED : //tensorflow/python/keras/distribute:minimize_loss_test_2gpu \
+# potential breaking commit : https://github.com/tensorflow/tensorflow/commit/74e39c8fa60079862597c9db506cd15b2443a5a2
+
+# NO MORE MULTI_GPU : //tensorflow/python/keras/distribute:checkpointing_test_2gpu \
+# multi_gpu tag was commented out in this commit : https://github.com/tensorflow/tensorflow/commit/b87d02a3f8d8b55045bf4250dd72e746357a3eba
--- a/tensorflow/tools/ci_build/linux/rocm/run_gpu_single.sh
+++ b/tensorflow/tools/ci_build/linux/rocm/run_gpu_single.sh
@ -27,7 +27,7 @@ echo "Bazel will use ${N_BUILD_JOBS} concurrent build job(s) and ${N_TEST_JOBS}
 echo ""

 # First positional argument (if any) specifies the ROCM_INSTALL_DIR
-ROCM_INSTALL_DIR=/opt/rocm-4.0.1
+ROCM_INSTALL_DIR=/opt/rocm-4.1.0
 if [[ -n $1 ]]; then
    ROCM_INSTALL_DIR=$1
 fi
--- a/third_party/gpus/find_rocm_config.py
+++ b/third_party/gpus/find_rocm_config.py
@ -206,6 +206,28 @@ def _find_rocfft_config(rocm_install_path):
  return rocfft_config


+def _find_hipfft_config(rocm_install_path):
+
+  def hipfft_version_numbers(path):
+    version_file = os.path.join(path, "hipfft/include/hipfft-version.h")
+    if not os.path.exists(version_file):
+      raise ConfigError(
+          'hipfft version file "{}" not found'.format(version_file))
+    major = _get_header_version(version_file, "hipfftVersionMajor")
+    minor = _get_header_version(version_file, "hipfftVersionMinor")
+    patch = _get_header_version(version_file, "hipfftVersionPatch")
+    return major, minor, patch
+
+  major, minor, patch = hipfft_version_numbers(rocm_install_path)
+
+  hipfft_config = {
+      "hipfft_version_number":
+          _get_composite_version_number(major, minor, patch)
+  }
+
+  return hipfft_config
+
+
 def _find_roctracer_config(rocm_install_path):

  def roctracer_version_numbers(path):
@ -289,6 +311,8 @@ def find_rocm_config():
  result.update(_find_rocblas_config(rocm_install_path))
  result.update(_find_rocrand_config(rocm_install_path))
  result.update(_find_rocfft_config(rocm_install_path))
+  if result["rocm_version_number"] >= 40100:
+    result.update(_find_hipfft_config(rocm_install_path))
  result.update(_find_roctracer_config(rocm_install_path))
  result.update(_find_hipsparse_config(rocm_install_path))
  result.update(_find_rocsolver_config(rocm_install_path))
--- a/third_party/gpus/find_rocm_config.py.gz.base64
+++ b/third_party/gpus/find_rocm_config.py.gz.base64
@ -1 +1 @@
-eJy9Wm1v2zgS/q5fQSgoKm8cJe19uEUO+eBNs6j32iSws10smsCgbdrmRhZ1JJU0KPrfb4akZEqWEid2GqCoJQ0fDmeeeaGoPXIqsgfJ5wtN3h+9PyJXC0auWKqE/D0R96SX64WQKia9JCEDFFNkwBSTd2waB3vBHvnEJyDOpiRPp0wSDeN7GZ3Af+5Jl3xhUnGRkvfxEYlQIHSPws5/AOFB5GRJH0gqNMkVAwiuyIwnjLBvE5ZpwlMyEcss4TSdMHLP9cJM40BADfK3gxBjTUGagnwGVzNfjlBtFMa/hdbZ8eHh/f19TI2ysZDzw8QKqsNP/dOz8+HZAShshvyZJkwpItn/ci5hqeMHQjPQZ0LHoGVC74mQhM4lg2daoL73kmuezrtEiZm+p5IBypQrLfk41xVjFdrBmn0BMBdNSdgbkv4wJL/1hv1hFzD+6l99vPjzivzVGwx651f9syG5GJDTi/MP/av+xTlc/U5653+T//bPP3QJA1PBNOxbJlF/UJKjGY3ryJCxigIzYRVSGZvwGZ/AutJ5TueMzMUdkyksh2RMLrlCZypQbwooCV9yTbW5s7YonOZkp39BGIaXkqdIw4vTJUw/llQ+oDJkwSjOPwUXTbSQnBkdyZ1lH1BKgIJoWLPKB6XZMg4CJLyaSA48U4xK4IIypmiDR2KqKkoXPI5W0yqAm0ukwJRpNFVqTMxloYQByqz+OH4i0hmf59IYEMcpPRW5jo1WGUWiiwIcGeJ8gzRbSJHPF0gSlt5xKdIlSzW5o5IbUkag/+fRZe/qYycO+jMILniW8GltSu7M0rXLsXYoFDTqMCmNqyXTuTRuJ3ALDDQRU1a1n6a3zK6r8MGDpzEEDT4q9WrUO/bxEiFurTOs7a0/C59YR5hoX1A5PUB9puBDDXEfqHzs82AmxZKMqXJGdYlhpVupb0zAVisVwTyQlYJS0JgJwvJQZPpQiskyRJEc0x8FXTT4fUbzBNeT5CxAtgYBxJyQ4D5R/BKq+AV5wf0CJgVBMEkoxOmpcdEZWjk6MykQXNU5Dghor1AMZiGjOdMjN90IVRnh0iIjZn3lq+kPMsJAKk2TxBsEun4oWGstXbjcuW1J3CBLHRwZ4wJhvjoiOWnVD8T5jISljUN0oVCxYwNq0gy4kvnqjb4B+b02eaOiZDQxU68JdVamWnvmGwyLj1Bcs5EL41GaL8dMRkv6j5BdAhbD/2DYZOHb/90R/JFfiBEj+3iNVygNV0bcn8aSvJgjQi26JKVLVrhn4PwB2TwDhSGlAxAMNw4DK576Ocq5BqMnwTwEz7mIRcYK5FCGUCFSCBvI7CdhrmcHv4Yda/8l6gY2lCw2PyMZ7tmJyBtF9qPr6X4nJG+Mdl2D3zHjwLNG3qIQGwaAY27Gc0hZWfSu4x46K0GuiYxcJ0DTUQ5R6kfA22Lm8PsPE262ql2nb2NYHCBHxkZk38FW/0Ji+gosgJhnwAqms/j+A+rmdRoWEIYO7RAc0qFdi80ImAVBbzZnaF0IGpqEncKXoOzUMt7m2gbiHeNSUdg8qrJKRU7EGNA9MkqvOP2P4KUbY57OxKETPJiyu7D0BWpaDGHfoLtQkQ/YKdzUYHQT70XVMrMb8yOiKRil8SuAnYrSbj2g99cb88C0boaClVGYO2clZdwTbIWg4zghM4zgadSJ8U4WdWpyq0mqI2MFnY6OwtiZwwbhiaVbdezXoxsnY0KzWeadk8lcaDTJvL8pZj0IOyWq43lDskAWNNzGwGviRUMCK5KvZRoM/O7MEzYghMcvyGeA9yOoJUo7W4XvC57JPNV8yTZgvSdc0+B51AecQ55OknzKDvE3/CsA48UuwsDLB28/9i/JwGr90sDYg+0VJBAvS0PrYluqb7idgAaiSHaYt2HG0ZezwRAa+y65XwhQzvYVFguQIk9k9Ln3B+wFfjFlZp9UnvTPLwaFClBFJON3tvkvSpNlvi36BxOaTHKo8qAdk5DfFTf7LmjrhGmHLM4
9hXwKdMJIEAZsnCssRQr7q4zC6iF+oa+DmMcdiKqEYVPF8y0G3l1bW1iN0udCoBHCSki+q5Rng4sUXeOyH1et3IXoaud1Y+/hx9XanJXoWnLMmhtElhPcrqJYkDKy3OWrBdbn/gXgvzSmnkEomOjy7HxbTtVRPFoV2fv5KNDLnn4MX14vWvzeXDEqbPK53YgSHnvO2rp+VOaud0xj2P5s1jQZyUdYDvopDvvJkW9204mUiwkdTMlzd32wInrX5EvshP4V/7vYu0OOaIXAnlCmNHkC61eDcNMUj+citfkdC8AM+9XmpdTbJbxZ3/l4MT0r2iaI2kciduSZsEG5NdFScAw92q2fFnzRjdJAYctqHihTAFoCckKRCGqNerONnp8hwD+/feoNt00RazAvyxF1mG2TRFvctPaVXjzWWssGnN3miers9UQhIRY3SxRGcqsm04H4acIH3X09bAyEl+6+NmbaoHf+oWBalWNVQEcNzwM1ajRYHFqjFlc82RdVp6rzYDbTm9EABbdriiyIzwK4PHhNEgD+T2iKasYxI5+f8eooOPL5Ca+GYkZul++a/N6a7lZsqlF6HWXnyW41d53jWtIJmGwjmjvZrZlucXyy2zuvwnML/ROoDlnuatA7PRvsoL7XgbwKv7fyRHkmJZh9b7igsPGmjqHFil1i9YPlaBvSt7CglfcVhtWo34i1c/ZXNKi/V1L4LmHD10pOdrsAKHH8l0v2zusl/HKKnxAI5VzuZP7zy5L+GszLsn4d5nLLtN9Og+YIqFOs9ranEWu3EVDXoF4ClEjuNi0BTnbrEmBx/BJg77xqy2On+DmlYHjx6ctOSkEd6MWbvRrQDrZ7LWRorQQVotUqQSPWzitBRQMXB2sHavUDUUqmfIKH0vhdgJjZ1yxGkZThlwxuPXhQ9tSJdcPheNDG8aZAfILmb4flNwerrwwMw8sWwYCXNF+fw/qKKfzMADz0Y3X51R76aCGSW66NdHhTHCZVDreLEXGeTalm0SZnlp2WUZuc/LSNfeq9dtu4J98VPjLw8XcHjwx8dLP5yLinGvhHDPtU6/PIrE/VjI4fdwbCRduSQh0wTNby4bh8F3nLHrrFGXhKlJCaTaP1yIwh8pcq6pQ533xMFIVv1DF5o/CbgWiFZPR3H/l54YLnwe6loXpQsf0IKcZP6lgUXqdng8HF4BiYf516J/hKywgAO+UwiCONHxsEAUTvaIQfCYxG5OSEhKMRrnE0MsnLLjf4P/9j3RI=
+eJy9Wn9v2zgS/V+fglBQVN44StJbYBc55ABvmkV91yaBne1i0QYGbdM2t7KoJamkQdHvfjMkJVOylDix0wBFLWn4OJx58zj6sUfORHYv+XyhyZujN0fkesHINUuVkL8n4o70cr0QUsWklyRkgGaKDJhi8pZN42Av2CPv+QTM2ZTk6ZRJomF8L6MT+M9d6ZKPTCouUvImPiIRGoTuUtj5NyDci5ws6T1JhSa5YgDBFZnxhBH2dcIyTXhKJmKZJZymE0buuF6YaRwIuEH+chBirClYU7DP4Gjm2xGqjcP4t9A6Ozk8vLu7i6lxNhZyfphYQ3X4vn92fjE8PwCHzZA/0oQpRST7J+cSljq+JzQDfyZ0DF4m9I4ISehcMrimBfp7J7nm6bxLlJjpOyoZoEy50pKPc10JVuEdrNk3gHDRlIS9IekPQ/Jbb9gfdgHjz/71u8s/rsmfvcGgd3HdPx+SywE5u7x427/uX17A0e+kd/EX+V//4m2XMAgVTMO+ZhL9Byc5htGkjgwZqzgwE9YhlbEJn/EJrCud53TOyFzcMpnCckjG5JIrTKYC96aAkvAl11SbM2uLwmlOd/oXhGF4JXmKNLw8W8L0Y0nlPTpDFozi/FNI0UQLyZnxkdxa9gGlBDiIgTWrvFeaLeMgQMKrieTAM8WoBC4oE4o2eCSmqqJ0IeMYNa0COLlECkyZxlClJsRcFk4YoMz6j+MnIp3xeS5NAHGc0lOR69h4lVEkuijAkSEuN0izhRT5fIEkYektlyJdslSTWyq5IWUE/n8YXfWu33XioD+D4oJrCZ/WpuQuLF27HBuHwkHjDpPSpFoynUuTdgKnIEATMWXV+Gn6hdl1FTm49zyGosFLpV+Nfsc+XiLEF5sMG3ubzyInNhGm2hdUTg/QnynkUEPdByof+zyYSbEkY6pcUJ0wrHwr/Y0JxGrlIoQHVCkoDU2YoCwPRaYPpZgsQzTJUf4o+KIh7zOaJ7ieJGcBsjUIoOaEhPSJ4pdQxS/QBfcLmBQEwSShUKdnJkXnGOXo3EggpKpzEhDwXqEZzEJGc6ZHbroRujLCpUXGzObKd9MfZIyBVJomiTcIfH1bsNZGuki5S9uSuEGWOjgyxgXCfHVEctrqH5jzGQnLGIeYQqFixwb0pBlwZfPJG30D9ntt9sZFyWhipl4z6qxCtXbNDxhuPkJxzUaujEdpvhwzGS3p30J2CUQM/4Nhk4Uf/+Mj+CM/EWNG9vEYj9Aajoy5P40leTFHhF50SUqXrEjPwOUD1DwDh0HSAQiGm4RBFM98jXKpwepJUIfgOhexyFiBHMoQdogUygaU/TTM9ezg17Bj479E3yCGksXmZyTDPTsReaXIfvR5ut8JySvjXdfgd8w4yKyxtyjElgHgmJPxHCQri4477qKLEmhNZOw6AYaOcqhSvwJeFzOH376bcrO72uf0dQyLA+TIxIjsO9jqX0hMX4EbIOoMRMF0Ft++w775OQ0LCEOHdggOcmjXYhUBVRD8ZnOG0YWioUnYKXIJzk4t463WNhDvBJeKxuZSlVUqciYmgO6ScXrF6b8FL9MY83QmDp3hwZTdhmUu0NNiCPsK3YWKfMBOkaaGoJt6L3YtM7sJPyKaDaMMfgWwU3HarQf8/nRjLpjWzVCwMgq1c1ZSxl3BVgg6jlMywwqeRp0Yz2RRp2a3mqQ6MlbQ6egojF04bBGeWrpVx346unE2pjSbbY6dTeZKo8nmzU0x60HYKVEdzxvEAlnQcBoLr4kXDQJWiK9lGgz85sITNiCEJ8/QM8D7HtSE0s5W4fuCZzJPNV+yDVjvGdc8eBr1AeeQp5Mkn7JD/A3/CsB4sYsy8PTg9bv+FRlYr59bGHtwewUC4qk0tC62pfqKtxPQQBRih7oNM44+ng+G0Nh3yd1CgHO2r7BYgBR5JqMPvf/CvcBPZpvZJ5Ur/YvLQeEC7CKS8Vvb/Bdbk2W+3fQPJjSZ5LDLg3dMgr4rbu67oK0Tph2yOHc
U9BTohJUgDNg4V7gVKeyvMgqrh/qFvg5qHu9AVKUMm3Y8P2KQ3bW1hdUqfSoEBiGslORxZXs2uEjRNS77ddXKXaiudl439h5+Xa3NWamuJUfV3KCynOF2O4oFKSvLHb5YYX3oXwL+c2vqCYSCia7OL7blVB3Fo1Wh3k9HgV727F34/P2iJe/NO0aFTT63G1HCEy9ZW+8flbnrHdMYbn82a5qM5QMsB/8Uh/vJkR9204mUiwkdTMlzd3ywInrX6CV2Qv+Kfynu3UEjWiGwJ5QpTR7B+tUg3DTV44VIrb7jBjDDfrV5KfV2CU/W73y8mp4VbRNU7QMVO/JC2ODcmmlpOIYe7YsvC77pRjJQxLKqA6UEYCRAEwohqDXqzTF6ukJAfn573xtuKxFrMM/TiDrMtiLRVjetfaVXj7XWsgFntzpRnb0uFBJqcTOhMJZbNZkOxJcJH3T3+2FjITz37mtjpg16F28LplU5VgV01PAyUKNGQ8ShNWpJxaN9UXWqOg9mM70ZDdBwu6bIgvgsgMODlyQB4P+ApqgWHDPy6YpXR8GRTxe8GooZuZ3eNeW9Ve5WbKpReh1l52K3mrt+W70Zx53hdhy3IP4t9Yty3OL/AI7bidxryA/PY3gV43n8rmBcbcnulow3s7vCo9rt7EuzuzJ3XcG1pBMI2EYi7my31nGL40u5PfMiKm6hfwDJYQ+/HvTOzgc76F7rQF7/urfKRPnGVTD7VHxBbxmhjqHFil3b4JfK0TaS3sKCVlWvMKwm7I1YO9f2igd1eVf4pGzDh6bOdmuRtzi+ztszLyr1doofo/Z2rq0FvwrzbM2vwOxA9lto0Kr8FYrVxL8Ra+f6X/GgvgUokdxuugU42623AIvjbwH2zIs29HaKH7MVDC/ff9zJVlAHevajjBrQDh5mtJChdSeoEK22EzRi7XwnqHjg6mDtdXH9dT8lUz7BTy7wqxcxsw8RjSMpw+903HrwNfBj32M0fPoRtHG8qRAfofnrYflFzeobGsPwskUw4CXN1+ewuWIKP6KBDH1fHX6yrzS1EMkXro11eFO8Kq18ulGMiPNsSjWLNnkj32kZtcl7zbaxj721aRv36JPwBwY+/GTsgYEP3mY6llQSUSuVG/KfU/Lz0fHRkaNJczAfnabFvcfuEx7I32Md1gOzPrY1dfzyNhCuqJcUthtTMFren5QP9L+w+27xIUlKlJCaTaN1AYhBYJYq6pRbi/kiLwpfqRPySuGHN9EKyfjvvpT1qhI/qnBP3tW9iu2XfDF+l8qi8HN6PhhcDk6gwD6n3mcwSssIADvlMChXjV/sBAGkfzTCL21GI3J6SsLRCNc4GhmNtMsN/g935/24
--- a/third_party/gpus/rocm/BUILD.tpl
+++ b/third_party/gpus/rocm/BUILD.tpl
@ -51,9 +51,9 @@ cc_library(
 )

 cc_library(
-    name = "rocfft",
-    srcs = ["rocm/lib/%{rocfft_lib}"],
-    data = ["rocm/lib/%{rocfft_lib}"],
+    name = "%{hipfft_or_rocfft}",
+    srcs = ["rocm/lib/%{hipfft_or_rocfft_lib}"],
+    data = ["rocm/lib/%{hipfft_or_rocfft_lib}"],
    includes = [
        ".",
        "rocm/include",
@ -106,7 +106,7 @@ cc_library(
        ":rocm_headers",
        ":hip",
        ":rocblas",
-        ":rocfft",
+        ":%{hipfft_or_rocfft}",
        ":hiprand",
        ":miopen",
        ":hipsparse",
--- a/third_party/gpus/rocm_configure.bzl
+++ b/third_party/gpus/rocm_configure.bzl
@ -187,6 +187,7 @@ def _rocm_include_path(repository_ctx, rocm_config, bash_bin):
    inc_dirs.append(rocm_toolkit_path + "/llvm/lib/clang/10.0.0/include")
    inc_dirs.append(rocm_toolkit_path + "/llvm/lib/clang/11.0.0/include")
    inc_dirs.append(rocm_toolkit_path + "/llvm/lib/clang/12.0.0/include")
+    inc_dirs.append(rocm_toolkit_path + "/llvm/lib/clang/13.0.0/include")

    # Support hcc based off clang 10.0.0 (for ROCm 3.3)
    inc_dirs.append(rocm_toolkit_path + "/hcc/compiler/lib/clang/10.0.0/include/")
@ -310,7 +311,7 @@ def _select_rocm_lib_paths(repository_ctx, libs_paths, bash_bin):

    return libs

-def _find_libs(repository_ctx, rocm_config, bash_bin):
+def _find_libs(repository_ctx, rocm_config, hipfft_or_rocfft, bash_bin):
    """Returns the ROCm libraries on the system.

    Args:
@ -327,7 +328,7 @@ def _find_libs(repository_ctx, rocm_config, bash_bin):
        for name, path in [
            ("amdhip64", rocm_config.rocm_toolkit_path + "/hip"),
            ("rocblas", rocm_config.rocm_toolkit_path + "/rocblas"),
-            ("rocfft", rocm_config.rocm_toolkit_path + "/rocfft"),
+            (hipfft_or_rocfft, rocm_config.rocm_toolkit_path + "/" + hipfft_or_rocfft),
            ("hiprand", rocm_config.rocm_toolkit_path + "/hiprand"),
            ("MIOpen", rocm_config.rocm_toolkit_path + "/miopen"),
            ("rccl", rocm_config.rocm_toolkit_path + "/rccl"),
@ -456,7 +457,8 @@ def _create_dummy_repository(repository_ctx):
            "%{rocblas_lib}": _lib_name("rocblas"),
            "%{miopen_lib}": _lib_name("miopen"),
            "%{rccl_lib}": _lib_name("rccl"),
-            "%{rocfft_lib}": _lib_name("rocfft"),
+            "%{hipfft_or_rocfft}": "hipfft",
+            "%{hipfft_or_rocfft_lib}": _lib_name("hipfft"),
            "%{hiprand_lib}": _lib_name("hiprand"),
            "%{hipsparse_lib}": _lib_name("hipsparse"),
            "%{roctracer_lib}": _lib_name("roctracer64"),
@ -537,6 +539,10 @@ def _create_local_rocm_repository(repository_ctx):
    bash_bin = get_bash_bin(repository_ctx)
    rocm_config = _get_rocm_config(repository_ctx, bash_bin, find_rocm_config_script)

+    # For ROCm 4.1 and above use hipfft, older ROCm versions use rocfft
+    rocm_version_number = int(rocm_config.rocm_version_number)
+    hipfft_or_rocfft = "rocfft" if rocm_version_number < 40100 else "hipfft"
+
    # Copy header and library files to execroot.
    # rocm_toolkit_path
    rocm_toolkit_path = rocm_config.rocm_toolkit_path
@ -550,9 +556,9 @@ def _create_local_rocm_repository(repository_ctx):
        ),
        make_copy_dir_rule(
            repository_ctx,
-            name = "rocfft-include",
-            src_dir = rocm_toolkit_path + "/rocfft/include",
-            out_dir = "rocm/include/rocfft",
+            name = hipfft_or_rocfft + "-include",
+            src_dir = rocm_toolkit_path + "/" + hipfft_or_rocfft + "/include",
+            out_dir = "rocm/include/" + hipfft_or_rocfft,
        ),
        make_copy_dir_rule(
            repository_ctx,
@ -586,7 +592,7 @@ def _create_local_rocm_repository(repository_ctx):
        ),
    ]

-    rocm_libs = _find_libs(repository_ctx, rocm_config, bash_bin)
+    rocm_libs = _find_libs(repository_ctx, rocm_config, hipfft_or_rocfft, bash_bin)
    rocm_lib_srcs = []
    rocm_lib_outs = []
    for lib in rocm_libs.values():
@ -632,7 +638,8 @@ def _create_local_rocm_repository(repository_ctx):
        {
            "%{hip_lib}": rocm_libs["amdhip64"].file_name,
            "%{rocblas_lib}": rocm_libs["rocblas"].file_name,
-            "%{rocfft_lib}": rocm_libs["rocfft"].file_name,
+            "%{hipfft_or_rocfft}": hipfft_or_rocfft,
+            "%{hipfft_or_rocfft_lib}": rocm_libs[hipfft_or_rocfft].file_name,
            "%{hiprand_lib}": rocm_libs["hiprand"].file_name,
            "%{miopen_lib}": rocm_libs["MIOpen"].file_name,
            "%{rccl_lib}": rocm_libs["rccl"].file_name,
@ -641,7 +648,7 @@ def _create_local_rocm_repository(repository_ctx):
            "%{rocsolver_lib}": rocm_libs["rocsolver"].file_name,
            "%{copy_rules}": "\n".join(copy_rules),
            "%{rocm_headers}": ('":rocm-include",\n' +
-                                '":rocfft-include",\n' +
+                                '":' + hipfft_or_rocfft + '-include",\n' +
                                '":rocblas-include",\n' +
                                '":miopen-include",\n' +
                                '":rccl-include",\n' +