mirror of https://github.com/zebrajr/tensorflow.git
synced 2025-12-06 12:20:11 +01:00
Merge pull request #48447 from ROCmSoftwarePlatform/google-upstream-rocm-amp-r2.5
[r2.5 port][ROCm] Port PR#47650 and PR#48441 to r2.5
commit f09d694a57
@@ -56,6 +56,50 @@ const char kCastToFp16[] = "CastToFp16";
 const char kCastToBf16[] = "CastToBf16";
 const char kCastToFp32[] = "CastToFp32";
 
+// Returns the GPU architecture (compute capability) as a (major, minor) pair.
+std::pair<int, int> GetDeviceGPUArch(
+    const DeviceProperties& device_properties) {
+  if (device_properties.type() != "GPU") return {0, 0};
+  string arch_str = device_properties.environment().at("architecture");
+  std::vector<string> split_arch_str = str_util::Split(arch_str, '.');
+  if (split_arch_str.empty()) {
+    return {0, 0};
+  }
+
+  int major, minor;
+  if (!strings::safe_strto32(split_arch_str[0], &major)) {
+    return {0, 0};
+  }
+
+  if (split_arch_str.size() > 1) {
+    if (strings::safe_strto32(split_arch_str[1], &minor)) {
+      return {major, minor};
+    } else {
+      return {0, 0};
+    }
+  } else {
+    return {major, 0};
+  }
+}
+
+// Returns true if the device supports fast FP16 computation. For CUDA, the
+// GPU architecture reported by GetDeviceGPUArch must be >= kMinGPUArch. For
+// AMD, the gfx arch string of the detected GPU must be in the list of
+// FP16-capable devices. Returns false otherwise.
+bool HasFastFP16Support(const DeviceProperties& props) {
+#if GOOGLE_CUDA
+  return GetDeviceGPUArch(props) >= kMinGPUArch;
+#elif TENSORFLOW_USE_ROCM
+  absl::flat_hash_set<std::string> FP16SupportedDevices = {{"gfx906"},
+                                                           {"gfx908"}};
+  std::string gcnArchName = props.environment().at("architecture");
+  std::vector<std::string> gpu_arch = absl::StrSplit(gcnArchName, ":");
+  return !gpu_arch.empty() && FP16SupportedDevices.contains(gpu_arch[0]);
+#endif
+  return false;
+}
+
 // Instances of this class represent unique type attribute identifiers within a
 // node. It handles regular type attributes, list type attributes (where
 // type_index is set to the index in the type list), and fixed types.
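The parsing behavior above is worth seeing in isolation: a CUDA device reports its compute capability as a dotted numeric string, while a ROCm device reports a gfx name that fails the numeric parse and falls back to {0, 0}, which is why HasFastFP16Support handles ROCm on a separate path. Below is a minimal standalone sketch (plain C++, no TensorFlow headers); Split and SafeStrTo32 are hypothetical stand-ins for str_util::Split and strings::safe_strto32, and std::stoi only approximates safe_strto32's strictness.

#include <cstdio>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

// Splits "7.5" into {"7", "5"}; stand-in for str_util::Split.
static std::vector<std::string> Split(const std::string& s, char delim) {
  std::vector<std::string> parts;
  std::stringstream ss(s);
  std::string part;
  while (std::getline(ss, part, delim)) parts.push_back(part);
  return parts;
}

// Base-10 parse that must consume the whole token; stand-in for
// strings::safe_strto32.
static bool SafeStrTo32(const std::string& s, int* out) {
  if (s.empty()) return false;
  size_t pos = 0;
  try {
    *out = std::stoi(s, &pos);
  } catch (...) {
    return false;
  }
  return pos == s.size();
}

// Mirrors the control flow of GetDeviceGPUArch in the diff.
static std::pair<int, int> ParseArch(const std::string& arch_str) {
  std::vector<std::string> split = Split(arch_str, '.');
  if (split.empty()) return {0, 0};
  int major = 0, minor = 0;
  if (!SafeStrTo32(split[0], &major)) return {0, 0};
  if (split.size() > 1 && !SafeStrTo32(split[1], &minor)) return {0, 0};
  return {major, minor};
}

int main() {
  // CUDA reports compute capability, e.g. "7.0" -> {7, 0}.
  auto cc = ParseArch("7.0");
  std::printf("7.0 -> {%d, %d}\n", cc.first, cc.second);
  // ROCm reports a gfx string, e.g. "gfx906", which fails the numeric parse
  // and yields {0, 0} -- hence the separate ROCm path in HasFastFP16Support.
  auto gfx = ParseArch("gfx906");
  std::printf("gfx906 -> {%d, %d}\n", gfx.first, gfx.second);
  return 0;
}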
@@ -1133,34 +1177,8 @@ bool AutoMixedPrecisionImpl::IsOnDevice(const NodeDef& node,
   return false;
 }
 
-// Returns the GPU architecture (compute capability) as a (major, minor) pair.
-std::pair<int, int> GetDeviceGPUArch(
-    const DeviceProperties& device_properties) {
-  if (device_properties.type() != "GPU") return {0, 0};
-  string arch_str = device_properties.environment().at("architecture");
-  std::vector<string> split_arch_str = str_util::Split(arch_str, '.');
-  if (split_arch_str.empty()) {
-    return {0, 0};
-  }
-
-  int major, minor;
-  if (!strings::safe_strto32(split_arch_str[0], &major)) {
-    return {0, 0};
-  }
-
-  if (split_arch_str.size() > 1) {
-    if (strings::safe_strto32(split_arch_str[1], &minor)) {
-      return {major, minor};
-    } else {
-      return {0, 0};
-    }
-  } else {
-    return {major, 0};
-  }
-}
-
 bool AutoMixedPrecisionImpl::IsOnSuitableGPUArch(const NodeDef& node) const {
-  return GetDeviceGPUArch(virtual_placer_.get_device(node)) >= kMinGPUArch;
+  return HasFastFP16Support(virtual_placer_.get_device(node));
 }
 
 bool AutoMixedPrecisionImpl::ShouldProcess(const NodeDef& node) const {
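The `>=` in the removed line (and in the CUDA path of HasFastFP16Support) relies on std::pair's lexicographic ordering: major is compared first, minor breaks ties. A quick self-contained check, using kMinGPUArch = {7, 0} as declared later in the test file:

#include <cassert>
#include <utility>

int main() {
  const std::pair<int, int> kMinGPUArch = {7, 0};   // value from the diff
  assert((std::make_pair(7, 0) >= kMinGPUArch));    // Volta: eligible
  assert((std::make_pair(8, 6) >= kMinGPUArch));    // Ampere: eligible
  assert(!(std::make_pair(6, 1) >= kMinGPUArch));   // Pascal: not eligible
  assert(!(std::make_pair(0, 0) >= kMinGPUArch));   // unparsable arch: not eligible
  return 0;
}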
@@ -1964,14 +1982,13 @@ Status AutoMixedPrecisionImpl::ChangeTypeAttrsAndAddCasts(
   return Status::OK();
 }
 
-int GetNumGPUs(const Cluster& cluster,
-               const std::pair<int, int>& min_arch = {0, 0}) {
+int GetNumGPUs(const Cluster& cluster) {
   auto devices = cluster.GetDevices();
   int num_gpus = 0;
   for (const auto& device : devices) {
     const DeviceProperties& device_properties = device.second;
-    std::pair<int, int> arch = GetDeviceGPUArch(device_properties);
-    if (device_properties.type() == "GPU" && arch >= min_arch) {
+    if (device_properties.type() == "GPU" &&
+        (ShouldIgnorePerformance() || HasFastFP16Support(device_properties))) {
       num_gpus++;
     }
   }
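The new GetNumGPUs folds the eligibility test into the loop: a device counts if it is a GPU and either the performance check is being ignored or it has fast FP16 support. A minimal sketch of the same logic over a hypothetical device map (Device stands in for DeviceProperties):

#include <algorithm>
#include <map>
#include <string>

// Hypothetical stand-in for DeviceProperties.
struct Device {
  std::string type;
  bool fast_fp16;
};

// Mirrors the counting logic of the new GetNumGPUs.
int CountEligibleGPUs(const std::map<std::string, Device>& devices,
                      bool ignore_performance) {
  return std::count_if(devices.begin(), devices.end(), [&](const auto& kv) {
    const Device& d = kv.second;
    return d.type == "GPU" && (ignore_performance || d.fast_fp16);
  });
}

int main() {
  std::map<std::string, Device> devices = {{"/GPU:0", {"GPU", true}},
                                           {"/GPU:1", {"GPU", false}},
                                           {"/CPU:0", {"CPU", false}}};
  // Counts 1 normally, 2 when the performance check is ignored.
  return CountEligibleGPUs(devices, false) == 1 &&
                 CountEligibleGPUs(devices, true) == 2
             ? 0
             : 1;
}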
@@ -2001,8 +2018,7 @@ Status AutoMixedPrecision::Optimize(Cluster* cluster, const GrapplerItem& item,
   // Start by copying input graph to output.
   *output = item.graph;
 
-  int num_gpus = ShouldIgnorePerformance() ? GetNumGPUs(*cluster)
-                                           : GetNumGPUs(*cluster, kMinGPUArch);
+  int num_gpus = GetNumGPUs(*cluster);
   if (num_gpus < 1 && mode_ == AutoMixedPrecisionMode::CUDA) {
     // AutoMixedPrecision is currently only tuned for GPU.
     LOG(WARNING) << "No (suitable) GPUs detected, skipping " << name()
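ShouldIgnorePerformance() is the escape hatch that admits GPUs without fast FP16 support. A plausible minimal sketch of such a check, reading an environment variable once and caching the result; the variable name and the use of plain std::getenv are assumptions here (TensorFlow has its own env-var helpers):

#include <cstdlib>

// Sketch of an env-var-driven override, cached on first use. The variable
// name below is an assumption for illustration, not a confirmed API.
static bool ShouldIgnorePerformanceSketch() {
  static const bool ignore = [] {
    const char* v = std::getenv(
        "TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_IGNORE_PERFORMANCE");
    return v != nullptr && v[0] != '\0' && v[0] != '0';
  }();
  return ignore;
}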
@@ -129,8 +129,12 @@ class AutoMixedPrecisionListsCuda : public AutoMixedPrecisionLists {
         "LSTMBlockCellGrad",
         "MatMul",
     };
+#if TENSORFLOW_USE_ROCM
+    if (true) {
+#else
     if (cuda_version_ >= 9010) {
       // Fp16 BatchMatMul is slow before CUDA 9.1.
+#endif
       list.insert("BatchMatMul");
       list.insert("BatchMatMulV2");
     }
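The preprocessor guard selects which `if` header the compiler sees, while both builds share the body and the single closing brace, so the block stays balanced either way. A self-contained sketch of the same pattern (cuda_version_ is a member of the real class; here it is a local placeholder):

#include <cstdio>
#include <set>
#include <string>

int main() {
  std::set<std::string> list;
  int cuda_version_ = 9010;  // placeholder for the real class member
#if TENSORFLOW_USE_ROCM
  if (true) {                   // ROCm: BatchMatMul is always enabled
#else
  if (cuda_version_ >= 9010) {  // CUDA: Fp16 BatchMatMul is slow before 9.1
#endif
    list.insert("BatchMatMul");
    list.insert("BatchMatMulV2");
  }
  std::printf("%zu ops allowed\n", list.size());
  return 0;
}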
@@ -85,10 +85,10 @@ void VerifyGraphsEquivalent(const GraphDef& original_graph,
   }
 }
 
-// Currently, this test suite only passes when TensorFlow passes with CUDA,
+// Currently, this test suite only passes when TensorFlow passes with CUDA/HIP,
 // because otherwise the optimizer will not turn clearlist nodes to float16.
 // When looking at clearlist nodes, this optimizer checks if the nodes have a
-// float16 GPU OpKernel, but without CUDA there are no GPU OpKernels at all.
+// float16 GPU OpKernel, but without CUDA/HIP there are no GPU OpKernels at all.
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 const std::pair<int, int> kMinGPUArch = {7, 0};
@@ -102,6 +102,8 @@ class AutoMixedPrecisionTest : public GrapplerTest {
+#if GOOGLE_CUDA
     gpu_available_ =
         gpu_available_ && (num_gpus == GetNumAvailableGPUs(kMinGPUArch));
+#else
+    gpu_available_ = false;
+#endif
     if (gpu_available_) {
       virtual_cluster_.reset(new SingleMachine(/* timeout_s = */ 10, 1, 1));
@@ -111,6 +113,8 @@ class AutoMixedPrecisionTest : public GrapplerTest {
+#if GOOGLE_CUDA
     device_properties.mutable_environment()->insert({"architecture", "7"});
     device_properties.mutable_environment()->insert({"cuda", "9010"});
+#else
+    device_properties.mutable_environment()->insert({"architecture", "gfx906"});
+#endif
     virtual_cluster_.reset(
         new VirtualCluster({{"/GPU:1", device_properties}}));
@@ -1035,6 +1039,15 @@ int GetCudaVersion(const Cluster& cluster) {
   return 0;
 }
 
+bool IsSupportedGPU(const Cluster& cluster) {
+#ifdef GOOGLE_CUDA
+  return GetCudaVersion(cluster) >= 9010;
+#else
+  return true;
+#endif
+}
+
+
 TEST_F(AutoMixedPrecisionTest, BatchMatMul) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output input = ops::Const(s.WithOpName("input"), 1.f / 33, {64, 32, 32});
@@ -1054,7 +1067,7 @@ TEST_F(AutoMixedPrecisionTest, BatchMatMul) {
 
   GraphView output_view(&output);
   EXPECT_EQ(output_view.GetNode("input")->attr().at("dtype").type(), DT_FLOAT);
-  if (GetCudaVersion(*virtual_cluster_.get()) >= 9010) {
+  if (IsSupportedGPU(*virtual_cluster_.get())) {
     EXPECT_EQ(output.node_size(), item.graph.node_size() + 2);
     EXPECT_EQ(output_view.GetNode("allow1")->attr().at("T").type(), DT_HALF);
   } else {