Merge pull request #48447 from ROCmSoftwarePlatform/google-upstream-rocm-amp-r2.5

[r2.5 port][ROCm] Port PR#47650 and PR#48441 to r2.5
Mihai Maruseac 2021-04-22 15:29:34 -07:00 committed by GitHub
commit f09d694a57
3 changed files with 69 additions and 36 deletions

tensorflow/core/grappler/optimizers/auto_mixed_precision.cc

@@ -56,6 +56,50 @@ const char kCastToFp16[] = "CastToFp16";
 const char kCastToBf16[] = "CastToBf16";
 const char kCastToFp32[] = "CastToFp32";
 
+// Returns the GPU architecture (compute capability) as a (major, minor) pair.
+std::pair<int, int> GetDeviceGPUArch(
+    const DeviceProperties& device_properties) {
+  if (device_properties.type() != "GPU") return {0, 0};
+  string arch_str = device_properties.environment().at("architecture");
+  std::vector<string> split_arch_str = str_util::Split(arch_str, '.');
+  if (split_arch_str.empty()) {
+    return {0, 0};
+  }
+  int major, minor;
+  if (!strings::safe_strto32(split_arch_str[0], &major)) {
+    return {0, 0};
+  }
+  if (split_arch_str.size() > 1) {
+    if (strings::safe_strto32(split_arch_str[1], &minor)) {
+      return {major, minor};
+    } else {
+      return {0, 0};
+    }
+  } else {
+    return {major, 0};
+  }
+}
+
+// Returns true if the device supports fast FP16 math. For CUDA, the GPU
+// architecture must be at least kMinGPUArch; for ROCm, the gfx arch string of
+// the detected AMD GPU must be in the set of FP16-capable devices. Returns
+// false otherwise.
+bool HasFastFP16Support(const DeviceProperties& props) {
+#if GOOGLE_CUDA
+  return GetDeviceGPUArch(props) >= kMinGPUArch;
+#elif TENSORFLOW_USE_ROCM
+  absl::flat_hash_set<std::string> FP16SupportedDevices = {{"gfx906"},
+                                                           {"gfx908"}};
+  std::string gcnArchName = props.environment().at("architecture");
+  std::vector<std::string> gpu_arch = absl::StrSplit(gcnArchName, ":");
+  return !gpu_arch.empty() && FP16SupportedDevices.contains(gpu_arch[0]);
+#endif
+  return false;
+}
+
 // Instances of this class represent unique type attribute identifiers within a
 // node. It handles regular type attributes, list type attributes (where
 // type_index is set to the index in the type list), and fixed types.
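
The port has to reconcile two architecture-string formats: CUDA reports a dotted compute capability such as "7.5", while ROCm's gcnArchName is colon-delimited, e.g. "gfx906:sramecc+:xnack-", which is why HasFastFP16Support splits on ':' and matches the leading gfx token rather than comparing numbers. Below is a minimal standalone sketch of the CUDA-side parsing, using only the STL in place of str_util::Split and strings::safe_strto32 (example values are illustrative):

```cpp
#include <iostream>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

// STL-only re-creation of GetDeviceGPUArch's parsing: split on '.' and
// convert the first two tokens, treating any parse failure as {0, 0}.
std::pair<int, int> ParseCudaArch(const std::string& arch_str) {
  std::vector<std::string> parts;
  std::stringstream ss(arch_str);
  for (std::string tok; std::getline(ss, tok, '.');) parts.push_back(tok);
  if (parts.empty()) return {0, 0};
  try {
    int major = std::stoi(parts[0]);
    int minor = parts.size() > 1 ? std::stoi(parts[1]) : 0;
    return {major, minor};
  } catch (...) {
    return {0, 0};  // Roughly mirrors the safe_strto32 failure paths above.
  }
}

int main() {
  std::pair<int, int> arch = ParseCudaArch("7.5");
  // std::pair comparison is lexicographic, so {7, 5} >= {7, 0}: this is
  // exactly how the `GetDeviceGPUArch(props) >= kMinGPUArch` check works.
  std::cout << arch.first << "." << arch.second << " fast-fp16: "
            << (arch >= std::make_pair(7, 0)) << "\n";  // 7.5 fast-fp16: 1
}
```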
@@ -1133,34 +1177,8 @@ bool AutoMixedPrecisionImpl::IsOnDevice(const NodeDef& node,
   return false;
 }
 
-// Returns the GPU architecture (compute capability) as a (major, minor) pair.
-std::pair<int, int> GetDeviceGPUArch(
-    const DeviceProperties& device_properties) {
-  if (device_properties.type() != "GPU") return {0, 0};
-  string arch_str = device_properties.environment().at("architecture");
-  std::vector<string> split_arch_str = str_util::Split(arch_str, '.');
-  if (split_arch_str.empty()) {
-    return {0, 0};
-  }
-  int major, minor;
-  if (!strings::safe_strto32(split_arch_str[0], &major)) {
-    return {0, 0};
-  }
-  if (split_arch_str.size() > 1) {
-    if (strings::safe_strto32(split_arch_str[1], &minor)) {
-      return {major, minor};
-    } else {
-      return {0, 0};
-    }
-  } else {
-    return {major, 0};
-  }
-}
-
 bool AutoMixedPrecisionImpl::IsOnSuitableGPUArch(const NodeDef& node) const {
-  return GetDeviceGPUArch(virtual_placer_.get_device(node)) >= kMinGPUArch;
+  return HasFastFP16Support(virtual_placer_.get_device(node));
 }
 
 bool AutoMixedPrecisionImpl::ShouldProcess(const NodeDef& node) const {
@@ -1964,14 +1982,13 @@ Status AutoMixedPrecisionImpl::ChangeTypeAttrsAndAddCasts(
   return Status::OK();
 }
 
-int GetNumGPUs(const Cluster& cluster,
-               const std::pair<int, int>& min_arch = {0, 0}) {
+int GetNumGPUs(const Cluster& cluster) {
   auto devices = cluster.GetDevices();
   int num_gpus = 0;
   for (const auto& device : devices) {
     const DeviceProperties& device_properties = device.second;
-    std::pair<int, int> arch = GetDeviceGPUArch(device_properties);
-    if (device_properties.type() == "GPU" && arch >= min_arch) {
+    if (device_properties.type() == "GPU" &&
+        (ShouldIgnorePerformance() || HasFastFP16Support(device_properties))) {
       num_gpus++;
     }
   }
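
The net effect: GetNumGPUs loses its CUDA-only min_arch parameter, and device eligibility flows through the same vendor-aware predicate used for node placement. A self-contained model of the new counting logic (FakeDevice and the crude prefix checks are illustrative stand-ins, not TensorFlow types):

```cpp
#include <iostream>
#include <map>
#include <string>

// Illustrative stand-in for DeviceProperties; not the TensorFlow proto.
struct FakeDevice {
  std::string type;          // "GPU" or "CPU"
  std::string architecture;  // "7.5" (CUDA) or "gfx906:..." (ROCm)
};

// Crude stand-in for HasFastFP16Support; the real code parses properly.
bool FakeHasFastFP16Support(const FakeDevice& d) {
  return d.architecture.rfind("gfx906", 0) == 0 ||
         d.architecture.rfind("gfx908", 0) == 0 ||
         d.architecture.rfind("7", 0) == 0;
}

int CountEligibleGPUs(const std::map<std::string, FakeDevice>& devices,
                      bool ignore_performance) {
  int num_gpus = 0;
  for (const auto& kv : devices) {
    const FakeDevice& dev = kv.second;
    // Same shape as the new predicate: any GPU counts when performance
    // checks are disabled; otherwise only fast-FP16 GPUs count.
    if (dev.type == "GPU" &&
        (ignore_performance || FakeHasFastFP16Support(dev))) {
      ++num_gpus;
    }
  }
  return num_gpus;
}

int main() {
  std::map<std::string, FakeDevice> devices = {
      {"/GPU:0", {"GPU", "gfx906:sramecc+:xnack-"}},  // always counts
      {"/GPU:1", {"GPU", "gfx803"}},                  // counts only if forced
      {"/CPU:0", {"CPU", ""}}};                       // never counts
  std::cout << CountEligibleGPUs(devices, false) << " "   // prints 1
            << CountEligibleGPUs(devices, true) << "\n";  // prints 2
}
```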
@@ -2001,8 +2018,7 @@ Status AutoMixedPrecision::Optimize(Cluster* cluster, const GrapplerItem& item,
   // Start by copying input graph to output.
   *output = item.graph;
 
-  int num_gpus = ShouldIgnorePerformance() ? GetNumGPUs(*cluster)
-                                           : GetNumGPUs(*cluster, kMinGPUArch);
+  int num_gpus = GetNumGPUs(*cluster);
 
   if (num_gpus < 1 && mode_ == AutoMixedPrecisionMode::CUDA) {
     // AutoMixedPrecision is currently only tuned for GPU.
     LOG(WARNING) << "No (suitable) GPUs detected, skipping " << name()
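
ShouldIgnorePerformance() itself is unchanged by this diff; in TensorFlow it reports whether the user set the TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_IGNORE_PERFORMANCE environment variable, which forces the rewrite onto GPUs without fast FP16. A generic sketch of that compute-once pattern, using std::getenv in place of TensorFlow's ReadBoolFromEnvVar:

```cpp
#include <cstdlib>
#include <cstring>

// Compute-once boolean env-var gate, in the style of ShouldIgnorePerformance();
// the real implementation goes through TensorFlow's ReadBoolFromEnvVar.
bool IgnorePerformanceChecks() {
  static const bool enabled = [] {
    const char* v = std::getenv(
        "TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_IGNORE_PERFORMANCE");
    // Treat "1" or "true" as enabled; anything else (or unset) as disabled.
    return v != nullptr &&
           (std::strcmp(v, "1") == 0 || std::strcmp(v, "true") == 0);
  }();
  return enabled;  // Cached: changing the env var later has no effect.
}
```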

tensorflow/core/grappler/optimizers/auto_mixed_precision_lists.h

@@ -129,8 +129,12 @@ class AutoMixedPrecisionListsCuda : public AutoMixedPrecisionLists {
         "LSTMBlockCellGrad",
         "MatMul",
     };
+#if TENSORFLOW_USE_ROCM
+    if (true) {
+#else
     if (cuda_version_ >= 9010) {
       // Fp16 BatchMatMul is slow before CUDA 9.1.
+#endif
       list.insert("BatchMatMul");
       list.insert("BatchMatMulV2");
     }
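
Note the shape of this hunk: the #if swaps only the if condition, while the body and its closing brace stay shared, so ROCm builds always allow-list BatchMatMul and CUDA builds keep the 9.1 gate. A toy, compilable reproduction of that pattern (only TENSORFLOW_USE_ROCM and the 9010 threshold come from the diff; the rest is made up for illustration):

```cpp
#include <iostream>
#include <set>
#include <string>

// Shared-body #if pattern: compile with -DTENSORFLOW_USE_ROCM=1 to take the
// ROCm branch; an undefined macro evaluates to 0 in #if, selecting #else.
void AddBatchMatMulOps(std::set<std::string>& list, int cuda_version) {
#if TENSORFLOW_USE_ROCM
  if (true) {  // ROCm: BatchMatMul is always considered fast in fp16.
#else
  if (cuda_version >= 9010) {  // Fp16 BatchMatMul is slow before CUDA 9.1.
#endif
    list.insert("BatchMatMul");
    list.insert("BatchMatMulV2");
  }
}

int main() {
  std::set<std::string> list;
  AddBatchMatMulOps(list, /*cuda_version=*/9000);
  std::cout << list.size() << "\n";  // 0 on CUDA < 9.1, 2 on ROCm builds
}
```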

tensorflow/core/grappler/optimizers/auto_mixed_precision_test.cc

@@ -85,10 +85,10 @@ void VerifyGraphsEquivalent(const GraphDef& original_graph,
   }
 }
 
-// Currently, this test suite only passes when TensorFlow passes with CUDA,
+// Currently, this test suite only passes when TensorFlow passes with CUDA/HIP,
 // because otherwise the optimizer will not turn clearlist nodes to float16.
 // When looking at clearlist nodes, this optimizer checks if the nodes have a
-// float16 GPU OpKernel, but without CUDA there are no GPU OpKernels at all.
+// float16 GPU OpKernel, but without CUDA/HIP there are no GPU OpKernels at all.
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 const std::pair<int, int> kMinGPUArch = {7, 0};
@@ -102,6 +102,8 @@ class AutoMixedPrecisionTest : public GrapplerTest {
 #if GOOGLE_CUDA
     gpu_available_ =
         gpu_available_ && (num_gpus == GetNumAvailableGPUs(kMinGPUArch));
+#else
+    gpu_available_ = false;
+#endif
     if (gpu_available_) {
       virtual_cluster_.reset(new SingleMachine(/* timeout_s = */ 10, 1, 1));
@@ -111,6 +113,8 @@ class AutoMixedPrecisionTest : public GrapplerTest {
 #if GOOGLE_CUDA
       device_properties.mutable_environment()->insert({"architecture", "7"});
       device_properties.mutable_environment()->insert({"cuda", "9010"});
+#else
+      device_properties.mutable_environment()->insert({"architecture", "gfx906"});
 #endif
       virtual_cluster_.reset(
           new VirtualCluster({{"/GPU:1", device_properties}}));
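
So for non-CUDA builds the fixture always falls back to a virtual cluster and fakes one gfx906 device, which HasFastFP16Support() accepts. A condensed sketch of that setup (the include paths and grappler namespace are assumptions from the TensorFlow source layout, not shown in this diff):

```cpp
#include <memory>

// Assumed header paths, based on the TensorFlow tree layout.
#include "tensorflow/core/grappler/clusters/virtual_cluster.h"
#include "tensorflow/core/protobuf/device_properties.pb.h"

// Build a single-GPU virtual cluster whose fake device advertises gfx906,
// so the optimizer's ROCm fast-FP16 check passes without real hardware.
std::unique_ptr<tensorflow::grappler::VirtualCluster> MakeFakeRocmCluster() {
  tensorflow::DeviceProperties device_properties;
  device_properties.set_type("GPU");
  device_properties.mutable_environment()->insert({"architecture", "gfx906"});
  return std::unique_ptr<tensorflow::grappler::VirtualCluster>(
      new tensorflow::grappler::VirtualCluster(
          {{"/GPU:1", device_properties}}));
}
```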
@@ -1035,6 +1039,15 @@ int GetCudaVersion(const Cluster& cluster) {
   return 0;
 }
 
+bool IsSupportedGPU(const Cluster& cluster) {
+#ifdef GOOGLE_CUDA
+  return GetCudaVersion(cluster) >= 9010;
+#else
+  return true;
+#endif
+}
+
 TEST_F(AutoMixedPrecisionTest, BatchMatMul) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output input = ops::Const(s.WithOpName("input"), 1.f / 33, {64, 32, 32});
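
The final hunk swaps the raw CUDA-version check for the new helper. For reading its expectations: the `+ 2` presumably counts the pair of Cast nodes the rewrite inserts around the allow-listed op (see the kCastToFp16/kCastToFp32 names at the top of this diff), and DT_HALF on allow1's T attribute confirms the op itself was switched to float16. Conceptually (pseudocode node names, not the real graph):

```cpp
// before: input(float) ----------------------> allow1 = BatchMatMul(float)
// after:  input(float) -> CastToFp16 --------> allow1 = BatchMatMul(half)
//                                               \-> CastToFp32 -> fetch
//
// Two Cast nodes added, hence EXPECT_EQ(..., item.graph.node_size() + 2).
```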
@@ -1054,7 +1067,7 @@ TEST_F(AutoMixedPrecisionTest, BatchMatMul) {
   GraphView output_view(&output);
   EXPECT_EQ(output_view.GetNode("input")->attr().at("dtype").type(), DT_FLOAT);
-  if (GetCudaVersion(*virtual_cluster_.get()) >= 9010) {
+  if (IsSupportedGPU(*virtual_cluster_.get())) {
     EXPECT_EQ(output.node_size(), item.graph.node_size() + 2);
     EXPECT_EQ(output_view.GetNode("allow1")->attr().at("T").type(), DT_HALF);
   } else {