mirror of https://github.com/zebrajr/tensorflow.git
synced 2025-12-06 12:20:11 +01:00
Merge pull request #48447 from ROCmSoftwarePlatform/google-upstream-rocm-amp-r2.5
[r2.5 port][ROCm] Port PR#47650 and PR#48441 to r2.5
commit f09d694a57
@@ -56,6 +56,50 @@ const char kCastToFp16[] = "CastToFp16";
 const char kCastToBf16[] = "CastToBf16";
 const char kCastToFp32[] = "CastToFp32";
 
+// Returns the GPU architecture (compute capability) as a (major, minor) pair.
+std::pair<int, int> GetDeviceGPUArch(
+    const DeviceProperties& device_properties) {
+  if (device_properties.type() != "GPU") return {0, 0};
+  string arch_str = device_properties.environment().at("architecture");
+  std::vector<string> split_arch_str = str_util::Split(arch_str, '.');
+  if (split_arch_str.empty()) {
+    return {0, 0};
+  }
+
+  int major, minor;
+  if (!strings::safe_strto32(split_arch_str[0], &major)) {
+    return {0, 0};
+  }
+
+  if (split_arch_str.size() > 1) {
+    if (strings::safe_strto32(split_arch_str[1], &minor)) {
+      return {major, minor};
+    } else {
+      return {0, 0};
+    }
+  } else {
+    return {major, 0};
+  }
+}
+
+// Returns true if the device supports fast FP16 computation. For CUDA, the
+// GPU architecture reported by GetDeviceGPUArch must be >= kMinGPUArch. For
+// AMD, the gfx arch string of the detected GPU must be in the list of
+// FP16-capable devices. Returns false otherwise.
+bool HasFastFP16Support(const DeviceProperties& props) {
+#if GOOGLE_CUDA
+  return GetDeviceGPUArch(props) >= kMinGPUArch;
+#elif TENSORFLOW_USE_ROCM
+  absl::flat_hash_set<std::string> FP16SupportedDevices = {{"gfx906"},
+                                                           {"gfx908"}};
+  std::string gcnArchName = props.environment().at("architecture");
+  std::vector<std::string> gpu_arch = absl::StrSplit(gcnArchName, ":");
+  return !gpu_arch.empty() && FP16SupportedDevices.contains(gpu_arch[0]);
+#endif
+  return false;
+}
+
 // Instances of this class represent unique type attribute identifiers within a
 // node. It handles regular type attributes, list type attributes (where
 // type_index is set to the index in the type list), and fixed types.
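The parsing behavior above is worth seeing in isolation: a CUDA device reports its compute capability as a dotted numeric string, while a ROCm device reports a gfx name that fails the numeric parse and falls back to {0, 0}, which is why HasFastFP16Support handles ROCm on a separate path. Below is a minimal standalone sketch (plain C++, no TensorFlow headers); Split and SafeStrTo32 are hypothetical stand-ins for str_util::Split and strings::safe_strto32, and std::stoi only approximates safe_strto32's strictness.

#include <cstdio>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

// Splits "7.5" into {"7", "5"}; stand-in for str_util::Split.
static std::vector<std::string> Split(const std::string& s, char delim) {
  std::vector<std::string> parts;
  std::stringstream ss(s);
  std::string part;
  while (std::getline(ss, part, delim)) parts.push_back(part);
  return parts;
}

// Base-10 parse that must consume the whole token; stand-in for
// strings::safe_strto32.
static bool SafeStrTo32(const std::string& s, int* out) {
  if (s.empty()) return false;
  size_t pos = 0;
  try {
    *out = std::stoi(s, &pos);
  } catch (...) {
    return false;
  }
  return pos == s.size();
}

// Mirrors the control flow of GetDeviceGPUArch in the diff.
static std::pair<int, int> ParseArch(const std::string& arch_str) {
  std::vector<std::string> split = Split(arch_str, '.');
  if (split.empty()) return {0, 0};
  int major = 0, minor = 0;
  if (!SafeStrTo32(split[0], &major)) return {0, 0};
  if (split.size() > 1 && !SafeStrTo32(split[1], &minor)) return {0, 0};
  return {major, minor};
}

int main() {
  // CUDA reports compute capability, e.g. "7.0" -> {7, 0}.
  auto cc = ParseArch("7.0");
  std::printf("7.0 -> {%d, %d}\n", cc.first, cc.second);
  // ROCm reports a gfx string, e.g. "gfx906", which fails the numeric parse
  // and yields {0, 0} -- hence the separate ROCm path in HasFastFP16Support.
  auto gfx = ParseArch("gfx906");
  std::printf("gfx906 -> {%d, %d}\n", gfx.first, gfx.second);
  return 0;
}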
@@ -1133,34 +1177,8 @@ bool AutoMixedPrecisionImpl::IsOnDevice(const NodeDef& node,
   return false;
 }
 
-// Returns the GPU architecture (compute capability) as a (major, minor) pair.
-std::pair<int, int> GetDeviceGPUArch(
-    const DeviceProperties& device_properties) {
-  if (device_properties.type() != "GPU") return {0, 0};
-  string arch_str = device_properties.environment().at("architecture");
-  std::vector<string> split_arch_str = str_util::Split(arch_str, '.');
-  if (split_arch_str.empty()) {
-    return {0, 0};
-  }
-
-  int major, minor;
-  if (!strings::safe_strto32(split_arch_str[0], &major)) {
-    return {0, 0};
-  }
-
-  if (split_arch_str.size() > 1) {
-    if (strings::safe_strto32(split_arch_str[1], &minor)) {
-      return {major, minor};
-    } else {
-      return {0, 0};
-    }
-  } else {
-    return {major, 0};
-  }
-}
-
 bool AutoMixedPrecisionImpl::IsOnSuitableGPUArch(const NodeDef& node) const {
-  return GetDeviceGPUArch(virtual_placer_.get_device(node)) >= kMinGPUArch;
+  return HasFastFP16Support(virtual_placer_.get_device(node));
 }
 
 bool AutoMixedPrecisionImpl::ShouldProcess(const NodeDef& node) const {
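The `>=` in the removed line (and in the CUDA path of HasFastFP16Support) relies on std::pair's lexicographic ordering: major is compared first, minor breaks ties. A quick self-contained check, using kMinGPUArch = {7, 0} as declared later in the test file:

#include <cassert>
#include <utility>

int main() {
  const std::pair<int, int> kMinGPUArch = {7, 0};   // value from the diff
  assert((std::make_pair(7, 0) >= kMinGPUArch));    // Volta: eligible
  assert((std::make_pair(8, 6) >= kMinGPUArch));    // Ampere: eligible
  assert(!(std::make_pair(6, 1) >= kMinGPUArch));   // Pascal: not eligible
  assert(!(std::make_pair(0, 0) >= kMinGPUArch));   // unparsable arch: not eligible
  return 0;
}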
@@ -1964,14 +1982,13 @@ Status AutoMixedPrecisionImpl::ChangeTypeAttrsAndAddCasts(
   return Status::OK();
 }
 
-int GetNumGPUs(const Cluster& cluster,
-               const std::pair<int, int>& min_arch = {0, 0}) {
+int GetNumGPUs(const Cluster& cluster) {
   auto devices = cluster.GetDevices();
   int num_gpus = 0;
   for (const auto& device : devices) {
     const DeviceProperties& device_properties = device.second;
-    std::pair<int, int> arch = GetDeviceGPUArch(device_properties);
-    if (device_properties.type() == "GPU" && arch >= min_arch) {
+    if (device_properties.type() == "GPU" &&
+        (ShouldIgnorePerformance() || HasFastFP16Support(device_properties))) {
       num_gpus++;
     }
   }
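The new GetNumGPUs folds the eligibility test into the loop: a device counts if it is a GPU and either the performance check is being ignored or it has fast FP16 support. A minimal sketch of the same logic over a hypothetical device map (Device stands in for DeviceProperties):

#include <algorithm>
#include <map>
#include <string>

// Hypothetical stand-in for DeviceProperties.
struct Device {
  std::string type;
  bool fast_fp16;
};

// Mirrors the counting logic of the new GetNumGPUs.
int CountEligibleGPUs(const std::map<std::string, Device>& devices,
                      bool ignore_performance) {
  return std::count_if(devices.begin(), devices.end(), [&](const auto& kv) {
    const Device& d = kv.second;
    return d.type == "GPU" && (ignore_performance || d.fast_fp16);
  });
}

int main() {
  std::map<std::string, Device> devices = {{"/GPU:0", {"GPU", true}},
                                           {"/GPU:1", {"GPU", false}},
                                           {"/CPU:0", {"CPU", false}}};
  // Counts 1 normally, 2 when the performance check is ignored.
  return CountEligibleGPUs(devices, false) == 1 &&
                 CountEligibleGPUs(devices, true) == 2
             ? 0
             : 1;
}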
@@ -2001,8 +2018,7 @@ Status AutoMixedPrecision::Optimize(Cluster* cluster, const GrapplerItem& item,
   // Start by copying input graph to output.
   *output = item.graph;
 
-  int num_gpus = ShouldIgnorePerformance() ? GetNumGPUs(*cluster)
-                                           : GetNumGPUs(*cluster, kMinGPUArch);
+  int num_gpus = GetNumGPUs(*cluster);
   if (num_gpus < 1 && mode_ == AutoMixedPrecisionMode::CUDA) {
     // AutoMixedPrecision is currently only tuned for GPU.
     LOG(WARNING) << "No (suitable) GPUs detected, skipping " << name()
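ShouldIgnorePerformance() is the escape hatch that admits GPUs without fast FP16 support. A plausible minimal sketch of such a check, reading an environment variable once and caching the result; the variable name and the use of plain std::getenv are assumptions here (TensorFlow has its own env-var helpers):

#include <cstdlib>

// Sketch of an env-var-driven override, cached on first use. The variable
// name below is an assumption for illustration, not a confirmed API.
static bool ShouldIgnorePerformanceSketch() {
  static const bool ignore = [] {
    const char* v = std::getenv(
        "TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_IGNORE_PERFORMANCE");
    return v != nullptr && v[0] != '\0' && v[0] != '0';
  }();
  return ignore;
}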
@@ -129,8 +129,12 @@ class AutoMixedPrecisionListsCuda : public AutoMixedPrecisionLists {
         "LSTMBlockCellGrad",
         "MatMul",
     };
+#if TENSORFLOW_USE_ROCM
+    if (true) {
+#else
     if (cuda_version_ >= 9010) {
       // Fp16 BatchMatMul is slow before CUDA 9.1.
+#endif
       list.insert("BatchMatMul");
       list.insert("BatchMatMulV2");
     }
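The preprocessor guard selects which `if` header the compiler sees, while both builds share the body and the single closing brace, so the block stays balanced either way. A self-contained sketch of the same pattern (cuda_version_ is a member of the real class; here it is a local placeholder):

#include <cstdio>
#include <set>
#include <string>

int main() {
  std::set<std::string> list;
  int cuda_version_ = 9010;  // placeholder for the real class member
#if TENSORFLOW_USE_ROCM
  if (true) {                   // ROCm: BatchMatMul is always enabled
#else
  if (cuda_version_ >= 9010) {  // CUDA: Fp16 BatchMatMul is slow before 9.1
#endif
    list.insert("BatchMatMul");
    list.insert("BatchMatMulV2");
  }
  std::printf("%zu ops allowed\n", list.size());
  return 0;
}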
@@ -85,10 +85,10 @@ void VerifyGraphsEquivalent(const GraphDef& original_graph,
   }
 }
 
-// Currently, this test suite only passes when TensorFlow passes with CUDA,
+// Currently, this test suite only passes when TensorFlow passes with CUDA/HIP,
 // because otherwise the optimizer will not turn clearlist nodes to float16.
 // When looking at clearlist nodes, this optimizer checks if the nodes have a
-// float16 GPU OpKernel, but without CUDA there are no GPU OpKernels at all.
+// float16 GPU OpKernel, but without CUDA/HIP there are no GPU OpKernels at all.
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 const std::pair<int, int> kMinGPUArch = {7, 0};
@@ -102,6 +102,8 @@ class AutoMixedPrecisionTest : public GrapplerTest {
+#if GOOGLE_CUDA
     gpu_available_ =
         gpu_available_ && (num_gpus == GetNumAvailableGPUs(kMinGPUArch));
+#else
+    gpu_available_ = false;
+#endif
     if (gpu_available_) {
       virtual_cluster_.reset(new SingleMachine(/* timeout_s = */ 10, 1, 1));
@@ -111,6 +113,8 @@ class AutoMixedPrecisionTest : public GrapplerTest {
+#if GOOGLE_CUDA
     device_properties.mutable_environment()->insert({"architecture", "7"});
     device_properties.mutable_environment()->insert({"cuda", "9010"});
+#else
+    device_properties.mutable_environment()->insert({"architecture", "gfx906"});
+#endif
     virtual_cluster_.reset(
         new VirtualCluster({{"/GPU:1", device_properties}}));
@@ -1035,6 +1039,15 @@ int GetCudaVersion(const Cluster& cluster) {
   return 0;
 }
 
+bool IsSupportedGPU(const Cluster& cluster) {
+#ifdef GOOGLE_CUDA
+  return GetCudaVersion(cluster) >= 9010;
+#else
+  return true;
+#endif
+}
+
+
 TEST_F(AutoMixedPrecisionTest, BatchMatMul) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output input = ops::Const(s.WithOpName("input"), 1.f / 33, {64, 32, 32});
@@ -1054,7 +1067,7 @@ TEST_F(AutoMixedPrecisionTest, BatchMatMul) {
 
   GraphView output_view(&output);
   EXPECT_EQ(output_view.GetNode("input")->attr().at("dtype").type(), DT_FLOAT);
-  if (GetCudaVersion(*virtual_cluster_.get()) >= 9010) {
+  if (IsSupportedGPU(*virtual_cluster_.get())) {
     EXPECT_EQ(output.node_size(), item.graph.node_size() + 2);
     EXPECT_EQ(output_view.GetNode("allow1")->attr().at("T").type(), DT_HALF);
   } else {