mirror of
https://github.com/zebrajr/tensorflow.git
synced 2025-12-06 12:20:11 +01:00
Merge pull request #48447 from ROCmSoftwarePlatform/google-upstream-rocm-amp-r2.5
[r2.5 port][ROCm] Port PR#47650 and PR#48441 to r2.5
This commit is contained in:
commit
f09d694a57
|
|
@ -56,6 +56,50 @@ const char kCastToFp16[] = "CastToFp16";
|
||||||
const char kCastToBf16[] = "CastToBf16";
|
const char kCastToBf16[] = "CastToBf16";
|
||||||
const char kCastToFp32[] = "CastToFp32";
|
const char kCastToFp32[] = "CastToFp32";
|
||||||
|
|
||||||
|
// Returns the GPU architecture (compute capability) as a (major, minor) pair.
|
||||||
|
std::pair<int, int> GetDeviceGPUArch(
|
||||||
|
const DeviceProperties& device_properties) {
|
||||||
|
if (device_properties.type() != "GPU") return {0, 0};
|
||||||
|
string arch_str = device_properties.environment().at("architecture");
|
||||||
|
std::vector<string> split_arch_str = str_util::Split(arch_str, '.');
|
||||||
|
if (split_arch_str.empty()) {
|
||||||
|
return {0, 0};
|
||||||
|
}
|
||||||
|
|
||||||
|
int major, minor;
|
||||||
|
if (!strings::safe_strto32(split_arch_str[0], &major)) {
|
||||||
|
return {0, 0};
|
||||||
|
}
|
||||||
|
|
||||||
|
if (split_arch_str.size() > 1) {
|
||||||
|
if (strings::safe_strto32(split_arch_str[1], &minor)) {
|
||||||
|
return {major, minor};
|
||||||
|
} else {
|
||||||
|
return {0, 0};
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
return {major, 0};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns true if the device has fast FP16 support.
|
||||||
|
// For CUDA, we compare the GPU arch with kMinGPUArch; if the arch is >= min,
|
||||||
|
// return true. For AMD, return true if the gfx arch string of the detected AMD
|
||||||
|
// GPU is in the list for FP16 supported compute. Returns false otherwise.
|
||||||
|
|
||||||
|
bool HasFastFP16Support(const DeviceProperties& props) {
|
||||||
|
#if GOOGLE_CUDA
|
||||||
|
return GetDeviceGPUArch(props) >= kMinGPUArch;
|
||||||
|
#elif TENSORFLOW_USE_ROCM
|
||||||
|
absl::flat_hash_set<std::string> FP16SupportedDevices = {{"gfx906"},
|
||||||
|
{"gfx908"}};
|
||||||
|
std::string gcnArchName = props.environment().at("architecture");
|
||||||
|
std::vector<std::string> gpu_arch = absl::StrSplit(gcnArchName, ":");
|
||||||
|
return !gpu_arch.empty() && FP16SupportedDevices.contains(gpu_arch[0]);
|
||||||
|
#endif
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
// Instances of this class represent unique type attribute identifiers within a
|
// Instances of this class represent unique type attribute identifiers within a
|
||||||
// node. It handles regular type attributes, list type attributes (where
|
// node. It handles regular type attributes, list type attributes (where
|
||||||
// type_index is set to the index in the type list), and fixed types.
|
// type_index is set to the index in the type list), and fixed types.
|
||||||
|
|
@ -1133,34 +1177,8 @@ bool AutoMixedPrecisionImpl::IsOnDevice(const NodeDef& node,
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Returns the GPU architecture (compute capability) as a (major, minor) pair.
|
|
||||||
std::pair<int, int> GetDeviceGPUArch(
|
|
||||||
const DeviceProperties& device_properties) {
|
|
||||||
if (device_properties.type() != "GPU") return {0, 0};
|
|
||||||
string arch_str = device_properties.environment().at("architecture");
|
|
||||||
std::vector<string> split_arch_str = str_util::Split(arch_str, '.');
|
|
||||||
if (split_arch_str.empty()) {
|
|
||||||
return {0, 0};
|
|
||||||
}
|
|
||||||
|
|
||||||
int major, minor;
|
|
||||||
if (!strings::safe_strto32(split_arch_str[0], &major)) {
|
|
||||||
return {0, 0};
|
|
||||||
}
|
|
||||||
|
|
||||||
if (split_arch_str.size() > 1) {
|
|
||||||
if (strings::safe_strto32(split_arch_str[1], &minor)) {
|
|
||||||
return {major, minor};
|
|
||||||
} else {
|
|
||||||
return {0, 0};
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
return {major, 0};
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
bool AutoMixedPrecisionImpl::IsOnSuitableGPUArch(const NodeDef& node) const {
|
bool AutoMixedPrecisionImpl::IsOnSuitableGPUArch(const NodeDef& node) const {
|
||||||
return GetDeviceGPUArch(virtual_placer_.get_device(node)) >= kMinGPUArch;
|
return HasFastFP16Support(virtual_placer_.get_device(node));
|
||||||
}
|
}
|
||||||
|
|
||||||
bool AutoMixedPrecisionImpl::ShouldProcess(const NodeDef& node) const {
|
bool AutoMixedPrecisionImpl::ShouldProcess(const NodeDef& node) const {
|
||||||
|
|
@ -1964,14 +1982,13 @@ Status AutoMixedPrecisionImpl::ChangeTypeAttrsAndAddCasts(
|
||||||
return Status::OK();
|
return Status::OK();
|
||||||
}
|
}
|
||||||
|
|
||||||
int GetNumGPUs(const Cluster& cluster,
|
int GetNumGPUs(const Cluster& cluster) {
|
||||||
const std::pair<int, int>& min_arch = {0, 0}) {
|
|
||||||
auto devices = cluster.GetDevices();
|
auto devices = cluster.GetDevices();
|
||||||
int num_gpus = 0;
|
int num_gpus = 0;
|
||||||
for (const auto& device : devices) {
|
for (const auto& device : devices) {
|
||||||
const DeviceProperties& device_properties = device.second;
|
const DeviceProperties& device_properties = device.second;
|
||||||
std::pair<int, int> arch = GetDeviceGPUArch(device_properties);
|
if (device_properties.type() == "GPU" &&
|
||||||
if (device_properties.type() == "GPU" && arch >= min_arch) {
|
(ShouldIgnorePerformance() || HasFastFP16Support(device_properties))) {
|
||||||
num_gpus++;
|
num_gpus++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -2001,8 +2018,7 @@ Status AutoMixedPrecision::Optimize(Cluster* cluster, const GrapplerItem& item,
|
||||||
// Start by copying input graph to output.
|
// Start by copying input graph to output.
|
||||||
*output = item.graph;
|
*output = item.graph;
|
||||||
|
|
||||||
int num_gpus = ShouldIgnorePerformance() ? GetNumGPUs(*cluster)
|
int num_gpus = GetNumGPUs(*cluster);
|
||||||
: GetNumGPUs(*cluster, kMinGPUArch);
|
|
||||||
if (num_gpus < 1 && mode_ == AutoMixedPrecisionMode::CUDA) {
|
if (num_gpus < 1 && mode_ == AutoMixedPrecisionMode::CUDA) {
|
||||||
// AutoMixedPrecision is currently only tuned for GPU.
|
// AutoMixedPrecision is currently only tuned for GPU.
|
||||||
LOG(WARNING) << "No (suitable) GPUs detected, skipping " << name()
|
LOG(WARNING) << "No (suitable) GPUs detected, skipping " << name()
|
||||||
|
|
|
||||||
|
|
@ -129,8 +129,12 @@ class AutoMixedPrecisionListsCuda : public AutoMixedPrecisionLists {
|
||||||
"LSTMBlockCellGrad",
|
"LSTMBlockCellGrad",
|
||||||
"MatMul",
|
"MatMul",
|
||||||
};
|
};
|
||||||
|
#if TENSORFLOW_USE_ROCM
|
||||||
|
if (true) {
|
||||||
|
#else
|
||||||
if (cuda_version_ >= 9010) {
|
if (cuda_version_ >= 9010) {
|
||||||
// Fp16 BatchMatMul is slow before CUDA 9.1.
|
// Fp16 BatchMatMul is slow before CUDA 9.1.
|
||||||
|
#endif
|
||||||
list.insert("BatchMatMul");
|
list.insert("BatchMatMul");
|
||||||
list.insert("BatchMatMulV2");
|
list.insert("BatchMatMulV2");
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -85,10 +85,10 @@ void VerifyGraphsEquivalent(const GraphDef& original_graph,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Currently, this test suite only passes when TensorFlow is built with CUDA,
|
// Currently, this test suite only passes when TensorFlow is built with CUDA/HIP,
|
||||||
// because otherwise the optimizer will not turn clearlist nodes to float16.
|
// because otherwise the optimizer will not turn clearlist nodes to float16.
|
||||||
// When looking at clearlist nodes, this optimizer checks if the nodes have a
|
// When looking at clearlist nodes, this optimizer checks if the nodes have a
|
||||||
// float16 GPU OpKernel, but without CUDA there are no GPU OpKernels at all.
|
// float16 GPU OpKernel, but without CUDA/HIP there are no GPU OpKernels at all.
|
||||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||||
|
|
||||||
const std::pair<int, int> kMinGPUArch = {7, 0};
|
const std::pair<int, int> kMinGPUArch = {7, 0};
|
||||||
|
|
@ -102,6 +102,8 @@ class AutoMixedPrecisionTest : public GrapplerTest {
|
||||||
#if GOOGLE_CUDA
|
#if GOOGLE_CUDA
|
||||||
gpu_available_ =
|
gpu_available_ =
|
||||||
gpu_available_ && (num_gpus == GetNumAvailableGPUs(kMinGPUArch));
|
gpu_available_ && (num_gpus == GetNumAvailableGPUs(kMinGPUArch));
|
||||||
|
#else
|
||||||
|
gpu_available_ = false;
|
||||||
#endif
|
#endif
|
||||||
if (gpu_available_) {
|
if (gpu_available_) {
|
||||||
virtual_cluster_.reset(new SingleMachine(/* timeout_s = */ 10, 1, 1));
|
virtual_cluster_.reset(new SingleMachine(/* timeout_s = */ 10, 1, 1));
|
||||||
|
|
@ -111,6 +113,8 @@ class AutoMixedPrecisionTest : public GrapplerTest {
|
||||||
#if GOOGLE_CUDA
|
#if GOOGLE_CUDA
|
||||||
device_properties.mutable_environment()->insert({"architecture", "7"});
|
device_properties.mutable_environment()->insert({"architecture", "7"});
|
||||||
device_properties.mutable_environment()->insert({"cuda", "9010"});
|
device_properties.mutable_environment()->insert({"cuda", "9010"});
|
||||||
|
#else
|
||||||
|
device_properties.mutable_environment()->insert({"architecture", "gfx906"});
|
||||||
#endif
|
#endif
|
||||||
virtual_cluster_.reset(
|
virtual_cluster_.reset(
|
||||||
new VirtualCluster({{"/GPU:1", device_properties}}));
|
new VirtualCluster({{"/GPU:1", device_properties}}));
|
||||||
|
|
@ -1035,6 +1039,15 @@ int GetCudaVersion(const Cluster& cluster) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool IsSupportedGPU(const Cluster& cluster) {
|
||||||
|
#ifdef GOOGLE_CUDA
|
||||||
|
return GetCudaVersion(cluster) >= 9010;
|
||||||
|
#else
|
||||||
|
return true;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
TEST_F(AutoMixedPrecisionTest, BatchMatMul) {
|
TEST_F(AutoMixedPrecisionTest, BatchMatMul) {
|
||||||
tensorflow::Scope s = tensorflow::Scope::NewRootScope();
|
tensorflow::Scope s = tensorflow::Scope::NewRootScope();
|
||||||
Output input = ops::Const(s.WithOpName("input"), 1.f / 33, {64, 32, 32});
|
Output input = ops::Const(s.WithOpName("input"), 1.f / 33, {64, 32, 32});
|
||||||
|
|
@ -1054,7 +1067,7 @@ TEST_F(AutoMixedPrecisionTest, BatchMatMul) {
|
||||||
|
|
||||||
GraphView output_view(&output);
|
GraphView output_view(&output);
|
||||||
EXPECT_EQ(output_view.GetNode("input")->attr().at("dtype").type(), DT_FLOAT);
|
EXPECT_EQ(output_view.GetNode("input")->attr().at("dtype").type(), DT_FLOAT);
|
||||||
if (GetCudaVersion(*virtual_cluster_.get()) >= 9010) {
|
if (IsSupportedGPU(*virtual_cluster_.get())) {
|
||||||
EXPECT_EQ(output.node_size(), item.graph.node_size() + 2);
|
EXPECT_EQ(output.node_size(), item.graph.node_size() + 2);
|
||||||
EXPECT_EQ(output_view.GetNode("allow1")->attr().at("T").type(), DT_HALF);
|
EXPECT_EQ(output_view.GetNode("allow1")->attr().at("T").type(), DT_HALF);
|
||||||
} else {
|
} else {
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user