[XLA:GPU] Fix race condition in gpu_compiler.cc.

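We were racing on libdevice_dir_: GpuCompiler::Compile lazily initialized it
without holding mutex_. The libdevice directory is now looked up while holding
mutex_ and cached together with the cuda_data_dir it was computed for.

PiperOrigin-RevId: 174070334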
2025-12-06 12:20:11 +01:00 · 2017-10-31 11:59:56 -07:00 · 2017-10-31 11:59:56 -07:00 · b5d5326c62
commit b5d5326c62
parent 35939d2d37
2 changed files with 37 additions and 20 deletions
--- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
@@ -94,15 +94,13 @@ using tensorflow::strings::StrCat;
 // http://docs.nvidia.com/cuda/cuda-c-programming-guide/#device-memory-accesses
 constexpr int64 kMemoryAlignment = 256;

-// Returns the directory containing nvvm libdevice files. This function is
-// called in GpuCompiler's constructor, so can't return an error. But
-// GpuCompiler::Compile will return an error when the wanted libdevice file
-// doesn't exist in the folder this function returns.
-string GetLibdeviceDir(const HloModuleConfig& config) {
+// Returns the directory containing nvvm libdevice files.  config_cuda_data_dir
+// should be equal to config().debug_options().xla_gpu_cuda_data_dir() of the
+// HloModule being compiled.
+string GetLibdeviceDir(const string& config_cuda_data_dir) {
  std::vector<string> potential_libdevice_dirs;
-  const string datadir = config.debug_options().xla_gpu_cuda_data_dir();
-  if (!datadir.empty()) {
-    potential_libdevice_dirs.push_back(datadir);
+  if (!config_cuda_data_dir.empty()) {
+    potential_libdevice_dirs.push_back(config_cuda_data_dir);
  }
  potential_libdevice_dirs.push_back(tensorflow::LibdeviceRoot());

@@ -359,12 +357,26 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::Compile(
        /*optimized=*/false));
  }

-  // Reserve space for the PTX to be generated for this module.
  string* ptx;
+  string libdevice_dir;
  {
    tensorflow::mutex_lock lock(mutex_);
+
+    // Reserve space for the PTX to be generated for this module.
    generated_ptxes_.emplace_back(MakeUnique<string>());
    ptx = generated_ptxes_.back().get();
+
+    // Find the directory containing libdevice.  To avoid searching for it every
+    // time, we have a one-element cache, keyed on the module's config's
+    // cuda_data_dir.
+    const auto& config_cuda_data_dir =
+        module->config().debug_options().xla_gpu_cuda_data_dir();
+    if (cached_libdevice_dir_.empty() ||
+        cached_cuda_data_dir_ != config_cuda_data_dir) {
+      cached_cuda_data_dir_ = config_cuda_data_dir;
+      cached_libdevice_dir_ = GetLibdeviceDir(config_cuda_data_dir);
+    }
+    libdevice_dir = cached_libdevice_dir_;
  }
  int cc_major, cc_minor;
  if (!stream_exec->GetDeviceDescription().cuda_compute_capability(&cc_major,
@@ -374,12 +386,9 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::Compile(
    cc_major = 2;
    cc_minor = 0;
  }
-  if (libdevice_dir_.empty()) {
-    // Compute libdevice_dir_ just once and cache it in this member.
-    libdevice_dir_ = GetLibdeviceDir(module->config());
-  }
+
  TF_ASSIGN_OR_RETURN(*ptx, CompileToPtx(&llvm_module, {cc_major, cc_minor},
-                                         module->config(), libdevice_dir_));
+                                         module->config(), libdevice_dir));

  if (!ir_dump_directory.empty()) {
    TF_RETURN_IF_ERROR(llvm_ir::DumpIRToDirectory(
--- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.h
@@ -71,18 +71,26 @@ class GpuCompiler : public LLVMCompiler {
  static const char* kDataLayout;

 private:
-  // The parent directory of libdevice IR libraries.
-  string libdevice_dir_;
+  // The size in bytes of a pointer. Used by ShapeSizeBytesFunction.
+  const int64 pointer_size_;
+
+  tensorflow::mutex mutex_;
+
+  // When compiling an HLO module, we need to find a path to the nvvm libdevice
+  // files.  We search in the module's xla_gpu_cuda_data_dir debug option and
+  // in tensorflow::LibdeviceRoot(), the latter of which is a constant.
+  //
+  // We cache that xla_gpu_cuda_data_dir value and the result of our search, so
+  // that if the next module we have to compile has the same value, we can skip
+  // the search.
+  string cached_cuda_data_dir_ GUARDED_BY(mutex_);
+  string cached_libdevice_dir_ GUARDED_BY(mutex_);

  // The list of PTX strings generated by this GpuCompiler. We let GpuCompiler
  // to own them because they need to be alive across the life span of the
  // StreamExecutor (b/24776264).
-  tensorflow::mutex mutex_;
  std::vector<std::unique_ptr<string>> generated_ptxes_ GUARDED_BY(mutex_);

-  // The size in bytes of a pointer. Used by ShapeSizeBytesFunction.
-  int64 pointer_size_;
-
  TF_DISALLOW_COPY_AND_ASSIGN(GpuCompiler);
 };