add pcie_bandwidth field to DeviceDescription

PiperOrigin-RevId: 824738638
2025-12-06 00:19:58 +01:00 · 2025-10-27 17:02:13 -07:00 · 2025-10-27 17:02:13 -07:00 · 699879f5f3
commit 699879f5f3
parent 769acdd784
5 changed files with 66 additions and 21 deletions
--- a/third_party/xla/xla/service/gpu/model/gpu_collective_performance_model.cc
+++ b/third_party/xla/xla/service/gpu/model/gpu_collective_performance_model.cc
@ -112,9 +112,6 @@ struct CudaBandwidthSettings {
  static constexpr double kSm80NvlinkBandwidth = 20.0;
  static constexpr double kSm90NvlinkBandwidth = 20.0;

-  // PCIE bandwidth for PCI Gen3 x16
-  static constexpr double kPciBandwidth = 12.0;
-
  // Discount factor for ring algorithm
  static constexpr double kRingAlgorithmDiscountFactor = 0.92;

@ -211,9 +208,6 @@ struct RocmBandwidthSettings {
  static constexpr double kMi200InfinityFabricBandwidth = 75.0;
  static constexpr double kMi300InfinityFabricBandwidth = 112.0;

-  // PCIe bandwidth for PCI Gen4 x16 (approximate)
-  static constexpr double kPciBandwidth = 32.0;
-
  // Discount factor for ring algorithm (based on ROCm NCCL implementation)
  static constexpr double kRingAlgorithmDiscountFactor = 0.90;

@ -321,10 +315,11 @@ absl::Duration ComputeAllreduceTimeImpl(
      std::max(num_devices, GetMinNumberOfChannels(CollectiveAlgo::RING));
  int64_t num_channels =
      std::max(min_nchannels, GetNcclMaxNumChannels(CollectiveAlgo::RING));
-  int default_threads =
-      (bw_intra_node * num_channels <= bandwidth_settings.kPciBandwidth)
-          ? 256
-          : bandwidth_settings.kLL128NumThreads;
+  // Bytes/sec -> GiB/s (2^30 bytes) as a double so narrow links don't round to 0.
+  double pcie_bandwidth_gbps = gpu_device_info.pcie_bandwidth() / 1073741824.0;
+  int default_threads = (bw_intra_node * num_channels <= pcie_bandwidth_gbps)
+                            ? 256
+                            : bandwidth_settings.kLL128NumThreads;

  int warp_size = gpu_device_info.threads_per_warp();
  int num_threads =
--- a/third_party/xla/xla/stream_executor/cuda/cuda_executor.cc
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_executor.cc
@ -47,6 +47,7 @@ limitations under the License.
 #include "third_party/gpus/cuda/include/cuda.h"
 #include "third_party/gpus/cuda/include/cuda_runtime_api.h"
 #include "third_party/gpus/cuda/include/driver_types.h"
+#include "third_party/gpus/cuda/nvml/include/nvml.h"
 #include "xla/backends/gpu/collectives/gpu_collectives.h"
 #include "xla/core/collectives/collectives.h"
 #include "xla/core/collectives/collectives_registry.h"
@ -690,6 +691,39 @@ absl::StatusOr<CUmulticastObjectProp> CreateMulticastObjectProperties(
  return multicast_properties;
 }

+// Returns the theoretical PCIe bandwidth in bytes/sec for the device's current
+// link generation and width, as reported by NVML. NOTE(review): NVML indices
+// may differ from CUDA ordinals (e.g. CUDA_VISIBLE_DEVICES) -- confirm mapping.
+absl::StatusOr<int64_t> GetDevicePcieBandwidth(int device_ordinal) {
+  nvmlDevice_t nvml_device;
+  nvmlReturn_t result =
+      nvmlDeviceGetHandleByIndex(device_ordinal, &nvml_device);
+  if (result != NVML_SUCCESS) {
+    return absl::InternalError(
+        absl::StrCat("nvmlDeviceGetHandleByIndex failed with ", result));
+  }
+
+  // nvmlDeviceGetPcieSpeed returns wrong information. Verified with
+  // nvbandwidth.
+  unsigned int link_gen = 0;
+  result = nvmlDeviceGetCurrPcieLinkGeneration(nvml_device, &link_gen);
+  if (result != NVML_SUCCESS) {
+    return absl::InternalError(absl::StrCat(
+        "nvmlDeviceGetCurrPcieLinkGeneration failed with ", result));
+  }
+  unsigned int link_width = 0;
+  result = nvmlDeviceGetCurrPcieLinkWidth(nvml_device, &link_width);
+  if (result != NVML_SUCCESS) {
+    return absl::InternalError(
+        absl::StrCat("nvmlDeviceGetCurrPcieLinkWidth failed with ", result));
+  }
+
+  // A PCIe gen 1 lane is ~0.25 GiB/s; each later generation doubles it.
+  int64_t lane_speed = int64_t{256} * 1024 * 1024;
+  for (unsigned int gen = 1; gen < link_gen; ++gen) lane_speed *= 2;
+  return lane_speed * link_width;
+}
+
 }  // namespace

 // Given const GPU memory, returns a libcuda device pointer datatype, suitable
@ -1629,6 +1663,18 @@ CudaExecutor::CreateDeviceDescription(int device_ordinal) {
                              int64_t{mem_bus_width_bits.value()} / 8);
  }

+  {
+    // Fall back to a nominal PCIe gen 3 x16 figure when NVML cannot be queried.
+    absl::StatusOr<int64_t> pcie_bandwidth =
+        GetDevicePcieBandwidth(device_ordinal);
+    if (!pcie_bandwidth.ok()) {
+      LOG(ERROR) << pcie_bandwidth.status().message()
+                 << " Assuming PCIe gen 3 x16 bandwidth.";
+      pcie_bandwidth = 16LL * 1024 * 1024 * 1024;
+    }
+    desc.set_pcie_bandwidth(*pcie_bandwidth);
+  }
+
  {
    BlockDim block_dim_limit;
    TF_RETURN_IF_ERROR(FillBlockDimLimit(device, &block_dim_limit));
--- a/third_party/xla/xla/stream_executor/cuda/cuda_executor_test.cc
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_executor_test.cc
@ -64,6 +64,7 @@ TEST(CudaExecutorTest, CreateDeviceDescription) {
  EXPECT_NE(result->driver_version(), kNullVersion);
  EXPECT_NE(result->compile_time_toolkit_version(), kNullVersion);

+  EXPECT_GT(result->pcie_bandwidth(), 1024 * 1024);
  EXPECT_THAT(result->platform_version(), Not(IsEmpty()));
  EXPECT_THAT(result->name(), Not(IsEmpty()));
  EXPECT_THAT(result->model_str(), Not(IsEmpty()));
--- a/third_party/xla/xla/stream_executor/device_description.h
+++ b/third_party/xla/xla/stream_executor/device_description.h
@ -154,34 +154,28 @@ class DeviceDescription {
  // Returns the limit on the total number of threads that can be launched in a
  // single block; i.e. the limit on x * y * z dimensions of a ThreadDim.
  // This limit affects what constitutes a legitimate kernel launch request.
-  const int64_t& threads_per_block_limit() const {
-    return threads_per_block_limit_;
-  }
+  int64_t threads_per_block_limit() const { return threads_per_block_limit_; }

  // Returns the limit on the total number of threads that can be simultaneously
  // launched on a given multiprocessor.
-  const int64_t& threads_per_core_limit() const {
-    return threads_per_core_limit_;
-  }
+  int64_t threads_per_core_limit() const { return threads_per_core_limit_; }

  // Returns the number of threads per warp/wavefront.
  constexpr int64_t threads_per_warp() const { return threads_per_warp_; }

  // Returns the limit on the total number of registers per core.
-  const int64_t& registers_per_core_limit() const {
-    return registers_per_core_limit_;
-  }
+  int64_t registers_per_core_limit() const { return registers_per_core_limit_; }

  // Returns the limit on the total number of registers that can be
  // simultaneously used by a block.
-  const int64_t& registers_per_block_limit() const {
+  int64_t registers_per_block_limit() const {
    return registers_per_block_limit_;
  }

  // Returns the number of address bits available to kernel code running on the
  // platform. This affects things like the maximum allocation size and perhaps
  // types used in kernel code such as size_t.
-  const int64_t& device_address_bits() const { return device_address_bits_; }
+  int64_t device_address_bits() const { return device_address_bits_; }

  // Returns the device memory size in bytes.
  int64_t device_memory_size() const { return device_memory_size_; }
@ -194,6 +188,9 @@ class DeviceDescription {
  // host and device.)
  int64_t memory_bandwidth() const { return memory_bandwidth_; }

+  // Returns the PCIe link bandwidth in bytes/sec.
+  int64_t pcie_bandwidth() const { return pcie_bandwidth_; }
+
  // Returns the device's core clock rate in GHz.
  float clock_rate_ghz() const { return clock_rate_ghz_; }

@ -340,6 +337,7 @@ class DeviceDescription {
  void set_device_memory_size(int64_t value) { device_memory_size_ = value; }
  void set_l2_cache_size(int64_t value) { l2_cache_size_ = value; }
  void set_memory_bandwidth(int64_t value) { memory_bandwidth_ = value; }
+  void set_pcie_bandwidth(int64_t value) { pcie_bandwidth_ = value; }

  void set_shared_memory_per_core(int64_t value) {
    shared_memory_per_core_ = value;
@ -400,7 +398,9 @@ class DeviceDescription {
  int64_t device_address_bits_ = kUninitialized<int64_t>;
  int64_t device_memory_size_ = kUninitialized<int64_t>;
  int64_t l2_cache_size_ = kUninitialized<int64_t>;
+
  int64_t memory_bandwidth_ = kUninitialized<int64_t>;
+  int64_t pcie_bandwidth_ = kUninitialized<int64_t>;

  // Shared memory limits on a given device.
  int64_t shared_memory_per_core_ = kUninitialized<int64_t>;
--- a/third_party/xla/xla/stream_executor/rocm/rocm_executor.cc
+++ b/third_party/xla/xla/stream_executor/rocm/rocm_executor.cc
@ -1121,6 +1121,9 @@ RocmExecutor::CreateDeviceDescription(int device_ordinal) {
    desc.set_l2_cache_size(prop.l2CacheSize);
  }

+  // PCIe bandwidth for PCI Gen4 x16 (approximate)
+  desc.set_pcie_bandwidth(32LL * 1024 * 1024 * 1024);
+
  {
    auto ecc_enabled_or = IsEccEnabled(device);
    if (!ecc_enabled_or.ok()) {