add pcie_bandwidth field to DeviceDescription

PiperOrigin-RevId: 824738638
This commit is contained in:
Maxim Ermilov 2025-10-27 17:02:13 -07:00 committed by TensorFlower Gardener
parent 769acdd784
commit 699879f5f3
5 changed files with 66 additions and 21 deletions

View File

@ -112,9 +112,6 @@ struct CudaBandwidthSettings {
static constexpr double kSm80NvlinkBandwidth = 20.0;
static constexpr double kSm90NvlinkBandwidth = 20.0;
// PCIE bandwidth for PCI Gen3 x16
static constexpr double kPciBandwidth = 12.0;
// Discount factor for ring algorithm
static constexpr double kRingAlgorithmDiscountFactor = 0.92;
@ -211,9 +208,6 @@ struct RocmBandwidthSettings {
static constexpr double kMi200InfinityFabricBandwidth = 75.0;
static constexpr double kMi300InfinityFabricBandwidth = 112.0;
// PCIe bandwidth for PCI Gen4 x16 (approximate)
static constexpr double kPciBandwidth = 32.0;
// Discount factor for ring algorithm (based on ROCm NCCL implementation)
static constexpr double kRingAlgorithmDiscountFactor = 0.90;
@ -321,10 +315,11 @@ absl::Duration ComputeAllreduceTimeImpl(
std::max(num_devices, GetMinNumberOfChannels(CollectiveAlgo::RING));
int64_t num_channels =
std::max(min_nchannels, GetNcclMaxNumChannels(CollectiveAlgo::RING));
int default_threads =
(bw_intra_node * num_channels <= bandwidth_settings.kPciBandwidth)
? 256
: bandwidth_settings.kLL128NumThreads;
int64_t pcie_bandwidth_gbps =
gpu_device_info.pcie_bandwidth() / 1024 / 1024 / 1024;
int default_threads = (bw_intra_node * num_channels <= pcie_bandwidth_gbps)
? 256
: bandwidth_settings.kLL128NumThreads;
int warp_size = gpu_device_info.threads_per_warp();
int num_threads =

View File

@ -47,6 +47,7 @@ limitations under the License.
#include "third_party/gpus/cuda/include/cuda.h"
#include "third_party/gpus/cuda/include/cuda_runtime_api.h"
#include "third_party/gpus/cuda/include/driver_types.h"
#include "third_party/gpus/cuda/nvml/include/nvml.h"
#include "xla/backends/gpu/collectives/gpu_collectives.h"
#include "xla/core/collectives/collectives.h"
#include "xla/core/collectives/collectives_registry.h"
@ -690,6 +691,39 @@ absl::StatusOr<CUmulticastObjectProp> CreateMulticastObjectProperties(
return multicast_properties;
}
// Estimates the PCIe bandwidth, in bytes per second, of the device that NVML
// reports at index `device_ordinal`, based on the link generation and link
// width NVML currently reports for it.
//
// Returns an internal error naming the failing NVML call (and its numeric
// return code) if the device handle or either link property cannot be queried.
//
// NOTE(review): `device_ordinal` is passed straight to
// nvmlDeviceGetHandleByIndex; this presumes NVML and CUDA enumerate devices in
// the same order (may not hold when CUDA_VISIBLE_DEVICES is set) -- confirm.
absl::StatusOr<int64_t> GetDevicePcieBandwidth(int device_ordinal) {
  nvmlDevice_t nvml_device;
  nvmlReturn_t result =
      nvmlDeviceGetHandleByIndex(device_ordinal, &nvml_device);
  if (result != NVML_SUCCESS) {
    return absl::InternalError(
        absl::StrCat("nvmlDeviceGetHandleByIndex failed with ", result));
  }

  // nvmlDeviceGetPcieSpeed returns wrong information. Verified with
  // nvbandwidth.
  unsigned int link_gen = 0;
  unsigned int link_width = 0;
  result = nvmlDeviceGetCurrPcieLinkGeneration(nvml_device, &link_gen);
  if (result != NVML_SUCCESS) {
    return absl::InternalError(absl::StrCat(
        "nvmlDeviceGetCurrPcieLinkGeneration failed with ", result));
  }
  result = nvmlDeviceGetCurrPcieLinkWidth(nvml_device, &link_width);
  if (result != NVML_SUCCESS) {
    return absl::InternalError(
        absl::StrCat("nvmlDeviceGetCurrPcieLinkWidth failed with ", result));
  }

  // PCIe v1 single lane speed: 0.25 GB/s, expressed exactly in bytes/sec as
  // 256 * 1024 * 1024 so no floating-point conversion is involved.
  int64_t lane_speed = int64_t{256} * 1024 * 1024;
  // Each generation after the first is modeled as doubling the per-lane speed.
  // The counter is unsigned to match `link_gen` and avoid a mixed-sign compare.
  for (unsigned int gen = 1; gen < link_gen; ++gen) {
    lane_speed *= 2;
  }
  return lane_speed * static_cast<int64_t>(link_width);
}
} // namespace
// Given const GPU memory, returns a libcuda device pointer datatype, suitable
@ -1629,6 +1663,18 @@ CudaExecutor::CreateDeviceDescription(int device_ordinal) {
int64_t{mem_bus_width_bits.value()} / 8);
}
{
  // Record the PCIe bandwidth (bytes/sec) in the device description. If the
  // query fails, log the error and store a PCIe gen 3 x16 fallback so the
  // description does not keep its uninitialized default.
  constexpr int64_t kPcieGen3x16BytesPerSec = int64_t{16} * 1024 * 1024 * 1024;
  const absl::StatusOr<int64_t> status_or_bandwidth =
      GetDevicePcieBandwidth(device_ordinal);
  if (status_or_bandwidth.ok()) {
    desc.set_pcie_bandwidth(*status_or_bandwidth);
  } else {
    LOG(ERROR) << status_or_bandwidth.status().message()
               << " Assuming PCIe gen 3 x16 bandwidth.";
    // Previously the fallback was only written to the local StatusOr and then
    // discarded; it must be stored in `desc` to take effect.
    desc.set_pcie_bandwidth(kPcieGen3x16BytesPerSec);
  }
}
{
BlockDim block_dim_limit;
TF_RETURN_IF_ERROR(FillBlockDimLimit(device, &block_dim_limit));

View File

@ -64,6 +64,7 @@ TEST(CudaExecutorTest, CreateDeviceDescription) {
EXPECT_NE(result->driver_version(), kNullVersion);
EXPECT_NE(result->compile_time_toolkit_version(), kNullVersion);
EXPECT_GT(result->pcie_bandwidth(), 1024 * 1024);
EXPECT_THAT(result->platform_version(), Not(IsEmpty()));
EXPECT_THAT(result->name(), Not(IsEmpty()));
EXPECT_THAT(result->model_str(), Not(IsEmpty()));

View File

@ -154,34 +154,28 @@ class DeviceDescription {
// Returns the limit on the total number of threads that can be launched in a
// single block; i.e. the limit on x * y * z dimensions of a ThreadDim.
// This limit affects what constitutes a legitimate kernel launch request.
const int64_t& threads_per_block_limit() const {
return threads_per_block_limit_;
}
int64_t threads_per_block_limit() const { return threads_per_block_limit_; }
// Returns the limit on the total number of threads that can be simultaneously
// launched on a given multiprocessor.
const int64_t& threads_per_core_limit() const {
return threads_per_core_limit_;
}
int64_t threads_per_core_limit() const { return threads_per_core_limit_; }
// Returns the number of threads per warp/wavefront.
constexpr int64_t threads_per_warp() const { return threads_per_warp_; }
// Returns the limit on the total number of registers per core.
const int64_t& registers_per_core_limit() const {
return registers_per_core_limit_;
}
int64_t registers_per_core_limit() const { return registers_per_core_limit_; }
// Returns the limit on the total number of registers that can be
// simultaneously used by a block.
const int64_t& registers_per_block_limit() const {
int64_t registers_per_block_limit() const {
return registers_per_block_limit_;
}
// Returns the number of address bits available to kernel code running on the
// platform. This affects things like the maximum allocation size and perhaps
// types used in kernel code such as size_t.
const int64_t& device_address_bits() const { return device_address_bits_; }
int64_t device_address_bits() const { return device_address_bits_; }
// Returns the device memory size in bytes.
int64_t device_memory_size() const { return device_memory_size_; }
@ -194,6 +188,9 @@ class DeviceDescription {
// host and device.)
int64_t memory_bandwidth() const { return memory_bandwidth_; }
// Returns the PCIe memory bandwidth in bytes/sec.
int64_t pcie_bandwidth() const { return pcie_bandwidth_; }
// Returns the device's core clock rate in GHz.
float clock_rate_ghz() const { return clock_rate_ghz_; }
@ -340,6 +337,7 @@ class DeviceDescription {
void set_device_memory_size(int64_t value) { device_memory_size_ = value; }
void set_l2_cache_size(int64_t value) { l2_cache_size_ = value; }
void set_memory_bandwidth(int64_t value) { memory_bandwidth_ = value; }
void set_pcie_bandwidth(int64_t value) { pcie_bandwidth_ = value; }
void set_shared_memory_per_core(int64_t value) {
shared_memory_per_core_ = value;
@ -400,7 +398,9 @@ class DeviceDescription {
int64_t device_address_bits_ = kUninitialized<int64_t>;
int64_t device_memory_size_ = kUninitialized<int64_t>;
int64_t l2_cache_size_ = kUninitialized<int64_t>;
int64_t memory_bandwidth_ = kUninitialized<int64_t>;
int64_t pcie_bandwidth_ = kUninitialized<int64_t>;
// Shared memory limits on a given device.
int64_t shared_memory_per_core_ = kUninitialized<int64_t>;

View File

@ -1121,6 +1121,9 @@ RocmExecutor::CreateDeviceDescription(int device_ordinal) {
desc.set_l2_cache_size(prop.l2CacheSize);
}
// PCIe bandwidth for PCI Gen4 x16 (approximate)
desc.set_pcie_bandwidth(32LL * 1024 * 1024 * 1024);
{
auto ecc_enabled_or = IsEccEnabled(device);
if (!ecc_enabled_or.ok()) {