add pcie_bandwidth field to DeviceDescription
PiperOrigin-RevId: 824738638
parent 769acdd784
commit 699879f5f3
@@ -112,9 +112,6 @@ struct CudaBandwidthSettings {
   static constexpr double kSm80NvlinkBandwidth = 20.0;
   static constexpr double kSm90NvlinkBandwidth = 20.0;
 
-  // PCIE bandwidth for PCI Gen3 x16
-  static constexpr double kPciBandwidth = 12.0;
-
   // Discount factor for ring algorithm
   static constexpr double kRingAlgorithmDiscountFactor = 0.92;
 
@@ -211,9 +208,6 @@ struct RocmBandwidthSettings {
   static constexpr double kMi200InfinityFabricBandwidth = 75.0;
   static constexpr double kMi300InfinityFabricBandwidth = 112.0;
 
-  // PCIe bandwidth for PCI Gen4 x16 (approximate)
-  static constexpr double kPciBandwidth = 32.0;
-
   // Discount factor for ring algorithm (based on ROCm NCCL implementation)
   static constexpr double kRingAlgorithmDiscountFactor = 0.90;
 
@@ -321,10 +315,11 @@ absl::Duration ComputeAllreduceTimeImpl(
       std::max(num_devices, GetMinNumberOfChannels(CollectiveAlgo::RING));
   int64_t num_channels =
       std::max(min_nchannels, GetNcclMaxNumChannels(CollectiveAlgo::RING));
-  int default_threads =
-      (bw_intra_node * num_channels <= bandwidth_settings.kPciBandwidth)
-          ? 256
-          : bandwidth_settings.kLL128NumThreads;
+  int64_t pcie_bandwidth_gbps =
+      gpu_device_info.pcie_bandwidth() / 1024 / 1024 / 1024;
+  int default_threads = (bw_intra_node * num_channels <= pcie_bandwidth_gbps)
+                            ? 256
+                            : bandwidth_settings.kLL128NumThreads;
 
   int warp_size = gpu_device_info.threads_per_warp();
   int num_threads =
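The cost-model hunk above replaces the hard-coded kPciBandwidth constant (GB/s) with the value queried from the device, which DeviceDescription stores in bytes/sec, so the comparison first divides down to GiB/s. Below is a minimal sketch of that selection logic; the LL128 thread count and the helper name are assumptions, not the actual XLA definitions.

// Illustrative sketch only; kLL128NumThreads' value and the helper name are
// assumptions, not the XLA code.
#include <cstdint>

constexpr int kLL128NumThreads = 640;  // assumed LL128 thread count

int PickDefaultThreads(double bw_intra_node_gbps, int64_t num_channels,
                       int64_t pcie_bandwidth_bytes_per_sec) {
  // DeviceDescription reports bytes/sec; the model compares in GiB/s.
  const int64_t pcie_bandwidth_gbps =
      pcie_bandwidth_bytes_per_sec / 1024 / 1024 / 1024;
  // 256 threads suffice while the aggregate per-channel bandwidth fits
  // within PCIe; otherwise fall back to the larger LL128 thread count.
  return (bw_intra_node_gbps * num_channels <= pcie_bandwidth_gbps)
             ? 256
             : kLL128NumThreads;
}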
@@ -47,6 +47,7 @@ limitations under the License.
 #include "third_party/gpus/cuda/include/cuda.h"
 #include "third_party/gpus/cuda/include/cuda_runtime_api.h"
 #include "third_party/gpus/cuda/include/driver_types.h"
+#include "third_party/gpus/cuda/nvml/include/nvml.h"
 #include "xla/backends/gpu/collectives/gpu_collectives.h"
 #include "xla/core/collectives/collectives.h"
 #include "xla/core/collectives/collectives_registry.h"
@@ -690,6 +691,39 @@ absl::StatusOr<CUmulticastObjectProp> CreateMulticastObjectProperties(
   return multicast_properties;
 }
 
+absl::StatusOr<int64_t> GetDevicePcieBandwidth(int device_ordinal) {
+  nvmlDevice_t nvml_device;
+  nvmlReturn_t result =
+      nvmlDeviceGetHandleByIndex(device_ordinal, &nvml_device);
+  if (result != NVML_SUCCESS) {
+    return absl::InternalError(
+        absl::StrCat("nvmlDeviceGetHandleByIndex failed with ", result));
+  }
+
+  // nvmlDeviceGetPcieSpeed returns wrong information. Verified with
+  // nvbandwidth.
+  unsigned int link_gen, link_width;
+  result = nvmlDeviceGetCurrPcieLinkGeneration(nvml_device, &link_gen);
+  if (result != NVML_SUCCESS) {
+    return absl::InternalError(absl::StrCat(
+        "nvmlDeviceGetCurrPcieLinkGeneration failed with ", result));
+  }
+
+  result = nvmlDeviceGetCurrPcieLinkWidth(nvml_device, &link_width);
+  if (result != NVML_SUCCESS) {
+    return absl::InternalError(
+        absl::StrCat("nvmlDeviceGetCurrPcieLinkWidth failed with ", result));
+  }
+
+  // PCIe v1 single lane speed. 0.25 GB/s
+  int64_t lane_speed = 0.25 * 1024 * 1024 * 1024;
+  for (int i = 1; i < link_gen; i++) {
+    lane_speed *= 2;
+  }
+
+  return lane_speed * link_width;
+}
+
 }  // namespace
 
 // Given const GPU memory, returns a libcuda device pointer datatype, suitable
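GetDevicePcieBandwidth derives bandwidth from the current link generation and width rather than from nvmlDeviceGetPcieSpeed: PCIe v1 carries roughly 0.25 GB/s per lane, and each later generation doubles that, so bandwidth is about 0.25 GB/s * 2^(gen - 1) * width. Below is a self-contained sketch of just that arithmetic, with the NVML queries replaced by fixed example inputs; the helper name and link values are illustrative.

// Standalone sketch of the per-lane formula above; no NVML involved.
#include <cstdint>
#include <cstdio>

int64_t PcieBandwidthBytesPerSec(unsigned int link_gen,
                                 unsigned int link_width) {
  int64_t lane_speed = 0.25 * 1024 * 1024 * 1024;  // PCIe v1: ~0.25 GB/s/lane
  for (unsigned int gen = 1; gen < link_gen; ++gen) {
    lane_speed *= 2;  // each generation roughly doubles per-lane throughput
  }
  return lane_speed * link_width;
}

int main() {
  // Gen3 x16 -> 16 GiB/s (the CUDA fallback further down);
  // Gen4 x16 -> 32 GiB/s (the ROCm default further down).
  std::printf("%lld\n",
              static_cast<long long>(PcieBandwidthBytesPerSec(3, 16)));
  std::printf("%lld\n",
              static_cast<long long>(PcieBandwidthBytesPerSec(4, 16)));
  return 0;
}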
@@ -1629,6 +1663,18 @@ CudaExecutor::CreateDeviceDescription(int device_ordinal) {
         int64_t{mem_bus_width_bits.value()} / 8);
   }
 
+  {
+    absl::StatusOr<int64_t> status_or_bandwidth =
+        GetDevicePcieBandwidth(device_ordinal);
+    if (status_or_bandwidth.ok()) {
+      desc.set_pcie_bandwidth(*status_or_bandwidth);
+    } else {
+      LOG(ERROR) << status_or_bandwidth.status().message()
+                 << " Assuming PCIe gen 3 x16 bandwidth.";
+      status_or_bandwidth = 16LL * 1024 * 1024 * 1024;
+    }
+  }
+
   {
     BlockDim block_dim_limit;
     TF_RETURN_IF_ERROR(FillBlockDimLimit(device, &block_dim_limit));
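When the NVML query fails, the executor logs the error and assumes a PCIe Gen3 x16 link; that 16 GiB/s figure agrees with the per-lane formula above (0.25 GB/s per lane, doubled twice, times 16 lanes). A compile-time sanity check of that arithmetic, purely illustrative and not part of the commit:

// Illustrative check that the Gen3 x16 fallback matches the formula.
#include <cstdint>

static_assert(static_cast<int64_t>(0.25 * 1024 * 1024 * 1024) * 4 * 16 ==
                  16LL * 1024 * 1024 * 1024,
              "PCIe Gen3 x16 is 16 GiB/s under the per-lane formula");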
@@ -64,6 +64,7 @@ TEST(CudaExecutorTest, CreateDeviceDescription) {
   EXPECT_NE(result->driver_version(), kNullVersion);
   EXPECT_NE(result->compile_time_toolkit_version(), kNullVersion);
 
+  EXPECT_GT(result->pcie_bandwidth(), 1024 * 1024);
   EXPECT_THAT(result->platform_version(), Not(IsEmpty()));
   EXPECT_THAT(result->name(), Not(IsEmpty()));
   EXPECT_THAT(result->model_str(), Not(IsEmpty()));
@@ -154,34 +154,28 @@ class DeviceDescription {
   // Returns the limit on the total number of threads that can be launched in a
   // single block; i.e. the limit on x * y * z dimensions of a ThreadDim.
   // This limit affects what constitutes a legitimate kernel launch request.
-  const int64_t& threads_per_block_limit() const {
-    return threads_per_block_limit_;
-  }
+  int64_t threads_per_block_limit() const { return threads_per_block_limit_; }
 
   // Returns the limit on the total number of threads that can be simultaneously
   // launched on a given multiprocessor.
-  const int64_t& threads_per_core_limit() const {
-    return threads_per_core_limit_;
-  }
+  int64_t threads_per_core_limit() const { return threads_per_core_limit_; }
 
   // Returns the number of threads per warp/wavefront.
   constexpr int64_t threads_per_warp() const { return threads_per_warp_; }
 
   // Returns the limit on the total number of registers per core.
-  const int64_t& registers_per_core_limit() const {
-    return registers_per_core_limit_;
-  }
+  int64_t registers_per_core_limit() const { return registers_per_core_limit_; }
 
   // Returns the limit on the total number of registers that can be
   // simultaneously used by a block.
-  const int64_t& registers_per_block_limit() const {
+  int64_t registers_per_block_limit() const {
     return registers_per_block_limit_;
   }
 
   // Returns the number of address bits available to kernel code running on the
   // platform. This affects things like the maximum allocation size and perhaps
   // types used in kernel code such as size_t.
-  const int64_t& device_address_bits() const { return device_address_bits_; }
+  int64_t device_address_bits() const { return device_address_bits_; }
 
   // Returns the device memory size in bytes.
   int64_t device_memory_size() const { return device_memory_size_; }
@@ -194,6 +188,9 @@ class DeviceDescription {
   // host and device.)
   int64_t memory_bandwidth() const { return memory_bandwidth_; }
 
+  // Returns the PCIe memory bandwidth in bytes/sec.
+  int64_t pcie_bandwidth() const { return pcie_bandwidth_; }
+
   // Returns the device's core clock rate in GHz.
   float clock_rate_ghz() const { return clock_rate_ghz_; }
 
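The new accessor follows the same unit convention as memory_bandwidth(): bytes per second. A minimal consumer sketch, assuming the class lives in xla/stream_executor/device_description.h under namespace stream_executor; the helper itself is illustrative, not XLA code.

// Hypothetical helper: converts the stored bytes/sec value to whole GiB/s,
// mirroring the division used by the collective cost model above.
#include <cstdint>

#include "xla/stream_executor/device_description.h"  // assumed header path

int64_t PcieBandwidthGiBps(const stream_executor::DeviceDescription& desc) {
  return desc.pcie_bandwidth() / 1024 / 1024 / 1024;
}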
@@ -340,6 +337,7 @@ class DeviceDescription {
   void set_device_memory_size(int64_t value) { device_memory_size_ = value; }
   void set_l2_cache_size(int64_t value) { l2_cache_size_ = value; }
   void set_memory_bandwidth(int64_t value) { memory_bandwidth_ = value; }
+  void set_pcie_bandwidth(int64_t value) { pcie_bandwidth_ = value; }
 
   void set_shared_memory_per_core(int64_t value) {
     shared_memory_per_core_ = value;
@@ -400,7 +398,9 @@ class DeviceDescription {
   int64_t device_address_bits_ = kUninitialized<int64_t>;
   int64_t device_memory_size_ = kUninitialized<int64_t>;
   int64_t l2_cache_size_ = kUninitialized<int64_t>;
+
   int64_t memory_bandwidth_ = kUninitialized<int64_t>;
+  int64_t pcie_bandwidth_ = kUninitialized<int64_t>;
 
   // Shared memory limits on a given device.
   int64_t shared_memory_per_core_ = kUninitialized<int64_t>;
@@ -1121,6 +1121,9 @@ RocmExecutor::CreateDeviceDescription(int device_ordinal) {
     desc.set_l2_cache_size(prop.l2CacheSize);
   }
 
+  // PCIe bandwidth for PCI Gen4 x16 (approximate)
+  desc.set_pcie_bandwidth(32LL * 1024 * 1024 * 1024);
+
   {
     auto ecc_enabled_or = IsEccEnabled(device);
     if (!ecc_enabled_or.ok()) {