[XLA:GPU] Allow mapping a slice of memory with a multicast object.

PiperOrigin-RevId: 826013717
Authored by A. Unique TensorFlower on 2025-10-30 06:59:03 -07:00; committed by TensorFlower Gardener
parent 181ff64d18
commit ca45a1e4bb
6 changed files with 134 additions and 28 deletions
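
At a high level, MulticastMemory::MapMemory now takes a DeviceMemoryBase instead of a raw void*, so a caller can hand it a slice carved out of a larger VMM allocation and the executor binds the multicast object at that slice's offset within the backing allocation. A minimal caller-side sketch (not part of this commit), assuming a CudaExecutor* named executor, two subscribed devices, and a hypothetical kSliceBytes constant, with error handling via the usual TF_* macros:

  // Sketch only: follows the shapes used in the tests of this commit.
  TF_ASSIGN_OR_RETURN(auto multicast_memory,
                      executor->CreateMulticastMemory(kSliceBytes,
                                                      /*num_devices=*/2));
  TF_RETURN_IF_ERROR(multicast_memory->SubscribeDevice(0));
  TF_RETURN_IF_ERROR(multicast_memory->SubscribeDevice(1));

  // Slice offsets must be multiples of the VMM granularity.
  TF_ASSIGN_OR_RETURN(size_t granularity, executor->GetVmmGranularity());
  stream_executor::DeviceMemoryBase buffer = executor->Allocate(
      kSliceBytes + granularity,
      static_cast<int64_t>(stream_executor::MemoryType::kP2P));
  stream_executor::DeviceMemoryBase slice =
      buffer.GetByteSlice(granularity, kSliceBytes);

  // New in this commit: pass the slice itself rather than buffer.opaque().
  TF_ASSIGN_OR_RETURN(void* multicast_ptr,
                      multicast_memory->MapMemory(slice, executor));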


@@ -163,9 +163,9 @@ class AllReduceKernelTest : public ::testing::Test,
       stream_executor::gpu::GpuExecutor* gpu_executor =
           dynamic_cast<stream_executor::gpu::GpuExecutor*>(executors[i]);
       TF_RET_CHECK(gpu_executor != nullptr);
-      TF_ASSIGN_OR_RETURN(void* mapped_memory,
-                          multicast_memory->MapMemory(
-                              allocated_buffers[i].opaque(), gpu_executor));
+      TF_ASSIGN_OR_RETURN(
+          void* mapped_memory,
+          multicast_memory->MapMemory(allocated_buffers[i], gpu_executor));
       metadata.multicast_buffer_ptr = (uint64_t)mapped_memory;
     } else {
       metadata.multicast_buffer_ptr = 0;


@@ -806,6 +806,15 @@ CudaExecutor::RetainVmmMemoryHandle(void* ptr) {
   return CudaExecutor::VmmMemoryHandle(static_cast<uint64_t>(handle));
 }
 
+absl::StatusOr<size_t> CudaExecutor::GetVmmGranularity() const {
+  CUmemAllocationProp properties =
+      GetVmmAllocationProperties(device_, is_rdma_supported_);
+  size_t granularity = 0;
+  TF_RETURN_IF_ERROR(cuda::ToStatus(cuMemGetAllocationGranularity(
+      &granularity, &properties, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED)));
+  return granularity;
+}
+
 absl::StatusOr<void*> CudaExecutor::VmmAllocateMemory(uint64_t bytes) {
   if (!is_vmm_supported_) {
     return absl::InternalError("VMM is not supported on this device.");
@@ -1893,13 +1902,13 @@ absl::Status CudaExecutor::CudaMulticastMemory::SubscribeDevice(
 }
 
 absl::StatusOr<void*> CudaExecutor::CudaMulticastMemory::MapMemory(
-    void* device_ptr, GpuExecutor* gpu_executor) {
+    const DeviceMemoryBase& location, GpuExecutor* gpu_executor) {
   CudaExecutor* cuda_executor = dynamic_cast<CudaExecutor*>(gpu_executor);
   if (cuda_executor == nullptr) {
     return absl::InvalidArgumentError("GpuExecutor is not a CudaExecutor.");
   }
 
-  if (device_ptr == nullptr) {
+  if (location.is_null()) {
     return absl::InvalidArgumentError("Device pointer is null.");
   }
@@ -1914,20 +1923,26 @@ absl::StatusOr<void*> CudaExecutor::CudaMulticastMemory::MapMemory(
   TF_ASSIGN_OR_RETURN(
       stream_executor::gpu::CudaExecutor::VmmMemoryHandle memory_handle,
-      cuda_executor->RetainVmmMemoryHandle(device_ptr));
+      cuda_executor->RetainVmmMemoryHandle(location.opaque()));
   CUmemGenericAllocationHandle retained_memory_handle =
       static_cast<CUmemGenericAllocationHandle>(memory_handle.handle());
 
+  TF_ASSIGN_OR_RETURN(auto base_address,
+                      cuda_executor->GetMemoryRange(location));
+  uint64_t offset = reinterpret_cast<uint64_t>(location.opaque()) -
+                    reinterpret_cast<uint64_t>(base_address.opaque());
+
   // Bind the memory to the multicast object.
   TF_RETURN_IF_ERROR(stream_executor::cuda::ToStatus(
       cuMulticastBindMem(handle_, /*mcOffset=*/0, retained_memory_handle,
-                         /*memOffset=*/0, padded_size_, /*flags=*/0)));
+                         /*memOffset=*/offset, padded_size_, /*flags=*/0)));
 
   VLOG(3) << "[" << static_cast<int>(cuda_executor->device_)
           << "] Mapped multicast memory: " << static_cast<uint64_t>(handle_)
           << " size: " << padded_size_ << " with granularity: " << granularity_
-          << " to address: " << device_ptr;
+          << " to address: " << location.opaque()
+          << " offset from base range: " << offset;
 
   // Map a virtual address range for the multicast memory. Multicast
   // memory is used to reduce the data stored in the multicast object.
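
In short, MapMemory now resolves the base of the backing VMM range, turns the slice pointer into a byte offset, and passes that offset to cuMulticastBindMem as memOffset (previously hard-coded to 0). A worked example of the arithmetic, with made-up addresses and a hypothetical 2 MiB recommended granularity:

  // Illustration only; the addresses and granularity are hypothetical.
  uint64_t base_ptr = 0x7f4a00000000ull;   // start of the backing VMM range
  uint64_t slice_ptr = 0x7f4a00200000ull;  // location.opaque() of the mapped slice
  uint64_t offset = slice_ptr - base_ptr;  // 0x200000 bytes == 2 MiB
  // 0x200000 is a multiple of the 2 MiB granularity, so cuMulticastBindMem
  // accepts it as memOffset; an unaligned offset such as 0x1000 fails with
  // CUDA_ERROR_INVALID_VALUE, which is what the new unaligned test below
  // expects.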


@@ -17,6 +17,7 @@ limitations under the License.
 #define XLA_STREAM_EXECUTOR_CUDA_CUDA_EXECUTOR_H_
 
 #include <atomic>
 #include <cstddef>
+#include <cstdint>
 #include <map>
 #include <memory>
@@ -138,6 +139,11 @@ class CudaExecutor : public GpuExecutor {
   absl::StatusOr<std::unique_ptr<MemoryAllocator>> CreateMemoryAllocator(
       MemoryType type) override;
 
+  // Returns the granularity, i.e. the minimum unit of memory that can be
+  // allocated with the VMM API. To map memory slices to a multicast object,
+  // the offset of each slice must be aligned with this granularity.
+  absl::StatusOr<size_t> GetVmmGranularity() const;
+
   // RAII wrapper for a VMM memory handle.
   class VmmMemoryHandle {
    public:
@@ -167,7 +173,7 @@
     absl::Status SubscribeDevice(int device_number) override;
 
-    absl::StatusOr<void*> MapMemory(void* device_ptr,
+    absl::StatusOr<void*> MapMemory(const DeviceMemoryBase& location,
                                     GpuExecutor* gpu_executor) override;
 
    private:
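
The comment on GetVmmGranularity above states the key constraint for slice mapping: a slice can only be bound to a multicast object if its byte offset inside the backing VMM allocation is a multiple of the granularity. A minimal sketch of how a caller could enforce that, assuming a CudaExecutor* named executor; AlignOffsetToVmmGranularity is a hypothetical helper, not part of this commit:

  // Hypothetical helper: round a desired byte offset up to the VMM
  // granularity so that the resulting slice can be bound with
  // cuMulticastBindMem.
  absl::StatusOr<size_t> AlignOffsetToVmmGranularity(CudaExecutor* executor,
                                                     size_t desired_offset) {
    TF_ASSIGN_OR_RETURN(size_t granularity, executor->GetVmmGranularity());
    // Round up to the next multiple of the granularity.
    return (desired_offset + granularity - 1) / granularity * granularity;
  }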


@@ -42,18 +42,20 @@ using ::testing::NotNull;
 
 template <typename T>
 absl::StatusOr<stream_executor::DeviceMemoryBase> AllocateInitializedMemory(
-    CudaExecutor* executor, size_t size, T value) {
-  size_t num_elements = size / sizeof(T);
+    CudaExecutor* executor, size_t size, size_t offset, T value) {
   stream_executor::DeviceMemoryBase device_memory = executor->Allocate(
-      size, static_cast<int64_t>(stream_executor::MemoryType::kP2P));
+      size + offset, static_cast<int64_t>(stream_executor::MemoryType::kP2P));
   if (device_memory.opaque() == nullptr) {
     return absl::InternalError("Failed to allocate memory.");
   }
 
-  std::vector<T> device_memory_vector(num_elements, value);
+  size_t num_initialized_elements = size / sizeof(T);
+  std::vector<T> device_memory_vector(num_initialized_elements, value);
+  auto stride_memory = device_memory.GetByteSlice(offset, size);
   TF_RETURN_IF_ERROR(executor->SynchronousMemcpy(
-      &device_memory, device_memory_vector.data(), size));
-  return device_memory;
+      &stride_memory, device_memory_vector.data(), size));
+  return stride_memory;
 }
 
 template <typename T>
@@ -105,8 +107,8 @@ TEST(CudaExecutorMultiGpuTest, AllDevicesMustBeSubscribedBeforeMapping) {
   TF_ASSERT_OK_AND_ASSIGN(multicast_memory,
                           executors[0]->CreateMulticastMemory(1024, 2));
   EXPECT_THAT(multicast_memory->SubscribeDevice(0), IsOk());
-  EXPECT_THAT(
-      multicast_memory->MapMemory(reinterpret_cast<void*>(1), executors[0]),
+  DeviceMemoryBase device_memory(reinterpret_cast<void*>(1), 1);
+  EXPECT_THAT(multicast_memory->MapMemory(device_memory, executors[0]),
               StatusIs(absl::StatusCode::kFailedPrecondition,
                        "All devices should be subscribed."));
   ;
@@ -146,7 +148,7 @@ TEST(CudaExecutorMultiGpuTest, CudaMulticastMemoryUsingNonVmmMemory) {
   DeviceMemoryBase device_memory = executors[0]->Allocate(8, 0);
   EXPECT_THAT(
-      multicast_memory->MapMemory(device_memory.opaque(), executors[0]),
+      multicast_memory->MapMemory(device_memory, executors[0]),
       StatusIs(absl::StatusCode::kInternal,
                "CUDA error: : CUDA_ERROR_INVALID_VALUE: invalid argument"));
 }
@@ -170,19 +172,18 @@ TEST(CudaExecutorMultiGpuTest, CudaMulticastMemoryUsingVmmMemory) {
   TF_ASSERT_OK_AND_ASSIGN(
       stream_executor::DeviceMemoryBase first_device_memory,
-      AllocateInitializedMemory(executors[0], kMemorySize, kValue));
+      AllocateInitializedMemory(executors[0], kMemorySize, 0, kValue));
   TF_ASSERT_OK_AND_ASSIGN(
       stream_executor::DeviceMemoryBase output_device_memory,
-      AllocateInitializedMemory(executors[0], kMemorySize, 0));
+      AllocateInitializedMemory(executors[0], kMemorySize, 0, 0));
 
   TF_ASSERT_OK_AND_ASSIGN(
       void* first_device_multicast_ptr,
-      multicast_memory->MapMemory(first_device_memory.opaque(), executors[0]));
+      multicast_memory->MapMemory(first_device_memory, executors[0]));
 
   TF_ASSERT_OK_AND_ASSIGN(
       stream_executor::DeviceMemoryBase second_device_memory,
-      AllocateInitializedMemory(executors[1], kMemorySize, kValue));
-  EXPECT_THAT(
-      multicast_memory->MapMemory(second_device_memory.opaque(), executors[1]),
+      AllocateInitializedMemory(executors[1], kMemorySize, 0, kValue));
+  EXPECT_THAT(multicast_memory->MapMemory(second_device_memory, executors[1]),
               IsOkAndHolds(NotNull()));
 
   EXPECT_THAT(
@@ -195,5 +196,87 @@ TEST(CudaExecutorMultiGpuTest, CudaMulticastMemoryUsingVmmMemory) {
       IsOk());
 }
 
+TEST(CudaExecutorMultiGpuTest, CudaMulticastMemoryMapDifferentSlicesUnaligned) {
+  std::vector<CudaExecutor*> executors = {
+      static_cast<CudaExecutor*>(GetGpuExecutor(0)),
+      static_cast<CudaExecutor*>(GetGpuExecutor(1))};
+  if (!executors[0]->is_multicast_supported()) {
+    GTEST_SKIP() << "Test requires multicast support.";
+  }
+  const int64_t kNumDevices = 2;
+  const int64_t kNumElements = 8;
+  const int64_t kMappedMemorySize = kNumElements * sizeof(int);
+  const int kValue = 2;
+  std::unique_ptr<CudaExecutor::MulticastMemory> multicast_memory;
+  TF_ASSERT_OK_AND_ASSIGN(
+      multicast_memory,
+      executors[0]->CreateMulticastMemory(kMappedMemorySize, kNumDevices));
+  EXPECT_THAT(multicast_memory->SubscribeDevice(0), IsOk());
+  EXPECT_THAT(multicast_memory->SubscribeDevice(1), IsOk());
+
+  TF_ASSERT_OK_AND_ASSIGN(size_t vmm_granularity,
+                          executors[0]->GetVmmGranularity());
+
+  // Allocate memory with an unaligned offset.
+  TF_ASSERT_OK_AND_ASSIGN(
+      stream_executor::DeviceMemoryBase first_device_mapped_memory,
+      AllocateInitializedMemory(
+          executors[0],
+          // Add the granularity to make sure there is enough memory after
+          // adding the offset to map with the multicast object.
+          kMappedMemorySize + vmm_granularity, kMappedMemorySize, kValue));
+  EXPECT_THAT(
+      multicast_memory->MapMemory(first_device_mapped_memory, executors[0]),
+      StatusIs(absl::StatusCode::kInternal,
+               "CUDA error: : CUDA_ERROR_INVALID_VALUE: invalid argument"));
+}
+
+// Slice mapping works only when the offset is aligned with the VMM
+// granularity.
+TEST(CudaExecutorMultiGpuTest, CudaMulticastMemoryMapDifferentSlices) {
+  std::vector<CudaExecutor*> executors = {
+      static_cast<CudaExecutor*>(GetGpuExecutor(0)),
+      static_cast<CudaExecutor*>(GetGpuExecutor(1))};
+  if (!executors[0]->is_multicast_supported()) {
+    GTEST_SKIP() << "Test requires multicast support.";
+  }
+  const int64_t kNumDevices = 2;
+  const int64_t kNumElements = 8;
+  const int64_t kMappedMemorySize = kNumElements * sizeof(int);
+  const int kValue = 2;
+  std::unique_ptr<CudaExecutor::MulticastMemory> multicast_memory;
+  TF_ASSERT_OK_AND_ASSIGN(
+      multicast_memory,
+      executors[0]->CreateMulticastMemory(kMappedMemorySize, kNumDevices));
+  EXPECT_THAT(multicast_memory->SubscribeDevice(0), IsOk());
+  EXPECT_THAT(multicast_memory->SubscribeDevice(1), IsOk());
+
+  TF_ASSERT_OK_AND_ASSIGN(size_t vmm_granularity,
+                          executors[0]->GetVmmGranularity());
+  TF_ASSERT_OK_AND_ASSIGN(
+      stream_executor::DeviceMemoryBase first_device_mapped_memory,
+      AllocateInitializedMemory(executors[0], kMappedMemorySize,
+                                vmm_granularity, kValue));
+  TF_ASSERT_OK_AND_ASSIGN(
+      stream_executor::DeviceMemoryBase output_device_memory,
+      AllocateInitializedMemory(executors[0], kMappedMemorySize, 0, 0));
+  TF_ASSERT_OK_AND_ASSIGN(
+      void* first_device_multicast_ptr,
+      multicast_memory->MapMemory(first_device_mapped_memory, executors[0]));
+  TF_ASSERT_OK_AND_ASSIGN(
+      stream_executor::DeviceMemoryBase second_device_mapped_memory,
+      AllocateInitializedMemory(executors[1], kMappedMemorySize, 0, kValue));
+  EXPECT_THAT(
+      multicast_memory->MapMemory(second_device_mapped_memory, executors[1]),
+      IsOkAndHolds(NotNull()));
+
+  EXPECT_THAT(
+      MulticastReduce((int*)first_device_multicast_ptr,
+                      (int*)output_device_memory.opaque(), kNumElements),
+      IsOk());
+
+  const int kExpectedValue = kValue * kNumDevices;
+  EXPECT_THAT(CheckMemory(executors[0], output_device_memory, kExpectedValue),
+              IsOk());
+}
+
 }  // namespace
 }  // namespace stream_executor::gpu


@@ -168,6 +168,7 @@ cc_library(
     name = "gpu_executor_header",
     hdrs = ["gpu_executor.h"],
     deps = [
+        "//xla/stream_executor:device_memory",
         "//xla/stream_executor:platform",
         "//xla/stream_executor:stream_executor_common",
         "//xla/stream_executor:stream_executor_h",


@@ -26,6 +26,7 @@ limitations under the License.
 #include "absl/status/status.h"
 #include "absl/status/statusor.h"
 #include "absl/synchronization/mutex.h"
+#include "xla/stream_executor/device_memory.h"
 #include "xla/stream_executor/platform.h"
 #include "xla/stream_executor/stream_executor.h"
 #include "xla/stream_executor/stream_executor_common.h"
@@ -78,7 +79,7 @@ class GpuExecutor : public StreamExecutorCommon {
     return absl::UnimplementedError("SubscribeDevice is not implemented.");
   }
 
-  virtual absl::StatusOr<void*> MapMemory(void* device_ptr,
+  virtual absl::StatusOr<void*> MapMemory(const DeviceMemoryBase& location,
                                           GpuExecutor* gpu_executor) {
     return absl::UnimplementedError("MapMemory is not implemented.");
   }