[XLA:GPU] Allow mapping a slice of memory with a multicast object.
PiperOrigin-RevId: 826013717
This commit is contained in:
parent 181ff64d18
commit ca45a1e4bb
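This change switches MulticastMemory::MapMemory from taking a raw void* to taking a DeviceMemoryBase, and binds the slice at its offset within the owning VMM allocation (the memOffset of cuMulticastBindMem) instead of always binding at offset 0. Below is a minimal sketch of the resulting call pattern, adapted from the updated tests; the function name MapSliceExample and the surrounding error handling are illustrative only, headers are abbreviated, and the multicast object is assumed to be created and subscribed on all devices already:

#include <cstddef>
#include <cstdint>

#include "xla/stream_executor/cuda/cuda_executor.h"
#include "xla/stream_executor/device_memory.h"

namespace stream_executor::gpu {

// Sketch: map a slice that starts one VMM page into a larger P2P buffer.
// Assumes CreateMulticastMemory() and SubscribeDevice() have already been
// called for every participating device.
absl::Status MapSliceExample(CudaExecutor* executor,
                             CudaExecutor::MulticastMemory* multicast_memory,
                             size_t slice_size) {
  // Slice offsets must be multiples of the VMM granularity; see
  // GetVmmGranularity() added in this change.
  TF_ASSIGN_OR_RETURN(size_t granularity, executor->GetVmmGranularity());

  // VMM-backed (kP2P) allocation large enough to hold the offset slice.
  DeviceMemoryBase buffer = executor->Allocate(
      slice_size + granularity, static_cast<int64_t>(MemoryType::kP2P));

  // Take the aligned slice and map it; MapMemory now derives the memOffset
  // from the slice's distance to the allocation's base address.
  DeviceMemoryBase slice = buffer.GetByteSlice(granularity, slice_size);
  TF_ASSIGN_OR_RETURN(void* multicast_ptr,
                      multicast_memory->MapMemory(slice, executor));
  (void)multicast_ptr;  // Use the multicast pointer in a kernel, etc.
  return absl::OkStatus();
}

}  // namespace stream_executor::gpu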
@@ -163,9 +163,9 @@ class AllReduceKernelTest : public ::testing::Test,
       stream_executor::gpu::GpuExecutor* gpu_executor =
           dynamic_cast<stream_executor::gpu::GpuExecutor*>(executors[i]);
       TF_RET_CHECK(gpu_executor != nullptr);
-      TF_ASSIGN_OR_RETURN(void* mapped_memory,
-                          multicast_memory->MapMemory(
-                              allocated_buffers[i].opaque(), gpu_executor));
+      TF_ASSIGN_OR_RETURN(
+          void* mapped_memory,
+          multicast_memory->MapMemory(allocated_buffers[i], gpu_executor));
       metadata.multicast_buffer_ptr = (uint64_t)mapped_memory;
     } else {
       metadata.multicast_buffer_ptr = 0;
@@ -806,6 +806,15 @@ CudaExecutor::RetainVmmMemoryHandle(void* ptr) {
   return CudaExecutor::VmmMemoryHandle(static_cast<uint64_t>(handle));
 }
 
+absl::StatusOr<size_t> CudaExecutor::GetVmmGranularity() const {
+  CUmemAllocationProp properties =
+      GetVmmAllocationProperties(device_, is_rdma_supported_);
+  size_t granularity = 0;
+  TF_RETURN_IF_ERROR(cuda::ToStatus(cuMemGetAllocationGranularity(
+      &granularity, &properties, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED)));
+  return granularity;
+}
+
 absl::StatusOr<void*> CudaExecutor::VmmAllocateMemory(uint64_t bytes) {
   if (!is_vmm_supported_) {
     return absl::InternalError("VMM is not supported on this device.");
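The new GetVmmGranularity simply forwards the recommended value from cuMemGetAllocationGranularity; callers use it to choose slice offsets that cuMulticastBindMem will accept. A hedged sketch of how a caller might validate an offset (helper name is illustrative, not part of this change):

#include <cstddef>

#include "absl/status/status.h"
#include "absl/strings/str_cat.h"

// Returns an error if `offset` cannot be used as a multicast slice offset.
// Unaligned offsets make cuMulticastBindMem fail with
// CUDA_ERROR_INVALID_VALUE, as the new unaligned-slice test checks.
absl::Status ValidateSliceOffset(size_t offset, size_t granularity) {
  if (granularity == 0 || offset % granularity != 0) {
    return absl::InvalidArgumentError(
        absl::StrCat("Slice offset ", offset,
                     " is not aligned to VMM granularity ", granularity));
  }
  return absl::OkStatus();
}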
@@ -1893,13 +1902,13 @@ absl::Status CudaExecutor::CudaMulticastMemory::SubscribeDevice(
 }
 
 absl::StatusOr<void*> CudaExecutor::CudaMulticastMemory::MapMemory(
-    void* device_ptr, GpuExecutor* gpu_executor) {
+    const DeviceMemoryBase& location, GpuExecutor* gpu_executor) {
   CudaExecutor* cuda_executor = dynamic_cast<CudaExecutor*>(gpu_executor);
   if (cuda_executor == nullptr) {
     return absl::InvalidArgumentError("GpuExecutor is not a CudaExecutor.");
   }
 
-  if (device_ptr == nullptr) {
+  if (location.is_null()) {
     return absl::InvalidArgumentError("Device pointer is null.");
   }
 
@@ -1914,20 +1923,26 @@ absl::StatusOr<void*> CudaExecutor::CudaMulticastMemory::MapMemory(
 
   TF_ASSIGN_OR_RETURN(
       stream_executor::gpu::CudaExecutor::VmmMemoryHandle memory_handle,
-      cuda_executor->RetainVmmMemoryHandle(device_ptr));
+      cuda_executor->RetainVmmMemoryHandle(location.opaque()));
 
   CUmemGenericAllocationHandle retained_memory_handle =
       static_cast<CUmemGenericAllocationHandle>(memory_handle.handle());
 
+  TF_ASSIGN_OR_RETURN(auto base_address,
+                      cuda_executor->GetMemoryRange(location));
+  uint64_t offset = reinterpret_cast<uint64_t>(location.opaque()) -
+                    reinterpret_cast<uint64_t>(base_address.opaque());
+
   // Bind the memory to the multicast object.
   TF_RETURN_IF_ERROR(stream_executor::cuda::ToStatus(
       cuMulticastBindMem(handle_, /*mcOffset=*/0, retained_memory_handle,
-                         /*memOffset=*/0, padded_size_, /*flags=*/0)));
+                         /*memOffset=*/offset, padded_size_, /*flags=*/0)));
 
   VLOG(3) << "[" << static_cast<int>(cuda_executor->device_)
           << "] Mapped multicast memory: " << static_cast<uint64_t>(handle_)
           << " size: " << padded_size_ << " with granularity: " << granularity_
-          << " to address: " << device_ptr;
+          << " to address: " << location.opaque()
+          << " offset from base range: " << offset;
 
   // Map a virtual address range for the multicast memory. Multicast
   // memory is used to reduce the data stored in the multicast object.
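Because the handle retained for the slice refers to the whole physical allocation, MapMemory now recovers the allocation's base range (GetMemoryRange) and passes the slice's distance from that base as memOffset. The arithmetic in isolation, as a sketch with illustrative names:

#include <cstdint>

// memOffset for cuMulticastBindMem: byte distance of the slice from the
// start of the VMM allocation that backs it. Must be a multiple of the
// VMM granularity.
inline uint64_t SliceOffsetFromBase(const void* slice_ptr,
                                    const void* base_ptr) {
  return reinterpret_cast<uint64_t>(slice_ptr) -
         reinterpret_cast<uint64_t>(base_ptr);
}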
@@ -17,6 +17,7 @@ limitations under the License.
 #define XLA_STREAM_EXECUTOR_CUDA_CUDA_EXECUTOR_H_
 
 #include <atomic>
+#include <cstddef>
 #include <cstdint>
 #include <map>
 #include <memory>
@@ -138,6 +139,11 @@ class CudaExecutor : public GpuExecutor {
   absl::StatusOr<std::unique_ptr<MemoryAllocator>> CreateMemoryAllocator(
       MemoryType type) override;
 
+  // Returns the granularity, which is the minimum unit of memory that can be
+  // allocated with the VMM API. In order to map memory slices to a multicast
+  // object, the offset of each slice must be aligned to this granularity.
+  absl::StatusOr<size_t> GetVmmGranularity() const;
+
   // RAII wrapper for a VMM memory handle.
   class VmmMemoryHandle {
    public:
@@ -167,7 +173,7 @@ class CudaExecutor : public GpuExecutor {
 
     absl::Status SubscribeDevice(int device_number) override;
 
-    absl::StatusOr<void*> MapMemory(void* device_ptr,
+    absl::StatusOr<void*> MapMemory(const DeviceMemoryBase& location,
                                     GpuExecutor* gpu_executor) override;
 
    private:
@@ -42,18 +42,20 @@ using ::testing::NotNull;
 
 template <typename T>
 absl::StatusOr<stream_executor::DeviceMemoryBase> AllocateInitializedMemory(
-    CudaExecutor* executor, size_t size, T value) {
-  size_t num_elements = size / sizeof(T);
+    CudaExecutor* executor, size_t size, size_t offset, T value) {
   stream_executor::DeviceMemoryBase device_memory = executor->Allocate(
-      size, static_cast<int64_t>(stream_executor::MemoryType::kP2P));
+      size + offset, static_cast<int64_t>(stream_executor::MemoryType::kP2P));
   if (device_memory.opaque() == nullptr) {
     return absl::InternalError("Failed to allocate memory.");
  }
-  std::vector<T> device_memory_vector(num_elements, value);
+
+  size_t num_initialized_elements = size / sizeof(T);
+  std::vector<T> device_memory_vector(num_initialized_elements, value);
+
+  auto stride_memory = device_memory.GetByteSlice(offset, size);
   TF_RETURN_IF_ERROR(executor->SynchronousMemcpy(
-      &device_memory, device_memory_vector.data(), size));
-  return device_memory;
+      &stride_memory, device_memory_vector.data(), size));
+  return stride_memory;
 }
 
 template <typename T>
@@ -105,10 +107,10 @@ TEST(CudaExecutorMultiGpuTest, AllDevicesMustBeSubscribedBeforeMapping) {
   TF_ASSERT_OK_AND_ASSIGN(multicast_memory,
                           executors[0]->CreateMulticastMemory(1024, 2));
   EXPECT_THAT(multicast_memory->SubscribeDevice(0), IsOk());
-  EXPECT_THAT(
-      multicast_memory->MapMemory(reinterpret_cast<void*>(1), executors[0]),
-      StatusIs(absl::StatusCode::kFailedPrecondition,
-               "All devices should be subscribed."));
+  DeviceMemoryBase device_memory(reinterpret_cast<void*>(1), 1);
+  EXPECT_THAT(multicast_memory->MapMemory(device_memory, executors[0]),
+              StatusIs(absl::StatusCode::kFailedPrecondition,
+                       "All devices should be subscribed."));
   ;
 }
 
@@ -146,7 +148,7 @@ TEST(CudaExecutorMultiGpuTest, CudaMulticastMemoryUsingNonVmmMemory) {
 
   DeviceMemoryBase device_memory = executors[0]->Allocate(8, 0);
   EXPECT_THAT(
-      multicast_memory->MapMemory(device_memory.opaque(), executors[0]),
+      multicast_memory->MapMemory(device_memory, executors[0]),
       StatusIs(absl::StatusCode::kInternal,
                "CUDA error: : CUDA_ERROR_INVALID_VALUE: invalid argument"));
 }
@@ -170,20 +172,19 @@ TEST(CudaExecutorMultiGpuTest, CudaMulticastMemoryUsingVmmMemory) {
 
   TF_ASSERT_OK_AND_ASSIGN(
       stream_executor::DeviceMemoryBase first_device_memory,
-      AllocateInitializedMemory(executors[0], kMemorySize, kValue));
+      AllocateInitializedMemory(executors[0], kMemorySize, 0, kValue));
 
   TF_ASSERT_OK_AND_ASSIGN(
       stream_executor::DeviceMemoryBase output_device_memory,
-      AllocateInitializedMemory(executors[0], kMemorySize, 0));
+      AllocateInitializedMemory(executors[0], kMemorySize, 0, 0));
   TF_ASSERT_OK_AND_ASSIGN(
       void* first_device_multicast_ptr,
-      multicast_memory->MapMemory(first_device_memory.opaque(), executors[0]));
+      multicast_memory->MapMemory(first_device_memory, executors[0]));
   TF_ASSERT_OK_AND_ASSIGN(
       stream_executor::DeviceMemoryBase second_device_memory,
-      AllocateInitializedMemory(executors[1], kMemorySize, kValue));
-  EXPECT_THAT(
-      multicast_memory->MapMemory(second_device_memory.opaque(), executors[1]),
-      IsOkAndHolds(NotNull()));
+      AllocateInitializedMemory(executors[1], kMemorySize, 0, kValue));
+  EXPECT_THAT(multicast_memory->MapMemory(second_device_memory, executors[1]),
+              IsOkAndHolds(NotNull()));
 
   EXPECT_THAT(
       MulticastReduce((int*)first_device_multicast_ptr,
@@ -195,5 +196,87 @@ TEST(CudaExecutorMultiGpuTest, CudaMulticastMemoryUsingVmmMemory) {
       IsOk());
 }
 
+TEST(CudaExecutorMultiGpuTest, CudaMulticastMemoryMapDifferentSlicesUnaligned) {
+  std::vector<CudaExecutor*> executors = {
+      static_cast<CudaExecutor*>(GetGpuExecutor(0)),
+      static_cast<CudaExecutor*>(GetGpuExecutor(1))};
+  if (!executors[0]->is_multicast_supported()) {
+    GTEST_SKIP() << "Test requires multicast support.";
+  }
+  const int64_t kNumDevices = 2;
+  const int64_t kNumElements = 8;
+  const int64_t kMappedMemorySize = kNumElements * sizeof(int);
+  const int kValue = 2;
+  std::unique_ptr<CudaExecutor::MulticastMemory> multicast_memory;
+  TF_ASSERT_OK_AND_ASSIGN(
+      multicast_memory,
+      executors[0]->CreateMulticastMemory(kMappedMemorySize, kNumDevices));
+  EXPECT_THAT(multicast_memory->SubscribeDevice(0), IsOk());
+  EXPECT_THAT(multicast_memory->SubscribeDevice(1), IsOk());
+
+  TF_ASSERT_OK_AND_ASSIGN(size_t vmm_granularity,
+                          executors[0]->GetVmmGranularity());
+  // Allocate memory with an unaligned offset.
+  TF_ASSERT_OK_AND_ASSIGN(
+      stream_executor::DeviceMemoryBase first_device_mapped_memory,
+      AllocateInitializedMemory(
+          executors[0],
+          // Add the granularity to make sure that there is enough memory
+          // after adding the offset to map with the multicast object.
+          kMappedMemorySize + vmm_granularity, kMappedMemorySize, kValue));
+  EXPECT_THAT(
+      multicast_memory->MapMemory(first_device_mapped_memory, executors[0]),
+      StatusIs(absl::StatusCode::kInternal,
+               "CUDA error: : CUDA_ERROR_INVALID_VALUE: invalid argument"));
+}
+
+// Slice mapping works only when the offset is aligned with the VMM
+// granularity.
+TEST(CudaExecutorMultiGpuTest, CudaMulticastMemoryMapDifferentSlices) {
+  std::vector<CudaExecutor*> executors = {
+      static_cast<CudaExecutor*>(GetGpuExecutor(0)),
+      static_cast<CudaExecutor*>(GetGpuExecutor(1))};
+  if (!executors[0]->is_multicast_supported()) {
+    GTEST_SKIP() << "Test requires multicast support.";
+  }
+  const int64_t kNumDevices = 2;
+  const int64_t kNumElements = 8;
+  const int64_t kMappedMemorySize = kNumElements * sizeof(int);
+  const int kValue = 2;
+  std::unique_ptr<CudaExecutor::MulticastMemory> multicast_memory;
+  TF_ASSERT_OK_AND_ASSIGN(
+      multicast_memory,
+      executors[0]->CreateMulticastMemory(kMappedMemorySize, kNumDevices));
+  EXPECT_THAT(multicast_memory->SubscribeDevice(0), IsOk());
+  EXPECT_THAT(multicast_memory->SubscribeDevice(1), IsOk());
+
+  TF_ASSERT_OK_AND_ASSIGN(size_t vmm_granularity,
+                          executors[0]->GetVmmGranularity());
+  TF_ASSERT_OK_AND_ASSIGN(
+      stream_executor::DeviceMemoryBase first_device_mapped_memory,
+      AllocateInitializedMemory(executors[0], kMappedMemorySize,
+                                vmm_granularity, kValue));
+  TF_ASSERT_OK_AND_ASSIGN(
+      stream_executor::DeviceMemoryBase output_device_memory,
+      AllocateInitializedMemory(executors[0], kMappedMemorySize, 0, 0));
+  TF_ASSERT_OK_AND_ASSIGN(
+      void* first_device_multicast_ptr,
+      multicast_memory->MapMemory(first_device_mapped_memory, executors[0]));
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      stream_executor::DeviceMemoryBase second_device_mapped_memory,
+      AllocateInitializedMemory(executors[1], kMappedMemorySize, 0, kValue));
+  EXPECT_THAT(
+      multicast_memory->MapMemory(second_device_mapped_memory, executors[1]),
+      IsOkAndHolds(NotNull()));
+
+  EXPECT_THAT(
+      MulticastReduce((int*)first_device_multicast_ptr,
+                      (int*)output_device_memory.opaque(), kNumElements),
+      IsOk());
+
+  const int kExpectedValue = kValue * kNumDevices;
+  EXPECT_THAT(CheckMemory(executors[0], output_device_memory, kExpectedValue),
+              IsOk());
+}
 }  // namespace
 }  // namespace stream_executor::gpu
@@ -168,6 +168,7 @@ cc_library(
     name = "gpu_executor_header",
    hdrs = ["gpu_executor.h"],
    deps = [
+        "//xla/stream_executor:device_memory",
        "//xla/stream_executor:platform",
        "//xla/stream_executor:stream_executor_common",
        "//xla/stream_executor:stream_executor_h",
@@ -26,6 +26,7 @@ limitations under the License.
 #include "absl/status/status.h"
 #include "absl/status/statusor.h"
 #include "absl/synchronization/mutex.h"
+#include "xla/stream_executor/device_memory.h"
 #include "xla/stream_executor/platform.h"
 #include "xla/stream_executor/stream_executor.h"
 #include "xla/stream_executor/stream_executor_common.h"
@@ -78,7 +79,7 @@ class GpuExecutor : public StreamExecutorCommon {
       return absl::UnimplementedError("SubscribeDevice is not implemented.");
     }
 
-    virtual absl::StatusOr<void*> MapMemory(void* device_ptr,
+    virtual absl::StatusOr<void*> MapMemory(const DeviceMemoryBase& location,
                                             GpuExecutor* gpu_executor) {
       return absl::UnimplementedError("MapMemory is not implemented.");
    }
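Other GpuExecutor backends that override this virtual have to adopt the new DeviceMemoryBase parameter as well. A hypothetical override is sketched below purely to illustrate the updated signature; the class name, the body, and the assumption that the virtual lives on a GpuExecutor::MulticastMemory nested class (as the CudaExecutor override suggests) are not part of this change:

#include "absl/status/status.h"
#include "absl/status/statusor.h"
#include "xla/stream_executor/device_memory.h"
#include "xla/stream_executor/gpu/gpu_executor.h"

namespace stream_executor::gpu {

// Hypothetical backend-specific multicast memory; illustrates only the new
// MapMemory(const DeviceMemoryBase&, GpuExecutor*) signature.
class ExampleMulticastMemory : public GpuExecutor::MulticastMemory {
 public:
  absl::StatusOr<void*> MapMemory(const DeviceMemoryBase& location,
                                  GpuExecutor* gpu_executor) override {
    if (location.is_null()) {
      return absl::InvalidArgumentError("Device pointer is null.");
    }
    // A real backend would bind `location` to its multicast object here.
    return absl::UnimplementedError("Example only.");
  }
};

}  // namespace stream_executor::gpu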