Optimize Vulkan command buffer submission rate. (#49112)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49112

Differential Revision: D25729889

Test Plan: Imported from OSS

Reviewed By: SS-JIA

Pulled By: AshkanAliabadi

fbshipit-source-id: c4ab470fdcf3f83745971986f3a44a3dff69287f
Ashkan Aliabadi 2021-01-08 16:36:28 -08:00 committed by Facebook GitHub Bot
parent aa18d17455
commit 1c12cbea90
30 changed files with 1060 additions and 961 deletions
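
The hunks below move the Vulkan ops from one queue submission per operator (allocate, begin, record, end, submit) to recording into a pool-owned command stream that is flushed only when a fence is requested or a per-stream counter crosses Configuration::kSubmit. The following standalone sketch models just that flush policy; StreamState, should_flush, and kSubmit are illustrative stand-ins, not the actual at::native::vulkan::api types.

// Simplified model of the submission-rate policy introduced by this commit.
// StreamState and should_flush() are illustrative, not the PyTorch API.
#include <cstdint>
#include <iostream>

struct StreamState {
  bool has_open_buffer;  // a command buffer is currently being recorded into
  uint32_t counter;      // submissions requested since the last flush
};

constexpr uint32_t kSubmit = 10u;  // mirrors Command::Pool::Configuration::kSubmit

// Decide whether the accumulated work should be handed to the driver now.
bool should_flush(StreamState& stream, const bool fence_requested) {
  if (!stream.has_open_buffer) {
    return false;  // nothing recorded yet
  }
  // Flush if the caller needs the results (fence), or once past the cutoff
  // so the GPU is not starved of work.
  if (fence_requested || (stream.counter++ > kSubmit)) {
    stream.has_open_buffer = false;
    stream.counter = 0u;
    return true;
  }
  return false;  // keep accumulating commands into the same buffer
}

int main() {
  StreamState stream{true, 0u};
  for (int op = 0; op < 15; ++op) {
    if (should_flush(stream, /*fence_requested=*/false)) {
      std::cout << "flush after op " << op << "\n";
      stream.has_open_buffer = true;  // a fresh stream buffer is begun lazily
    }
  }
  // A fence (e.g. the host reads results back) forces an immediate flush.
  std::cout << "fence flush: " << should_flush(stream, true) << "\n";
  return 0;
}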

View File

@ -207,7 +207,7 @@ cmake_dependent_option(
USE_VALGRIND "Use Valgrind. Only available on Linux." ON
"LINUX" OFF)
option(USE_VULKAN "Use Vulkan GPU backend" OFF)
option(USE_VULKAN_FP16_INFERENCE "Vulkan - Use fp16 inference even on fp32 tensors" ON)
option(USE_VULKAN_FP16_INFERENCE "Vulkan - Use fp16 inference even on fp32 tensors" OFF)
option(USE_VULKAN_RELAXED_PRECISION "Vulkan - Use relaxed precision math in the kernels (mediump)" OFF)
option(USE_VULKAN_SHADERC_RUNTIME "Vulkan - Use runtime shader compilation (needs libshaderc)" OFF)
option(USE_VULKAN_WRAPPER "Vulkan - Dynamically load Vulkan functions" ON)

View File

@ -62,6 +62,10 @@ class Cache final {
Factory factory_;
};
//
// Impl
//
template<typename Factory>
inline Cache<Factory>::Cache(Factory factory)
: factory_(std::move(factory)) {

View File

@ -76,6 +76,25 @@ Command::Buffer::Buffer(const VkCommandBuffer command_buffer)
"Invalid Vulkan command buffer!");
}
Command::Buffer::Buffer(Buffer&& buffer)
: command_buffer_(std::move(buffer.command_buffer_)),
bound_(std::move(buffer.bound_)),
barriers_(std::move(buffer.barriers_)) {
buffer.invalidate();
}
Command::Buffer& Command::Buffer::operator=(Buffer&& buffer) {
if (&buffer != this) {
command_buffer_ = std::move(buffer.command_buffer_);
bound_ = std::move(buffer.bound_);
barriers_ = std::move(buffer.barriers_);
buffer.invalidate();
};
return *this;
}
void Command::Buffer::Buffer::begin() {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
command_buffer_,
@ -107,69 +126,6 @@ void Command::Buffer::Buffer::end() {
VK_CHECK(vkEndCommandBuffer(command_buffer_));
}
void Command::Buffer::barrier() {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
command_buffer_,
"This command buffer is in an invalid state! "
"Potential reason: This command buffer is moved from.");
if (barriers_.stage) {
c10::SmallVector<VkBufferMemoryBarrier, 4u> buffer_memory_barriers;
for (const Resource::Buffer::Barrier& barrier : barriers_.buffers) {
buffer_memory_barriers.push_back({
VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
nullptr,
barrier.memory.src,
barrier.memory.dst,
VK_QUEUE_FAMILY_IGNORED,
VK_QUEUE_FAMILY_IGNORED,
barrier.object.handle,
barrier.object.offset,
barrier.object.range,
});
}
c10::SmallVector<VkImageMemoryBarrier, 4u> image_memory_barriers;
for (const Resource::Image::Barrier& barrier : barriers_.images) {
image_memory_barriers.push_back({
VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
nullptr,
barrier.memory.src,
barrier.memory.dst,
barrier.layout.src,
barrier.layout.dst,
VK_QUEUE_FAMILY_IGNORED,
VK_QUEUE_FAMILY_IGNORED,
barrier.object.handle,
{
VK_IMAGE_ASPECT_COLOR_BIT,
0u,
VK_REMAINING_MIP_LEVELS,
0u,
VK_REMAINING_ARRAY_LAYERS,
},
});
}
vkCmdPipelineBarrier(
command_buffer_,
barriers_.stage.src,
barriers_.stage.dst,
0u,
0u,
nullptr,
buffer_memory_barriers.size(),
buffer_memory_barriers.data(),
image_memory_barriers.size(),
image_memory_barriers.data());
}
// Reset
barriers_.reset();
}
void Command::Buffer::barrier(const Pipeline::Barrier& barrier) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
command_buffer_,
@ -291,31 +247,86 @@ void Command::Buffer::dispatch(
bound_.pipeline.local_work_group.data[2u]));
}
void Command::Buffer::submit(
const VkQueue queue,
const Resource::Fence fence) {
void Command::Buffer::barrier() {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
command_buffer_,
"This command buffer is in an invalid state! "
"Potential reason: This command buffer is moved from.");
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
queue,
"Invalid Vulkan queue!");
if (barriers_.stage) {
c10::SmallVector<VkBufferMemoryBarrier, 4u> buffer_memory_barriers;
const VkSubmitInfo submit_info{
VK_STRUCTURE_TYPE_SUBMIT_INFO,
nullptr,
0u,
nullptr,
nullptr,
1u,
&command_buffer_,
0u,
nullptr,
};
for (const Resource::Buffer::Barrier& barrier : barriers_.buffers) {
buffer_memory_barriers.push_back({
VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
nullptr,
barrier.memory.src,
barrier.memory.dst,
VK_QUEUE_FAMILY_IGNORED,
VK_QUEUE_FAMILY_IGNORED,
barrier.object.handle,
barrier.object.offset,
barrier.object.range,
});
}
VK_CHECK(vkQueueSubmit(queue, 1u, &submit_info, fence.handle()));
c10::SmallVector<VkImageMemoryBarrier, 4u> image_memory_barriers;
for (const Resource::Image::Barrier& barrier : barriers_.images) {
image_memory_barriers.push_back({
VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
nullptr,
barrier.memory.src,
barrier.memory.dst,
barrier.layout.src,
barrier.layout.dst,
VK_QUEUE_FAMILY_IGNORED,
VK_QUEUE_FAMILY_IGNORED,
barrier.object.handle,
{
VK_IMAGE_ASPECT_COLOR_BIT,
0u,
VK_REMAINING_MIP_LEVELS,
0u,
VK_REMAINING_ARRAY_LAYERS,
},
});
}
vkCmdPipelineBarrier(
command_buffer_,
barriers_.stage.src,
barriers_.stage.dst,
0u,
0u,
nullptr,
buffer_memory_barriers.size(),
buffer_memory_barriers.data(),
image_memory_barriers.size(),
image_memory_barriers.data());
}
// Reset
barriers_.reset();
}
void Command::Buffer::invalidate() {
command_buffer_ = VK_NULL_HANDLE;
}
inline void Command::Buffer::Bound::reset() {
pipeline = {};
descriptor_set = VK_NULL_HANDLE;
}
inline Command::Buffer::Barrier::Stage::operator bool() const {
return (0u != src) || (0u != dst);
}
inline void Command::Buffer::Barrier::reset() {
stage = {};
buffers.clear();
images.clear();
}
Command::Pool::Pool(const GPU& gpu)
@ -338,8 +349,9 @@ Command::Pool::Pool(const GPU& gpu)
Command::Pool::Pool(Pool&& pool)
: device_(std::move(pool.device_)),
command_pool_(std::move(pool.command_pool_)),
buffer_(std::move(pool.buffer_)) {
pool.device_ = VK_NULL_HANDLE;
buffer_(std::move(pool.buffer_)),
stream_(std::move(pool.stream_)) {
pool.invalidate();
}
Command::Pool& Command::Pool::operator=(Pool&& pool) {
@ -347,8 +359,9 @@ Command::Pool& Command::Pool::operator=(Pool&& pool) {
device_ = std::move(pool.device_);
command_pool_ = std::move(pool.command_pool_);
buffer_ = std::move(pool.buffer_);
stream_ = std::move(pool.stream_);
pool.device_ = VK_NULL_HANDLE;
pool.invalidate();
};
return *this;
@ -383,25 +396,109 @@ Command::Buffer Command::Pool::allocate() {
Configuration::kQuantum);
allocate_command_buffers(
device_,
command_pool_.get(),
buffer_.pool.data() + buffer_.in_use,
Configuration::kQuantum);
device_,
command_pool_.get(),
buffer_.pool.data() + buffer_.in_use,
Configuration::kQuantum);
}
return Buffer(buffer_.pool[buffer_.in_use++]);
}
Command::Buffer& Command::Pool::stream() {
if (!stream_.buffer) {
stream_.buffer = allocate();
stream_.buffer.begin();
stream_.counter = 0u;
}
return stream_.buffer;
}
void Command::Pool::purge() {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
device_ && command_pool_,
"This command pool is in an invalid state! "
"Potential reason: This command pool is moved from.");
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
!stream_.buffer,
"Pending command buffer detected. Make sure all command buffers are "
"submitted to the queue for execution prior to reclaiming pool memory.");
buffer_.in_use = 0u;
VK_CHECK(vkResetCommandPool(device_, command_pool_.get(), 0u));
}
void Command::Pool::submit(
const VkQueue queue,
const c10::ArrayRef<const Buffer> buffers,
const Resource::Fence fence) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
device_ && command_pool_,
"This command pool is in an invalid state! "
"Potential reason: This command pool is moved from.");
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
queue,
"Invalid Vulkan queue!");
c10::SmallVector<VkCommandBuffer, Configuration::kReserve> command_buffers;
command_buffers.reserve(buffers.size());
for (const Buffer& buffer : buffers) {
VkCommandBuffer command_buffer = buffer.handle();
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
command_buffer,
"Invalid Vulkan command buffer!");
// Are we submitting our one and only command stream, or a regular command
// buffer whose scope is manually maintained by the user? Automatically
// maintain state and submission rate if the former.
if (stream_.buffer.handle() == command_buffer) {
// Hand the stream off to the driver if:
// - The user has implicitly signaled interest in the results via a fence.
// - We are over the submission cutoff. We don't want to starve the GPU.
if (fence || (stream_.counter++ > Configuration::kSubmit)) {
stream_.buffer.end();
stream_.buffer.invalidate();
}
// Skip - Accumulate more calls prior to submission.
else {
command_buffer = VK_NULL_HANDLE;
}
}
if (command_buffer) {
command_buffers.push_back(command_buffer);
}
}
if (!command_buffers.empty()) {
const VkSubmitInfo submit_info{
VK_STRUCTURE_TYPE_SUBMIT_INFO,
nullptr,
0u,
nullptr,
nullptr,
command_buffers.size(),
command_buffers.data(),
0u,
nullptr,
};
VK_CHECK(vkQueueSubmit(queue, 1u, &submit_info, fence.handle()));
}
}
void Command::Pool::invalidate() {
device_ = VK_NULL_HANDLE;
command_pool_.reset();
}
} // namespace api
} // namespace vulkan
} // namespace native
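
Command::Pool::submit above folds every pending command buffer into a single VkSubmitInfo, so the driver sees one vkQueueSubmit per flush rather than one per operator. A hedged sketch of that batching pattern in plain Vulkan follows; it assumes a valid queue and already-recorded command buffers, and reduces error handling to the returned VkResult.

// Batching several recorded command buffers into a single vkQueueSubmit call.
#include <vulkan/vulkan.h>
#include <vector>

VkResult submit_batched(
    const VkQueue queue,
    const std::vector<VkCommandBuffer>& command_buffers,
    const VkFence fence /* may be VK_NULL_HANDLE */) {
  if (command_buffers.empty()) {
    return VK_SUCCESS;  // nothing to hand to the driver
  }
  const VkSubmitInfo submit_info{
    VK_STRUCTURE_TYPE_SUBMIT_INFO,
    nullptr,                                        // pNext
    0u, nullptr, nullptr,                           // no wait semaphores
    static_cast<uint32_t>(command_buffers.size()),  // commandBufferCount
    command_buffers.data(),                         // pCommandBuffers
    0u, nullptr,                                    // no signal semaphores
  };
  // One driver round trip for the whole batch instead of one per buffer.
  return vkQueueSubmit(queue, 1u, &submit_info, fence);
}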

View File

@ -7,6 +7,7 @@
#include <ATen/native/vulkan/api/Pipeline.h>
#include <ATen/native/vulkan/api/Resource.h>
#include <ATen/native/vulkan/api/Shader.h>
#include <c10/util/ArrayRef.h>
namespace at {
namespace native {
@ -14,13 +15,15 @@ namespace vulkan {
namespace api {
struct Command final {
class Pool;
//
// Buffer
//
class Buffer final {
public:
Buffer(VkCommandBuffer command_buffer = VK_NULL_HANDLE);
explicit Buffer(VkCommandBuffer command_buffer = VK_NULL_HANDLE);
Buffer(const Buffer&) = delete;
Buffer& operator=(const Buffer&) = delete;
Buffer(Buffer&&);
@ -28,18 +31,22 @@ struct Command final {
~Buffer() = default;
operator bool() const;
VkCommandBuffer handle() const;
void begin();
void end();
void barrier(const Pipeline::Barrier& barrier);
void bind(const Pipeline::Object& pipeline);
void bind(const Descriptor::Set& set);
void copy(Resource::Buffer::Object source, Resource::Buffer::Object destination);
void dispatch(const Shader::WorkGroup& global_work_group);
void submit(VkQueue queue, Resource::Fence fence = {});
private:
friend class Pool;
void barrier();
void invalidate();
private:
VkCommandBuffer command_buffer_;
@ -80,12 +87,22 @@ struct Command final {
~Pool();
Buffer allocate();
Buffer& stream();
void purge();
void submit(
VkQueue queue,
c10::ArrayRef<const Buffer> buffers,
Resource::Fence fence = {});
private:
void invalidate();
private:
struct Configuration final {
static constexpr uint32_t kQuantum = 64u;
static constexpr uint32_t kReserve = 1024u;
static constexpr uint32_t kQuantum = 4u;
static constexpr uint32_t kReserve = 16u;
static constexpr uint32_t kSubmit = 10u;
};
VkDevice device_;
@ -95,6 +112,11 @@ struct Command final {
std::vector<VkCommandBuffer> pool;
size_t in_use;
} buffer_;
struct {
Buffer buffer;
uint32_t counter;
} stream_;
} pool /* [thread_count] */;
explicit Command(const GPU& gpu)
@ -106,43 +128,12 @@ struct Command final {
// Impl
//
inline Command::Buffer::Buffer(Buffer&& buffer)
: command_buffer_(std::move(buffer.command_buffer_)),
bound_(std::move(buffer.bound_)),
barriers_(std::move(buffer.barriers_)) {
buffer.command_buffer_ = VK_NULL_HANDLE;
}
inline Command::Buffer& Command::Buffer::operator=(Buffer&& buffer) {
if (&buffer != this) {
command_buffer_ = std::move(buffer.command_buffer_);
bound_ = std::move(buffer.bound_);
barriers_ = std::move(buffer.barriers_);
buffer.command_buffer_ = VK_NULL_HANDLE;
};
return *this;
}
inline Command::Buffer::operator bool() const {
return VK_NULL_HANDLE != command_buffer_;
}
inline void Command::Buffer::Bound::reset() {
pipeline = {};
descriptor_set = VK_NULL_HANDLE;
}
inline Command::Buffer::Barrier::Stage::operator bool() const {
return (0u != src) ||
(0u != dst);
}
inline void Command::Buffer::Barrier::reset() {
stage = {};
buffers.clear();
images.clear();
inline VkCommandBuffer Command::Buffer::handle() const {
return command_buffer_;
}
} // namespace api

View File

@ -6,10 +6,17 @@
#ifdef USE_VULKAN_SHADERC_RUNTIME
#include <ATen/native/vulkan/glsl.h>
#define VK_KERNEL(name) { name##_glsl, }
#define VK_KERNEL(name) \
::at::native::vulkan::api::Shader::Descriptor{ \
name##_glsl, \
}
#else
#include <ATen/native/vulkan/spv.h>
#define VK_KERNEL(name) { name##_spv, name##_spv_len, }
#define VK_KERNEL(name) \
::at::native::vulkan::api::Shader::Descriptor{ \
name##_spv, \
name##_spv_len, \
}
#endif /* USE_VULKAN_SHADERC_RUNTIME */
#ifdef USE_VULKAN_WRAPPER
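
A plausible reason for spelling out the Shader::Descriptor type inside VK_KERNEL (an inference, not something stated in the diff) is that a bare braced-init-list only works where the destination type is already fixed, while a typed temporary also survives template deduction and auto. The standalone illustration below uses made-up stand-ins; Descriptor, demo_spv, and demo_spv_len are hypothetical, not the PyTorch symbols.

// Why naming the type in the macro matters: a braced list has no type of its
// own, so it cannot be passed through a deduced template parameter.
#include <cstddef>
#include <cstdint>

struct Descriptor final {
  const uint32_t* spirv;
  std::size_t size;
};

#define MAKE_KERNEL_BARE(name)  { name##_spv, name##_spv_len, }
#define MAKE_KERNEL_TYPED(name) Descriptor{ name##_spv, name##_spv_len, }

static const uint32_t demo_spv[] = {0x07230203u};  // SPIR-V magic number only
static const std::size_t demo_spv_len = sizeof(demo_spv);

template <typename T>
std::size_t payload_size(const T& descriptor) {
  return descriptor.size;
}

int main() {
  // payload_size(MAKE_KERNEL_BARE(demo));  // would not compile: no type to deduce
  return (payload_size(MAKE_KERNEL_TYPED(demo)) == sizeof(demo_spv)) ? 0 : 1;
}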

View File

@ -43,6 +43,40 @@ VkDevice create_device(
&queue_priorities,
};
uint32_t device_extension_properties_count = 0;
VK_CHECK(vkEnumerateDeviceExtensionProperties(
physical_device,
nullptr,
&device_extension_properties_count,
nullptr));
std::vector<VkExtensionProperties> device_extension_properties(
device_extension_properties_count);
VK_CHECK(vkEnumerateDeviceExtensionProperties(
physical_device,
nullptr,
&device_extension_properties_count,
device_extension_properties.data()));
constexpr const char* const requested_device_extensions[]{
#ifdef VK_KHR_portability_subset
// https://vulkan.lunarg.com/doc/view/1.2.162.0/mac/1.2-extensions/vkspec.html#VUID-VkDeviceCreateInfo-pProperties-04451
VK_KHR_PORTABILITY_SUBSET_EXTENSION_NAME,
#endif
};
std::vector<const char*> enabled_device_extensions;
for (const auto& requested_device_extension : requested_device_extensions) {
for (const auto& extension : device_extension_properties) {
if (strcmp(requested_device_extension, extension.extensionName) == 0) {
enabled_device_extensions.push_back(requested_device_extension);
break;
}
}
}
const VkDeviceCreateInfo device_create_info{
VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO,
nullptr,
@ -51,7 +85,8 @@ VkDevice create_device(
&device_queue_create_info,
0u,
nullptr,
0u,
static_cast<uint32_t>(enabled_device_extensions.size()),
enabled_device_extensions.data(),
nullptr,
};
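
The new device-extension handshake above follows the usual Vulkan pattern: enumerate what the implementation reports, intersect that with a requested list, and enable only the intersection (here, VK_KHR_portability_subset when the driver exposes it). A minimal standalone version of the filtering step, stripped of Vulkan types so it compiles on its own:

// Enable only the requested names that the driver actually reports.
#include <cstring>
#include <iostream>
#include <vector>

std::vector<const char*> filter_supported(
    const std::vector<const char*>& requested,
    const std::vector<const char*>& available) {
  std::vector<const char*> enabled;
  for (const char* const request : requested) {
    for (const char* const candidate : available) {
      if (0 == std::strcmp(request, candidate)) {
        enabled.push_back(request);  // supported, safe to pass to the driver
        break;
      }
    }
  }
  return enabled;
}

int main() {
  const std::vector<const char*> requested{"VK_KHR_portability_subset"};
  const std::vector<const char*> available{
      "VK_KHR_swapchain", "VK_KHR_portability_subset"};
  for (const char* const name : filter_supported(requested, available)) {
    std::cout << name << "\n";
  }
  return 0;
}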

View File

@ -128,27 +128,25 @@ Descriptor::Set::Set(
"Invalid Vulkan descriptor set!");
}
void Descriptor::Set::update(const Item& item) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
device_ && descriptor_set_,
"This descriptor set is in an invalid state! "
"Potential reason: This descriptor set is moved from.");
Descriptor::Set::Set(Set&& set)
: device_(std::move(set.device_)),
descriptor_set_(std::move(set.descriptor_set_)),
shader_layout_signature_(std::move(set.shader_layout_signature_)),
bindings_(std::move(set.bindings_)) {
set.invalidate();
}
const auto items_itr = std::find_if(
bindings_.items.begin(),
bindings_.items.end(),
[binding = item.binding](const Item& other) {
return other.binding == binding;
});
Descriptor::Set& Descriptor::Set::operator=(Set&& set) {
if (&set != this) {
device_ = std::move(set.device_);
descriptor_set_ = std::move(set.descriptor_set_);
shader_layout_signature_ = std::move(set.shader_layout_signature_);
bindings_ = std::move(set.bindings_);
if (bindings_.items.end() == items_itr) {
bindings_.items.emplace_back(item);
}
else {
*items_itr = item;
}
set.invalidate();
};
bindings_.dirty = true;
return *this;
}
Descriptor::Set& Descriptor::Set::bind(
@ -276,12 +274,39 @@ VkDescriptorSet Descriptor::Set::handle() const {
return descriptor_set_;
}
void Descriptor::Set::invalidate() {
device_ = VK_NULL_HANDLE;
descriptor_set_ = VK_NULL_HANDLE;
}
void Descriptor::Set::update(const Item& item) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
device_ && descriptor_set_,
"This descriptor set is in an invalid state! "
"Potential reason: This descriptor set is moved from.");
const auto items_itr = std::find_if(
bindings_.items.begin(),
bindings_.items.end(),
[binding = item.binding](const Item& other) {
return other.binding == binding;
});
if (bindings_.items.end() == items_itr) {
bindings_.items.emplace_back(item);
}
else {
*items_itr = item;
}
bindings_.dirty = true;
}
Descriptor::Pool::Pool(const GPU& gpu)
: device_(gpu.device),
descriptor_pool_(
create_descriptor_pool(gpu.device),
VK_DELETER(DescriptorPool)(device_)),
set_{} {
VK_DELETER(DescriptorPool)(device_)) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
device_,
"Invalid Vulkan device!");
@ -295,7 +320,7 @@ Descriptor::Pool::Pool(Pool&& pool)
: device_(std::move(pool.device_)),
descriptor_pool_(std::move(pool.descriptor_pool_)),
set_(std::move(pool.set_)) {
pool.device_ = VK_NULL_HANDLE;
pool.invalidate();
}
Descriptor::Pool& Descriptor::Pool::operator=(Pool&& pool) {
@ -304,7 +329,7 @@ Descriptor::Pool& Descriptor::Pool::operator=(Pool&& pool) {
descriptor_pool_ = std::move(pool.descriptor_pool_);
set_ = std::move(pool.set_);
pool.device_ = VK_NULL_HANDLE;
pool.invalidate();
};
return *this;
@ -371,8 +396,13 @@ void Descriptor::Pool::purge() {
"This descriptor pool is in an invalid state! "
"Potential reason: This descriptor pool is moved from.");
set_.layouts.clear();
VK_CHECK(vkResetDescriptorPool(device_, descriptor_pool_.get(), 0u));
set_.layouts.clear();
}
void Descriptor::Pool::invalidate() {
device_ = VK_NULL_HANDLE;
descriptor_pool_.reset();
}
} // namespace api
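
Several wrappers touched by this commit (Command::Buffer, Command::Pool, Descriptor::Set, Descriptor::Pool, Resource::Pool) converge on the same move semantics: the move constructor and move assignment delegate to a private invalidate() so the moved-from object ends up in one well-defined hollow state. A generic sketch of that idiom, using an illustrative Handle alias rather than real Vulkan handle types:

// Invalidate-on-move idiom: teardown code can tell live objects from hollow ones.
#include <utility>

using Handle = void*;  // stand-in for VkDescriptorSet, VkCommandBuffer, ...
constexpr Handle kNullHandle = nullptr;

class Wrapper final {
 public:
  explicit Wrapper(const Handle handle) : handle_(handle) {}
  Wrapper(const Wrapper&) = delete;
  Wrapper& operator=(const Wrapper&) = delete;

  Wrapper(Wrapper&& other) : handle_(std::move(other.handle_)) {
    other.invalidate();  // single place that defines the moved-from state
  }

  Wrapper& operator=(Wrapper&& other) {
    if (&other != this) {
      handle_ = std::move(other.handle_);
      other.invalidate();
    }
    return *this;
  }

  explicit operator bool() const { return kNullHandle != handle_; }

 private:
  void invalidate() { handle_ = kNullHandle; }
  Handle handle_;
};

int main() {
  Wrapper a{reinterpret_cast<Handle>(0x1)};
  Wrapper b{std::move(a)};
  return (!a && b) ? 0 : 1;  // a is hollow, b owns the handle
}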

View File

@ -73,6 +73,9 @@ struct Descriptor final {
VkDescriptorSet handle() const;
private:
void invalidate();
private:
struct Item final {
uint32_t binding;
@ -113,6 +116,9 @@ struct Descriptor final {
Set allocate(const Shader::Layout::Object& shader_layout);
void purge();
private:
void invalidate();
private:
struct Configuration final {
static constexpr uint32_t kQuantum = 16u;
@ -137,33 +143,6 @@ struct Descriptor final {
}
};
//
// Impl
//
inline Descriptor::Set::Set(Set&& set)
: device_(std::move(set.device_)),
descriptor_set_(std::move(set.descriptor_set_)),
shader_layout_signature_(std::move(set.shader_layout_signature_)),
bindings_(std::move(set.bindings_)) {
set.device_ = VK_NULL_HANDLE;
set.descriptor_set_ = VK_NULL_HANDLE;
}
inline Descriptor::Set& Descriptor::Set::operator=(Set&& set) {
if (&set != this) {
device_ = std::move(set.device_);
descriptor_set_ = std::move(set.descriptor_set_);
shader_layout_signature_ = std::move(set.shader_layout_signature_);
bindings_ = std::move(set.bindings_);
set.device_ = VK_NULL_HANDLE;
set.descriptor_set_ = VK_NULL_HANDLE;
};
return *this;
}
} // namespace api
} // namespace vulkan
} // namespace native

View File

@ -169,6 +169,10 @@ Pipeline::Cache::Cache(Factory factory)
: cache_(std::move(factory)) {
}
void Pipeline::Cache::purge() {
cache_.purge();
}
} // namespace api
} // namespace vulkan
} // namespace native

View File

@ -196,7 +196,11 @@ inline Pipeline::Barrier::operator bool() const {
inline bool operator==(
const Pipeline::Layout::Descriptor& _1,
const Pipeline::Layout::Descriptor& _2) {
return (_1.descriptor_set_layout == _2.descriptor_set_layout);
static_assert(
std::is_trivially_copyable<Pipeline::Layout::Descriptor>::value,
"This implementation is no longer valid!");
return (0 == memcmp(&_1, &_2, sizeof(Pipeline::Layout::Descriptor)));
}
inline size_t Pipeline::Layout::Factory::Hasher::operator()(
@ -207,9 +211,11 @@ inline size_t Pipeline::Layout::Factory::Hasher::operator()(
inline bool operator==(
const Pipeline::Descriptor& _1,
const Pipeline::Descriptor& _2) {
return (_1.pipeline_layout == _2.pipeline_layout) &&
(_1.shader_module == _2.shader_module) &&
(_1.local_work_group == _2.local_work_group);
static_assert(
std::is_trivially_copyable<Pipeline::Descriptor>::value,
"This implementation is no longer valid!");
return (0 == memcmp(&_1, &_2, sizeof(Pipeline::Descriptor)));
}
inline size_t Pipeline::Factory::Hasher::operator()(
@ -236,10 +242,6 @@ inline Pipeline::Object Pipeline::Cache::retrieve(
};
}
inline void Pipeline::Cache::purge() {
cache_.purge();
}
} // namespace api
} // namespace vulkan
} // namespace native
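
The equality operators in this header (and in Resource.h and Shader.h below) switch from member-wise comparison to a memcmp over the whole struct, guarded by a static_assert so the shortcut gets revisited if the type ever stops being trivially copyable. A standalone sketch of the pattern with an illustrative, padding-free Key type:

// memcmp-based equality for a trivially copyable cache key.
#include <cstring>
#include <type_traits>

struct Key final {
  int width;
  int height;
  int depth;
};

inline bool operator==(const Key& a, const Key& b) {
  static_assert(
      std::is_trivially_copyable<Key>::value,
      "This implementation is no longer valid!");
  return 0 == std::memcmp(&a, &b, sizeof(Key));
}

int main() {
  const Key a{8, 8, 4};
  const Key b{8, 8, 4};
  return (a == b) ? 0 : 1;
}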

View File

@ -192,6 +192,11 @@ VkFence Resource::Fence::handle(const bool add_to_waitlist) const {
"Invalid Vulkan fence!");
const VkFence fence = pool->fence_.pool[id].get();
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
fence,
"Invalid Vulkan fence!");
if (add_to_waitlist) {
pool->fence_.waitlist.push_back(fence);
}
@ -360,14 +365,13 @@ Resource::Pool::Pool(
: device_(gpu.device),
allocator_(
create_allocator(
gpu.adapter->runtime->instance(),
gpu.adapter->handle,
device_),
gpu.adapter->runtime->instance(),
gpu.adapter->handle,
device_),
vmaDestroyAllocator),
memory_{
std::move(policy),
},
buffer_{},
image_{
.sampler = Image::Sampler{gpu},
},
@ -377,6 +381,31 @@ Resource::Pool::Pool(
fence_.pool.reserve(Configuration::kReserve);
}
Resource::Pool::Pool(Pool&& pool)
: device_(std::move(pool.device_)),
allocator_(std::move(pool.allocator_)),
memory_(std::move(pool.memory_)),
buffer_(std::move(pool.buffer_)),
image_(std::move(pool.image_)),
fence_(std::move(pool.fence_)) {
pool.invalidate();
}
Resource::Pool& Resource::Pool::operator=(Pool&& pool) {
if (&pool != this) {
device_ = std::move(pool.device_);
allocator_ = std::move(pool.allocator_);
memory_ = std::move(pool.memory_);
buffer_ = std::move(pool.buffer_);
image_ = std::move(pool.image_);
fence_ = std::move(pool.fence_);
pool.invalidate();
};
return *this;
}
Resource::Pool::~Pool() {
try {
if (device_ && allocator_) {
@ -394,31 +423,6 @@ Resource::Pool::~Pool() {
}
}
Resource::Pool::Pool(Pool&& pool)
: device_(std::move(pool.device_)),
allocator_(std::move(pool.allocator_)),
memory_(std::move(pool.memory_)),
buffer_(std::move(pool.buffer_)),
image_(std::move(pool.image_)),
fence_(std::move(pool.fence_)) {
pool.device_ = VK_NULL_HANDLE;
}
Resource::Pool& Resource::Pool::operator=(Pool&& pool) {
if (&pool != this) {
device_ = std::move(pool.device_);
allocator_ = std::move(pool.allocator_);
memory_ = std::move(pool.memory_);
buffer_ = std::move(pool.buffer_);
image_ = std::move(pool.image_);
fence_ = std::move(pool.fence_);
pool.device_ = VK_NULL_HANDLE;
};
return *this;
}
Resource::Buffer Resource::Pool::buffer(
const Buffer::Descriptor& descriptor) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
@ -678,6 +682,11 @@ void Resource::Pool::purge() {
buffer_.pool.clear();
}
void Resource::Pool::invalidate() {
device_ = VK_NULL_HANDLE;
allocator_.reset();
}
} // namespace api
} // namespace vulkan
} // namespace native

View File

@ -20,15 +20,6 @@ struct Resource final {
//
struct Memory final {
/*
Barrier
*/
struct Barrier final {
VkAccessFlags src;
VkAccessFlags dst;
};
/*
Descriptor
*/
@ -39,8 +30,18 @@ struct Resource final {
VkMemoryPropertyFlags /* optional */ preferred;
};
VmaAllocator allocator;
VmaAllocation allocation;
/*
Barrier
*/
struct Barrier final {
VkAccessFlags src;
VkAccessFlags dst;
};
/*
Access
*/
struct Access final {
typedef uint8_t Flags;
@ -74,6 +75,9 @@ struct Resource final {
typename Pointer = Access::Pointer<Type, kAccess>>
Handle<Pointer> map() &;
VmaAllocator allocator;
VmaAllocation allocation;
private:
// Intentionally disabled to ensure memory access is always properly
// encapsulated in a scoped map-unmap region. Allowing below overloads
@ -299,6 +303,8 @@ struct Resource final {
private:
friend struct Fence;
void invalidate();
private:
struct Configuration final {
static constexpr uint32_t kReserve = 256u;
@ -353,7 +359,8 @@ class Resource::Memory::Scope final {
template<typename, typename Pointer>
inline Resource::Memory::Handle<Pointer> Resource::Memory::map() const & {
void* map(const Memory& memory, Access::Flags);
// Forward declaration
void* map(const Memory&, Access::Flags);
return Handle<Pointer>{
reinterpret_cast<Pointer>(map(*this, Access::Read)),
@ -363,7 +370,8 @@ inline Resource::Memory::Handle<Pointer> Resource::Memory::map() const & {
template<typename, Resource::Memory::Access::Flags kAccess, typename Pointer>
inline Resource::Memory::Handle<Pointer> Resource::Memory::map() & {
void* map(const Memory& memory, Access::Flags);
// Forward declaration
void* map(const Memory&, Access::Flags);
static_assert(
(kAccess == Access::Read) ||
@ -388,10 +396,11 @@ inline Resource::Buffer::operator bool() const {
inline bool operator==(
const Resource::Image::Sampler::Descriptor& _1,
const Resource::Image::Sampler::Descriptor& _2) {
return (_1.filter == _2.filter) &&
(_1.mipmap_mode == _2.mipmap_mode) &&
(_1.address_mode == _2.address_mode) &&
(_1.border == _2.border);
static_assert(
std::is_trivially_copyable<Resource::Image::Sampler::Descriptor>::value,
"This implementation is no longer valid!");
return (0 == memcmp(&_1, &_2, sizeof(Resource::Image::Sampler::Descriptor)));
}
inline size_t Resource::Image::Sampler::Factory::Hasher::operator()(

View File

@ -86,7 +86,9 @@ VkInstance create_instance(const Runtime::Type type) {
nullptr, &instance_extension_count, instance_extension_properties.data()));
constexpr const char* const requested_instance_extensions[]{
#ifdef VK_EXT_debug_report
VK_EXT_DEBUG_REPORT_EXTENSION_NAME,
#endif
};
for (const auto& requested_instance_extension : requested_instance_extensions) {

View File

@ -33,10 +33,7 @@ class Runtime final {
Runtime& operator=(Runtime&&) = default;
~Runtime() = default;
inline VkInstance instance() const {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(instance_);
return instance_.get();
}
VkInstance instance() const;
typedef std::function<bool (const Adapter&)> Selector;
Adapter select(const Selector& selector);
@ -59,6 +56,15 @@ class Runtime final {
Runtime* runtime();
//
// Impl
//
inline VkInstance Runtime::instance() const {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(instance_);
return instance_.get();
}
} // namespace api
} // namespace vulkan
} // namespace native

View File

@ -60,6 +60,10 @@ Shader::Layout::Cache::Cache(Factory factory)
: cache_(std::move(factory)) {
}
void Shader::Layout::Cache::purge() {
cache_.purge();
}
#ifdef USE_VULKAN_SHADERC_RUNTIME
struct Shader::Factory::Compiler final {

View File

@ -218,16 +218,14 @@ inline Shader::Layout::Object Shader::Layout::Cache::retrieve(
};
}
inline void Shader::Layout::Cache::purge() {
cache_.purge();
}
inline bool operator==(
const Shader::WorkGroup& _1,
const Shader::WorkGroup& _2) {
return (_1.data[0u] == _2.data[0u]) &&
(_1.data[1u] == _2.data[1u]) &&
(_1.data[2u] == _2.data[2u]);
static_assert(
std::is_trivially_copyable<Shader::WorkGroup>::value,
"This implementation is no longer valid!");
return (0 == memcmp(&_1, &_2, sizeof(Shader::WorkGroup)));
}
inline Shader::Descriptor::Descriptor(const char* const glsl)
@ -258,12 +256,10 @@ inline bool operator==(
const Shader::Descriptor& _1,
const Shader::Descriptor& _2) {
static_assert(
sizeof(Shader::Descriptor::shader.source) == sizeof(Shader::Descriptor::shader.binary),
"This implementation requires sizeof(Source) to be equal to sizeof(Binary).");
std::is_trivially_copyable<Shader::Descriptor>::value,
"This implementation is no longer valid!");
return (_1.type == _2.type) &&
(_1.shader.binary.spirv == _2.shader.binary.spirv) &&
(_1.shader.binary.size == _2.shader.binary.size);
return (0 == memcmp(&_1, &_2, sizeof(Shader::Descriptor)));
}
inline size_t Shader::Factory::Hasher::operator()(
@ -286,11 +282,11 @@ inline size_t Shader::Factory::Hasher::operator()(
inline bool operator==(
const VkDescriptorSetLayoutBinding& _1,
const VkDescriptorSetLayoutBinding& _2) {
return (_1.binding == _2.binding) &&
(_1.descriptorType == _2.descriptorType) &&
(_1.descriptorCount == _2.descriptorCount) &&
(_1.stageFlags == _2.stageFlags) &&
(_1.pImmutableSamplers == _2.pImmutableSamplers);
static_assert(
std::is_trivially_copyable<VkDescriptorSetLayoutBinding>::value,
"This implementation is no longer valid!");
return (0 == memcmp(&_1, &_2, sizeof(VkDescriptorSetLayoutBinding)));
}
#endif /* USE_VULKAN_API */

View File

@ -24,11 +24,11 @@ Tensor add_scalar(
v_self.options(),
};
api::Command::Buffer command_buffer = context->command().pool.allocate();
command_buffer.begin();
api::Command::Pool& command_pool = context->command().pool;
api::Command::Buffer& command_buffer = command_pool.stream();
{
if (v_output.has_image() && v_self.has_image()) {
const struct {
if C10_LIKELY(v_output.has_image() && v_self.has_image()) {
const struct Block final {
uvec3 extents;
float other;
} block {
@ -64,8 +64,7 @@ Tensor add_scalar(
TORCH_CHECK(false, "Not implemented!");
}
}
command_buffer.end();
command_buffer.submit(context->gpu().queue);
command_pool.submit(context->gpu().queue, command_buffer);
return convert(v_output);
}
@ -82,11 +81,11 @@ Tensor& add_scalar_(
vTensor& v_self = convert(self);
api::Command::Buffer command_buffer = context->command().pool.allocate();
command_buffer.begin();
api::Command::Pool& command_pool = context->command().pool;
api::Command::Buffer& command_buffer = command_pool.stream();
{
if (v_self.has_image()) {
const struct {
if C10_LIKELY(v_self.has_image()) {
const struct Block final {
uvec3 extents;
float other;
} block {
@ -116,8 +115,7 @@ Tensor& add_scalar_(
TORCH_CHECK(false, "Not implemented!");
}
}
command_buffer.end();
command_buffer.submit(context->gpu().queue);
command_pool.submit(context->gpu().queue, command_buffer);
return self;
}
@ -140,11 +138,11 @@ Tensor add_tensor(
v_self.options(),
};
api::Command::Buffer command_buffer = context->command().pool.allocate();
command_buffer.begin();
api::Command::Pool& command_pool = context->command().pool;
api::Command::Buffer& command_buffer = command_pool.stream();
{
if (v_self.has_image() && v_other.has_image()) {
const struct {
if C10_LIKELY(v_self.has_image() && v_other.has_image()) {
const struct Block final {
uvec3 extents;
float alpha;
} block {
@ -186,8 +184,7 @@ Tensor add_tensor(
TORCH_CHECK(false, "Not implemented!");
}
}
command_buffer.end();
command_buffer.submit(context->gpu().queue);
command_pool.submit(context->gpu().queue, command_buffer);
return convert(v_output);
}
@ -207,11 +204,11 @@ Tensor& add_tensor_(
const Tensor other = other_arg.is_vulkan() ? other_arg : other_arg.vulkan();
const vTensor& v_other = convert(other);
api::Command::Buffer command_buffer = context->command().pool.allocate();
command_buffer.begin();
api::Command::Pool& command_pool = context->command().pool;
api::Command::Buffer& command_buffer = command_pool.stream();
{
if (v_self.has_image() && v_other.has_image() && !self.is_same(other)) {
const struct {
if C10_LIKELY(v_self.has_image() && v_other.has_image() && !self.is_same(other)) {
const struct Block final {
uvec3 extents;
float alpha;
} block {
@ -247,8 +244,7 @@ Tensor& add_tensor_(
TORCH_CHECK(false, "Not implemented!");
}
}
command_buffer.end();
command_buffer.submit(context->gpu().queue);
command_pool.submit(context->gpu().queue, command_buffer);
return self;
}

View File

@ -28,11 +28,11 @@ Tensor clamp(
v_self.options(),
};
api::Command::Buffer command_buffer = context->command().pool.allocate();
command_buffer.begin();
api::Command::Pool& command_pool = context->command().pool;
api::Command::Buffer& command_buffer = command_pool.stream();
{
if (v_output.has_image() && v_self.has_image()) {
const struct {
if C10_LIKELY(v_output.has_image() && v_self.has_image()) {
const struct Block final {
uvec3 extents;
uint32_t _;
vec2 clamp;
@ -73,8 +73,7 @@ Tensor clamp(
TORCH_CHECK(false, "Not implemented!");
}
}
command_buffer.end();
command_buffer.submit(context->gpu().queue);
command_pool.submit(context->gpu().queue, command_buffer);
return convert(v_output);
}
@ -95,11 +94,11 @@ Tensor& clamp_(
vTensor& v_self = convert(self);
api::Command::Buffer command_buffer = context->command().pool.allocate();
command_buffer.begin();
api::Command::Pool& command_pool = context->command().pool;
api::Command::Buffer& command_buffer = command_pool.stream();
{
if (v_self.has_image()) {
const struct {
if C10_LIKELY(v_self.has_image()) {
const struct Block final {
uvec3 extents;
uint32_t _;
vec2 clamp;
@ -134,8 +133,7 @@ Tensor& clamp_(
TORCH_CHECK(false, "Not implemented!");
}
}
command_buffer.end();
command_buffer.submit(context->gpu().queue);
command_pool.submit(context->gpu().queue, command_buffer);
return self;
}

View File

@ -35,14 +35,6 @@ struct Layout final {
};
};
struct Experimentation {
static constexpr bool kUseConv2dOldApi = false;
};
struct ConvPrepackLimits final {
static constexpr int64_t maxStackDepth = 2048*4;
};
} // namespace ops
} // namespace vulkan
} // namespace native

View File

@ -1,8 +1,8 @@
#include <ATen/native/vulkan/ops/Convolution.h>
#include <ATen/native/vulkan/api/Utils.h>
#include <ATen/native/ConvUtils.h>
#include <ATen/native/utils/ParamUtils.h>
#include <ATen/native/vulkan/ops/Persistent.h>
#include <ATen/native/vulkan/api/Utils.h>
namespace at {
namespace native {
@ -12,6 +12,10 @@ namespace {
using namespace api::utils;
struct Experimentation final {
static constexpr bool kUseConv2dOldApi = false;
};
inline bool is_depthwise(
const IntArrayRef filter,
const int64_t groups) {
@ -26,47 +30,103 @@ inline bool is_pointwise(const IntArrayRef filter) {
}
vTensor pack_weights_dw(
api::Context* const context,
api::Command::Buffer& command_buffer,
api::Resource::Pool& pool,
const Tensor& weight_arg,
const int64_t groups) {
if (weight_arg.is_vulkan()) {
return convert(weight_arg);
}
const Tensor& weight) {
/* Source */
const Tensor weight = weight_arg.contiguous();
const IntArrayRef src_filter = weight.sizes();
const float* const src_weight_ptr = weight.data_ptr<float>();
const int64_t src_kw_sz = src_filter[Layout::Filter::width];
const int64_t src_kh_sz = src_filter[Layout::Filter::height];
const int64_t src_kernel_sz = src_kw_sz * src_kh_sz;
const int64_t src_block_sz = src_kernel_sz * src_filter[Layout::Filter::input];
const int64_t num_stacks = div_up(src_filter[Layout::Filter::output], INT64_C(4));
/* Destination */
const int64_t dst_kw_sz = src_kernel_sz;
const int64_t dst_kh_sz = num_stacks;
const int64_t dst_kernel_sz = dst_kw_sz * dst_kh_sz;
vTensor v_weight{
api::context(),
context,
&pool,
{
4,
num_stacks,
src_kw_sz * src_kh_sz,
dst_kh_sz,
dst_kw_sz,
},
weight.options(),
};
using Future = vTensor::Future<float, vTensor::Access::Write>;
Future v_weight_future = v_weight.host<float, vTensor::Access::Write>();
Future v_weight_future = v_weight.host<float, vTensor::Access::Write>(command_buffer);
Future::Payload v_weight_payload = v_weight_future.wait();
float* const dst_weight_ptr = v_weight_payload.get();
memset(dst_weight_ptr, 0, v_weight.nbytes());
for (int64_t src_oc = 0; src_oc < src_filter[Layout::Filter::output]; ++src_oc) {
/* Source */
const float* const src_weight_oc_ptr = src_weight_ptr + src_oc * src_block_sz;
/* Destination */
const int64_t dst_oh = src_oc / 4;
const int64_t dst_c = src_oc % 4;
float* const dst_weight_c_ptr = dst_weight_ptr +
dst_c * dst_kernel_sz +
dst_oh * dst_kw_sz;
for (int64_t src_ih = 0; src_ih < src_filter[Layout::Filter::height]; ++src_ih) {
memcpy(
dst_weight_c_ptr + src_ih * src_kw_sz,
src_weight_oc_ptr + src_ih * src_kw_sz,
sizeof(float) * src_kw_sz);
}
}
return v_weight;
}
vTensor pack_weights_2d(
api::Context* const context,
api::Command::Buffer& command_buffer,
api::Resource::Pool& pool,
const Tensor& weight) {
/* Source */
const IntArrayRef src_filter = weight.sizes();
const float* const src_weight_ptr = weight.data_ptr<float>();
const int64_t src_kw_sz = src_filter[Layout::Filter::width];
const int64_t src_kh_sz = src_filter[Layout::Filter::height];
const int64_t src_kernel_sz = src_kw_sz * src_kh_sz;
const int64_t src_block_sz =
src_kernel_sz * src_filter[Layout::Filter::input];
const int64_t src_block_sz = src_kernel_sz * src_filter[Layout::Filter::input];
const int64_t num_stacks = div_up(src_filter[Layout::Filter::output], INT64_C(4));
const int64_t stack_depth = api::utils::align_up(src_filter[Layout::Filter::input], INT64_C(4));
/* Destination */
const int64_t dst_kw_sz = src_kw_sz * src_kh_sz;
const int64_t dst_kh_sz = num_stacks;
const int64_t dst_kw_sz = src_kw_sz * stack_depth;
const int64_t dst_kh_sz = src_kh_sz * num_stacks;
const int64_t dst_kernel_sz = dst_kw_sz * dst_kh_sz;
vTensor v_weight{
context,
&pool,
{
4,
dst_kh_sz,
dst_kw_sz,
},
weight.options(),
};
using Future = vTensor::Future<float, vTensor::Access::Write>;
Future v_weight_future = v_weight.host<float, vTensor::Access::Write>(command_buffer);
Future::Payload v_weight_payload = v_weight_future.wait();
float* const dst_weight_ptr = v_weight_payload.get();
memset(dst_weight_ptr, 0, v_weight.nbytes());
@ -80,26 +140,29 @@ vTensor pack_weights_dw(
float* const dst_weight_c_ptr = dst_weight_ptr + dst_c * dst_kernel_sz;
for (int64_t src_ih = 0; src_ih < src_filter[Layout::Filter::height]; ++src_ih) {
memcpy(
dst_weight_c_ptr + dst_oh * dst_kw_sz + src_ih * src_kw_sz,
src_weight_oc_ptr + src_ih * src_kw_sz,
sizeof(float) * src_kw_sz);
for (int64_t src_ic = 0; src_ic < src_filter[Layout::Filter::input]; ++src_ic) {
const int64_t dst_ic4 = src_ic / 4;
for (int64_t src_ih = 0; src_ih < src_kh_sz; ++src_ih) {
for (int64_t src_iw = 0; src_iw < src_kw_sz; ++src_iw) {
memcpy(
dst_weight_c_ptr + (dst_oh * src_kh_sz + src_ih) * dst_kw_sz +
dst_ic4 * src_kw_sz * 4 + src_iw * 4 + src_ic % 4,
src_weight_oc_ptr + src_ic * src_kernel_sz + src_ih * src_kw_sz + src_iw,
sizeof(float));
}
}
}
}
return v_weight;
}
vTensor pack_weights_old(
vTensor pack_weights_2d_old(
api::Context* const context,
api::Command::Buffer& command_buffer,
api::Resource::Pool& pool,
const Tensor& weight_arg,
const int64_t groups) {
if (weight_arg.is_vulkan()) {
return convert(weight_arg);
}
const Tensor weight = weight_arg.contiguous();
const Tensor& weight) {
const IntArrayRef src_filter = weight.sizes();
const float* const src_weight_ptr = weight.data_ptr<float>();
@ -111,7 +174,7 @@ vTensor pack_weights_old(
const uint32_t KW = src_filter[Layout::Filter::width];
vTensor v_weight{
api::context(),
context,
&pool,
{
1,
@ -123,13 +186,13 @@ vTensor pack_weights_old(
};
using Future = vTensor::Future<float, vTensor::Access::Write>;
Future v_weight_future = v_weight.host<float, vTensor::Access::Write>();
Future v_weight_future = v_weight.host<float, vTensor::Access::Write>(command_buffer);
Future::Payload v_weight_payload = v_weight_future.wait();
float* const dst_weight_ptr = v_weight_payload.get();
memset(dst_weight_ptr, 0, v_weight.nbytes());
const float* src = src_weight_ptr;
const float* const src = src_weight_ptr;
float* const dst = dst_weight_ptr;
{
@ -162,7 +225,7 @@ vTensor pack_weights_old(
dim0_ = dim0;
dim1_ = dim1;
dim2_ = dim2;
data_ = new float[dim0 * dim1 * dim2 * 4];
data_ = new float[dim0 * dim1 * dim2 * 4]; // TODO: memory leak
memset(data_, 0.f, dim0 * dim1 * dim2 * 4 * sizeof(float));
}
@ -211,7 +274,7 @@ vTensor pack_weights_old(
return v_weight;
}
vTensor pack_weights_2d(
vTensor pack_weights(
api::Resource::Pool& pool,
const Tensor& weight_arg,
const int64_t groups) {
@ -219,81 +282,32 @@ vTensor pack_weights_2d(
return convert(weight_arg);
}
api::Context* const context = api::context();
api::Command::Buffer& command_buffer = context->command().pool.stream();
const Tensor weight = weight_arg.contiguous();
const IntArrayRef src_filter = weight.sizes();
const float* const src_weight_ptr = weight.data_ptr<float>();
const int64_t src_kw_sz = src_filter[Layout::Filter::width];
const int64_t src_kh_sz = src_filter[Layout::Filter::height];
const int64_t num_stacks = div_up(src_filter[Layout::Filter::output], INT64_C(4));
const int64_t stack_depth = api::utils::align_up(src_filter[Layout::Filter::input], INT64_C(4));
vTensor v_weight{
api::context(),
&pool,
{
4,
src_kh_sz * num_stacks,
src_kw_sz * stack_depth,
},
weight.options(),
};
using Future = vTensor::Future<float, vTensor::Access::Write>;
Future v_weight_future = v_weight.host<float, vTensor::Access::Write>();
Future::Payload v_weight_payload = v_weight_future.wait();
/* Source */
const int64_t src_kernel_sz = src_kw_sz * src_kh_sz;
const int64_t src_block_sz =
src_kernel_sz * src_filter[Layout::Filter::input];
/* Destination */
const int64_t dst_kw_sz = src_kw_sz * stack_depth;
const int64_t dst_kh_sz = src_kh_sz * num_stacks;
const int64_t dst_kernel_sz = dst_kw_sz * dst_kh_sz;
float* const dst_weight_ptr = v_weight_payload.get();
memset(dst_weight_ptr, 0, v_weight.nbytes());
for (int64_t src_oc = 0; src_oc < src_filter[Layout::Filter::output]; ++src_oc) {
/* Source */
const float* const src_weight_oc_ptr = src_weight_ptr + src_oc * src_block_sz;
/* Destination */
const int64_t dst_oh = src_oc / 4;
const int64_t dst_c = src_oc % 4;
float* const dst_weight_c_ptr = dst_weight_ptr + dst_c * dst_kernel_sz;
for (int64_t src_ic = 0; src_ic < src_filter[Layout::Filter::input]; ++src_ic) {
const int64_t dst_ic4 = src_ic/4;
for (int64_t src_ih = 0; src_ih < src_kh_sz; ++src_ih) {
for (int64_t src_iw = 0; src_iw < src_kw_sz; ++src_iw) {
memcpy(
dst_weight_c_ptr + (dst_oh * src_kh_sz + src_ih) * dst_kw_sz +
dst_ic4 * src_kw_sz * 4 + src_iw * 4 + src_ic % 4,
src_weight_oc_ptr + src_ic * src_kernel_sz + src_ih * src_kw_sz + src_iw,
sizeof(float));
}
}
}
}
return v_weight;
}
vTensor pack_weights(
api::Resource::Pool& pool,
const Tensor& weight_arg,
const int64_t groups) {
if (is_depthwise(weight_arg.sizes(), groups)) {
return pack_weights_dw(pool, weight_arg, groups);
if (is_depthwise(weight.sizes(), groups)) {
return pack_weights_dw(
context,
command_buffer,
pool,
weight);
}
if (Experimentation::kUseConv2dOldApi) {
return pack_weights_old(pool, weight_arg, groups);
return pack_weights_2d_old(
context,
command_buffer,
pool,
weight);
}
return pack_weights_2d(pool, weight_arg, groups);
return pack_weights_2d(
context,
command_buffer,
pool,
weight);
}
vTensor pack_biases(
@ -304,8 +318,11 @@ vTensor pack_biases(
return convert(*bias);
}
api::Context* const context = api::context();
api::Command::Buffer& command_buffer = context->command().pool.stream();
vTensor v_bias{
api::context(),
context,
&pool,
{
// 1D
@ -316,7 +333,7 @@ vTensor pack_biases(
{
using Future = vTensor::Future<void, vTensor::Access::Write>;
Future v_bias_future = v_bias.host<void, vTensor::Access::Write>();
Future v_bias_future = v_bias.host<void, vTensor::Access::Write>(command_buffer);
Future::Payload v_bias_payload = v_bias_future.wait();
if (bias) {
@ -394,7 +411,8 @@ bool available(
(c10::DeviceType::Vulkan == bias->device().type())) &&
(kFloat == bias->scalar_type()) &&
(transposed ? false /* to be added in the future */
: (weight.size(Layout::Filter::output) == bias->size(Layout::Filter::output))))
: (weight.size(Layout::Filter::output) ==
bias->size(Layout::Filter::output))))
: true) &&
// Stride
(stride[Layout::Parameter::height] > 0) &&
@ -432,7 +450,7 @@ bool usable(const Tensor& input) {
true;
}
void conv2d_depthwise(
void conv2d_dw(
api::Context* const context,
api::Command::Buffer& command_buffer,
vTensor& v_output,
@ -446,27 +464,39 @@ void conv2d_depthwise(
const IntArrayRef dilation,
const float output_min,
const float output_max) {
if (v_output.has_image() && v_input.has_image() && v_weight.has_image()) {
const struct {
int32_t kernel_x, kernel_y;
int32_t stride_x, stride_y;
int32_t padding_x, padding_y;
int32_t dilate_x, dilate_y;
float clamp_x, clamp_y;
int32_t src_filter_w, src_filter_h;
if C10_LIKELY(v_output.has_image() && v_input.has_image() && v_weight.has_image()) {
const struct Block final {
ivec2 kernel;
ivec2 stride;
ivec2 padding;
ivec2 dilate;
vec2 clamp;
ivec2 src_filter;
} block {
safe_downcast<int32_t>(filter[Layout::Filter::width]),
safe_downcast<int32_t>(filter[Layout::Filter::height]),
safe_downcast<int32_t>(stride[Layout::Parameter::width]),
safe_downcast<int32_t>(stride[Layout::Parameter::height]),
safe_downcast<int32_t>(padding[Layout::Parameter::width]),
safe_downcast<int32_t>(padding[Layout::Parameter::height]),
safe_downcast<int32_t>(dilation[Layout::Parameter::width]),
safe_downcast<int32_t>(dilation[Layout::Parameter::height]),
output_min,
output_max,
safe_downcast<int32_t>(src_filter[Layout::Filter::width]),
safe_downcast<int32_t>(src_filter[Layout::Filter::height]),
{
safe_downcast<int32_t>(filter[Layout::Filter::width]),
safe_downcast<int32_t>(filter[Layout::Filter::height]),
},
{
safe_downcast<int32_t>(stride[Layout::Parameter::width]),
safe_downcast<int32_t>(stride[Layout::Parameter::height]),
},
{
safe_downcast<int32_t>(padding[Layout::Parameter::width]),
safe_downcast<int32_t>(padding[Layout::Parameter::height]),
},
{
safe_downcast<int32_t>(dilation[Layout::Parameter::width]),
safe_downcast<int32_t>(dilation[Layout::Parameter::height]),
},
{
output_min,
output_max,
},
{
safe_downcast<int32_t>(src_filter[Layout::Filter::width]),
safe_downcast<int32_t>(src_filter[Layout::Filter::height]),
},
};
context->dispatch(
@ -510,7 +540,7 @@ void conv2d_depthwise(
}
}
void conv2d_pointwise(
void conv2d_pw(
api::Context* const context,
api::Command::Buffer& command_buffer,
vTensor& v_output,
@ -522,22 +552,29 @@ void conv2d_pointwise(
const IntArrayRef padding,
const float output_min,
const float output_max) {
if (v_output.has_image() && v_input.has_image() && v_weight.has_image()) {
const struct {
int32_t kernel_ic, kernel_oc;
int32_t stride_x, stride_y;
int32_t padding_x, padding_y;
float clamp_x, clamp_y;
if C10_LIKELY(v_output.has_image() && v_input.has_image() && v_weight.has_image()) {
const struct Block final {
ivec2 kernel;
ivec2 stride;
ivec2 padding;
vec2 clamp;
} block {
safe_downcast<int32_t>(filter[Layout::Filter::input]),
safe_downcast<int32_t>(filter[Layout::Filter::output]),
safe_downcast<int32_t>(stride[Layout::Parameter::width]),
safe_downcast<int32_t>(stride[Layout::Parameter::height]),
safe_downcast<int32_t>(padding[Layout::Parameter::width]),
safe_downcast<int32_t>(padding[Layout::Parameter::height]),
output_min,
output_max,
{
safe_downcast<int32_t>(filter[Layout::Filter::input]),
safe_downcast<int32_t>(filter[Layout::Filter::output]),
},
{
safe_downcast<int32_t>(stride[Layout::Parameter::width]),
safe_downcast<int32_t>(stride[Layout::Parameter::height]),
},
{
safe_downcast<int32_t>(padding[Layout::Parameter::width]),
safe_downcast<int32_t>(padding[Layout::Parameter::height]),
},
{
output_min,
output_max,
},
};
context->dispatch(
@ -595,30 +632,43 @@ void conv2d(
const IntArrayRef dilation,
const float output_min,
const float output_max) {
if (v_output.has_image() && v_input.has_image() && v_weight.has_image()) {
const struct {
int32_t kernel_x, kernel_y, kernel_ic, kernel_oc;
int32_t stride_x, stride_y;
int32_t padding_x, padding_y;
int32_t dilate_x, dilate_y;
float clamp_x, clamp_y;
int32_t src_filter_w, src_filter_h, src_filter_w4;
if C10_LIKELY(v_output.has_image() && v_input.has_image() && v_weight.has_image()) {
const struct Block final {
ivec4 kernel;
ivec2 stride;
ivec2 padding;
ivec2 dilate;
vec2 clamp;
ivec4 src_filter;
} block {
safe_downcast<int32_t>(filter[Layout::Filter::width]),
safe_downcast<int32_t>(filter[Layout::Filter::height]),
safe_downcast<int32_t>(filter[Layout::Filter::input]),
safe_downcast<int32_t>(filter[Layout::Filter::output]),
safe_downcast<int32_t>(stride[Layout::Parameter::width]),
safe_downcast<int32_t>(stride[Layout::Parameter::height]),
safe_downcast<int32_t>(padding[Layout::Parameter::width]),
safe_downcast<int32_t>(padding[Layout::Parameter::height]),
safe_downcast<int32_t>(dilation[Layout::Parameter::width]),
safe_downcast<int32_t>(dilation[Layout::Parameter::height]),
output_min,
output_max,
safe_downcast<int32_t>(src_filter[Layout::Filter::width]),
safe_downcast<int32_t>(src_filter[Layout::Filter::height]),
safe_downcast<int32_t>(src_filter[Layout::Filter::width]*4),
{
safe_downcast<int32_t>(filter[Layout::Filter::width]),
safe_downcast<int32_t>(filter[Layout::Filter::height]),
safe_downcast<int32_t>(filter[Layout::Filter::input]),
safe_downcast<int32_t>(filter[Layout::Filter::output]),
},
{
safe_downcast<int32_t>(stride[Layout::Parameter::width]),
safe_downcast<int32_t>(stride[Layout::Parameter::height]),
},
{
safe_downcast<int32_t>(padding[Layout::Parameter::width]),
safe_downcast<int32_t>(padding[Layout::Parameter::height]),
},
{
safe_downcast<int32_t>(dilation[Layout::Parameter::width]),
safe_downcast<int32_t>(dilation[Layout::Parameter::height]),
},
{
output_min,
output_max,
},
{
safe_downcast<int32_t>(src_filter[Layout::Filter::width]),
safe_downcast<int32_t>(src_filter[Layout::Filter::height]),
safe_downcast<int32_t>(src_filter[Layout::Filter::width] * 4),
0,
},
};
context->dispatch(
@ -662,6 +712,98 @@ void conv2d(
}
}
void conv2d_old(
api::Context* const context,
api::Command::Buffer& command_buffer,
vTensor& v_output,
const vTensor& v_input,
const vTensor& v_weight,
const vTensor& v_bias,
const IntArrayRef filter,
const IntArrayRef stride,
const IntArrayRef padding,
const IntArrayRef dilation,
const float output_min,
const float output_max) {
using namespace api::utils;
if (v_output.has_image() && v_input.has_image() && v_weight.has_image()) {
const int32_t W = v_input.extents().data[0];
const int32_t H = v_input.extents().data[1];
const int32_t C_4 = v_input.extents().data[2];
const int32_t C = 4 * C_4;
const int32_t OW = v_output.extents().data[0];
const int32_t OH = v_output.extents().data[1];
const int32_t OC_4 = v_output.extents().data[2];
const int32_t OC = 4 * OC_4;
const struct Block final {
int32_t padding_x, padding_y;
int32_t kernel_x, kernel_y;
int32_t stride_x, stride_y;
int32_t dilate_x, dilate_y;
int32_t outputSize[4];
int32_t inputSize[4];
float outputMin;
float outputMax;
} block {
safe_downcast<int32_t>(padding[Layout::Parameter::width]),
safe_downcast<int32_t>(padding[Layout::Parameter::height]),
safe_downcast<int32_t>(filter[Layout::Filter::width]),
safe_downcast<int32_t>(filter[Layout::Filter::height]),
safe_downcast<int32_t>(stride[Layout::Parameter::width]),
safe_downcast<int32_t>(stride[Layout::Parameter::height]),
safe_downcast<int32_t>(dilation[Layout::Parameter::width]),
safe_downcast<int32_t>(dilation[Layout::Parameter::height]),
{ OW, OH, OC_4, OC },
{ W, H, C_4, C },
output_min,
output_max,
};
context->dispatch(
command_buffer,
{
VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
},
VK_KERNEL(conv2d_nogroup_clamp),
//VK_KERNEL(conv2d_nogroup_clamp_1x),
v_output.extents(),
// Write-only access bypasses synchronization but inserts appropriate
// barriers if necessary.
v_output.image(
command_buffer,
vTensor::Stage::Compute,
vTensor::Access::Write),
// Read-only access is implied on const tensors and triggers an async
// synchronization if necessary.
v_input.image(
command_buffer,
vTensor::Stage::Compute),
// Read-only access is implied on const tensors and triggers an async
// synchronization if necessary.
v_weight.image(
command_buffer,
vTensor::Stage::Compute),
// Read-only access is implied on const tensors and triggers an async
// synchronization if necessary.
v_bias.buffer(
command_buffer,
vTensor::Stage::Compute),
// Object lifetime is managed by the resource pool.
// It is OK not to keep track of the handle.
context->resource().pool.uniform(block).object);
}
else {
TORCH_CHECK(false, "Not implemented!");
}
}
Tensor convolution(
const Tensor& input,
const Tensor& weight,
@ -781,99 +923,6 @@ Conv2dOpContext Conv2dOpContext::create(
};
}
void conv2d_old(
api::Context* const context,
api::Command::Buffer& command_buffer,
vTensor& v_output,
const vTensor& v_input,
const vTensor& v_weight,
const vTensor& v_bias,
const IntArrayRef filter,
const IntArrayRef stride,
const IntArrayRef padding,
const IntArrayRef dilation,
const float output_min,
const float output_max) {
using namespace api::utils;
if (v_output.has_image() && v_input.has_image() && v_weight.has_image()) {
const int32_t W = v_input.extents().data[0];
const int32_t H = v_input.extents().data[1];
const int32_t C_4 = v_input.extents().data[2];
const int32_t C = 4 * C_4;
const int32_t OW = v_output.extents().data[0];
const int32_t OH = v_output.extents().data[1];
const int32_t OC_4 = v_output.extents().data[2];
const int32_t OC = 4 * OC_4;
const struct {
int32_t padding_x, padding_y;
int32_t kernel_x, kernel_y;
int32_t stride_x, stride_y;
int32_t dilate_x, dilate_y;
int32_t outputSize[4];
int32_t inputSize[4];
float outputMin;
float outputMax;
} block {
safe_downcast<int32_t>(padding[Layout::Parameter::width]),
safe_downcast<int32_t>(padding[Layout::Parameter::height]),
safe_downcast<int32_t>(filter[Layout::Filter::width]),
safe_downcast<int32_t>(filter[Layout::Filter::height]),
safe_downcast<int32_t>(stride[Layout::Parameter::width]),
safe_downcast<int32_t>(stride[Layout::Parameter::height]),
safe_downcast<int32_t>(dilation[Layout::Parameter::width]),
safe_downcast<int32_t>(dilation[Layout::Parameter::height]),
{ OW, OH, OC_4, OC },
{ W, H, C_4, C },
output_min,
output_max,
};
context->dispatch(
command_buffer,
{
VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
},
VK_KERNEL(conv2d_nogroup_clamp),
//VK_KERNEL(conv2d_nogroup_clamp_1x),
v_output.extents(),
// Write-only access bypasses synchronization but inserts appropriate
// barriers if necessary.
v_output.image(
command_buffer,
vTensor::Stage::Compute,
vTensor::Access::Write),
// Read-only access is implied on const tensors and triggers an async
// synchronization if necessary.
v_input.image(
command_buffer,
vTensor::Stage::Compute),
// Read-only access is implied on const tensors and triggers an async
// synchronization if necessary.
v_weight.image(
command_buffer,
vTensor::Stage::Compute),
// Read-only access is implied on const tensors and triggers an async
// synchronization if necessary.
v_bias.buffer(
command_buffer,
vTensor::Stage::Compute),
// Object lifetime is managed by the resource pool.
// It is OK not to keep track of the handle.
context->resource().pool.uniform(block).object);
}
else {
TORCH_CHECK(false, "Not implemented!");
}
}
Tensor Conv2dOpContext::run(const Tensor& input_arg) const {
api::Context* const context = api::context();
@ -896,11 +945,11 @@ Tensor Conv2dOpContext::run(const Tensor& input_arg) const {
input.options(),
};
api::Command::Buffer command_buffer = context->command().pool.allocate();
command_buffer.begin();
api::Command::Pool& command_pool = context->command().pool;
api::Command::Buffer& command_buffer = command_pool.stream();
{
if (is_depthwise(unpacked_.filter, unpacked_.groups)) {
conv2d_depthwise(
conv2d_dw(
context,
command_buffer,
v_output,
@ -932,7 +981,7 @@ Tensor Conv2dOpContext::run(const Tensor& input_arg) const {
packed_.output_max);
} else {
if (is_pointwise(unpacked_.filter)) {
conv2d_pointwise(
conv2d_pw(
context,
command_buffer,
v_output,
@ -964,8 +1013,7 @@ Tensor Conv2dOpContext::run(const Tensor& input_arg) const {
}
}
}
command_buffer.end();
command_buffer.submit(context->gpu().queue);
command_pool.submit(context->gpu().queue, command_buffer);
return convert(v_output);
}
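
The repacking code above sizes its destination with div_up (output channels grouped into stacks of four) and align_up (input depth padded to a multiple of four). Both helpers live in api::utils; the bodies below are the conventional implementations and are offered only as a hedged, standalone sketch.

#include <cstdint>

// Assumed to match api::utils::div_up / align_up; treat these bodies as a
// sketch, not the canonical PyTorch definitions.
constexpr int64_t div_up(const int64_t numerator, const int64_t denominator) {
  return (numerator + denominator - 1) / denominator;
}

constexpr int64_t align_up(const int64_t value, const int64_t alignment) {
  return div_up(value, alignment) * alignment;
}

// E.g. 10 output channels -> 3 stacks of 4; 10 input channels -> depth 12.
static_assert(div_up(10, 4) == 3, "ten output channels need three stacks of four");
static_assert(align_up(10, 4) == 12, "input depth is padded to a multiple of four");

int main() {
  return 0;
}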

View File

@ -6,87 +6,96 @@ namespace vulkan {
namespace ops {
Tensor& copy_(Tensor& self, const Tensor& src) {
// X -> Vulkan
if (at::kVulkan == self.device().type()) {
vTensor& v_self = convert(self);
api::Context* const context = api::context();
// CPU -> Vulkan
if (at::kCPU == src.device().type()) {
// Requesting write-only host access to the tensor never triggers a sync
// as the contents will be overwritten regardless. Having said that,
// appropriate barriers are inserted automatically if WAR or WAW hazards
// are detected. Examples of such scenarios are when any of
// these async operations are ongoing in the background on 'self':
// - On discrete systems:
// * buffer-to-staging transfers
// * staging-to-buffer transfers
// - On UMA, the buffer is an alias for staging and accessible both on host
// and device. Consequently:
// * buffer-to-image NHWC -> NC4HW packing
// * image-to-buffer NC4HW -> NHWC unpacking
api::Command::Pool& command_pool = context->command().pool;
api::Command::Buffer& command_buffer = command_pool.stream();
{
// X -> Vulkan
if (at::kVulkan == self.device().type()) {
vTensor& v_self = convert(self);
using Future = vTensor::Future<void, vTensor::Access::Write>;
Future v_self_future = v_self.host<void, vTensor::Access::Write>();
// Vulkan -> Vulkan
if (at::kVulkan == src.device().type()) {
command_buffer.copy(
// - Read-only access is implied on const tensors. Memory barriers
// are automatically inserted if a RAW hazard is detected.
// - Recording any potential pending sync operations into the same
// command buffer prevents an expensive queue submission.
convert(src).buffer(
command_buffer,
vTensor::Stage::Transfer),
// - Write-only access never triggers a sync as the contents will be
// overwritten regardless. Having said that, appropriate barriers
// are inserted automatically if WAR or WAW hazards are detected.
// - Recording pending sync operations into the same command buffer
// prevents an expensive queue submission.
v_self.buffer(
command_buffer,
vTensor::Stage::Transfer,
vTensor::Access::Write));
// This wait() will be a no-op if no hazards are detected, including the
// obvious, yet important, special case of 'self' being an empty tensor.
command_pool.submit(context->gpu().queue, command_buffer);
}
// CPU -> Vulkan
else {
const Tensor cpu_src = src.device().is_cpu() ? src : src.cpu();
Future::Payload v_self_payload = v_self_future.wait();
// Requesting write-only host access to the tensor never triggers a sync
// as the contents will be overwritten regardless. Having said that,
// appropriate barriers are inserted automatically if WAR or WAW hazards
// are detected. Such a scenario arises, for instance, if any of
// these async operations are ongoing in the background on 'self':
// - On discrete systems:
// * buffer-to-staging transfers
// * staging-to-buffer transfers
// - On UMA, the buffer is an alias for staging and is accessible on both
// host and device. Consequently:
// * buffer-to-image NHWC -> NC4HW packing
// * image-to-buffer NC4HW -> NHWC unpacking
memcpy(
v_self_payload.get(),
src.contiguous().data_ptr<float>(),
std::min(src.nbytes(), self.nbytes()));
using Future = vTensor::Future<void, vTensor::Access::Write>;
Future v_self_future = v_self.host<void, vTensor::Access::Write>(command_buffer);
// Ideally we would have been able to put as much distance as possible
// between requesting the data - a call to host() - and accessing the data
// - a call to wait() - but a local view of the computation graph
// in eager mode makes that optimization non-trivial.
// This wait() will be a no-op if no hazards are detected, including the
// obvious, yet important, special case of 'self' being an empty tensor.
Future::Payload v_self_payload = v_self_future.wait();
memcpy(
v_self_payload.get(),
cpu_src.contiguous().data_ptr<float>(),
std::min(src.nbytes(), self.nbytes()));
}
}
// Vulkan -> Vulkan
// Vulkan -> X
else if (at::kVulkan == src.device().type()) {
api::Command::Buffer command_buffer = api::context()->command().pool.allocate();
command_buffer.begin();
command_buffer.copy(
// - Read-only access is implied on const tensors. Memory barriers
// are automatically inserted if a RAW hazard is detected.
// - Recording any potential pending sync operations into the same
// command buffer prevents an expensive queue submission.
convert(src).buffer(
command_buffer,
vTensor::Stage::Transfer),
// - Write-only access never triggers a sync as the contents will be
// overwritten regardless. Having said that, appropriate barriers
// are inserted automatically if WAR or WAW hazards are detected.
// - Recording pending sync operations into the same command buffer
// prevents an expensive queue submission.
v_self.buffer(
command_buffer,
vTensor::Stage::Transfer,
vTensor::Access::Write));
command_buffer.end();
command_buffer.submit(api::context()->gpu().queue);
}
else {
TORCH_INTERNAL_ASSERT(false, "Unsupported!");
}
}
// Vulkan -> X
else if (at::kVulkan == src.device().type()) {
const vTensor& v_src = convert(src);
{
// Similar notes as above apply, with the additional consideration of
// potential syncs on read accesses. Namely,
// - on discrete systems, if the (staging, buffer, image) trio, or
// - on UMA, if the (buffer, image) duo
// have gone out of sync as a result of one processor writing to one
// resource which is then accessed as another resource type on either
// the same or another processor. The same considerations regarding hazard
// avoidance as above apply.
using Future = vTensor::Future<const void, vTensor::Access::Read>;
const Future v_src_future = v_src.host<const void>();
const vTensor& v_src = convert(src);
// Vulkan -> CPU
if (at::kCPU == self.device().type()) {
if (self.device().is_cpu()) {
// Similar notes as above apply, with the additional consideration of
// potential syncs on read accesses. Namely,
// - on discrete systems, if the (staging, buffer, image) trio, or
// - on UMA, if the (buffer, image) duo
// have gone out of sync as a result of one processor writing to one
// resource which is then accessed as another resource type on either
// the same or another processor. The same considerations regarding hazard
// avoidance as above apply.
using Future = vTensor::Future<const void, vTensor::Access::Read>;
const Future v_src_future = v_src.host<const void>(command_buffer);
// Ideally we would have been able to put as much distance as possible
// between requesting the data - a call to host() - and accessing the data
// - a call to wait() - but a local view of the computation graph
// in eager mode makes that optimization non-trivial.
// This wait() is a no-op if data is not out of sync. More often than
// not though, waits here are expected as the GPU catches up with
// compute submitted from CPU.
@ -99,51 +108,56 @@ Tensor& copy_(Tensor& self, const Tensor& src) {
std::min(src.nbytes(), self.nbytes()));
}
else {
TORCH_INTERNAL_ASSERT(false, "Unsupported!");
TORCH_CHECK(false, "Unsupported!");
}
//
// WARNING
//
// This is not great. We almost never want to flush the GPU pipeline as
// that has far reaching consequences, especially if PyTorch is not the only
// process accessing the GPU. If we have done our job properly, the above
// synchronization mechanisms should be enough to ensure correctness at a more
// modest cost, as there is no need to flush the entirety of jobs in flight
// if one is only interested in waiting on computation affecting a single
// tensor to finish.
//
// Having said that, we still do need to release all pool resources at one
// point per inference run or we will run out of memory otherwise. There is
// no perfect answer to this problem that checks all boxes, which leaves us
// with one of several design decisions:
//
// 1) Use graph mode to gain an understanding of the computation graph,
// itself allowing us to place pool purges intelligently. Best option
// for performance and memory consumption. Not without its downsides if
// flexibility is a top priority.
// 2) If on eager mode, and hence seeing operations one at a time, expose
// this release of resources to the user as a Python / C++ function. This
// makes for suboptimal user experience but is efficient in terms of
// performance.
// 3) If on eager mode, and interested in keeping this bookkeeping transparent
// to the user, release all resources somewhere ... like here. This is
// not ideal since it requires a pipeline flush to make sure these objects
// are not already in use by a workload in flight. Cannot do much better
// within the constraints of this approach. Good for user experience,
// suboptimal for performance.
// 4) If on eager mode, and interested in keeping this bookkeeping transparent
// to the user, and performance does not matter, make CPU and GPU run in
// lockstep. Obviously this is just bad. Mentioned for the sake of
// completeness.
context->flush();
}
else {
TORCH_INTERNAL_ASSERT(
false,
"Invalid code path taken! Either the source or the destination tensor "
"was expected to be Vulkan a tensor! Incorrect dispatch?");
}
//
// WARNING
//
// This is not great. We almost never want to flush the GPU pipeline as
// that has far reaching consequences, especially if PyTorch is not the only
// process accessing the GPU. If we have done our job properly, the above
// synchronization mechanisms should be enough to ensure correctness at a more
// modest cost, as there is no need to flush the entirety of jobs in flight
// if one is only interested in waiting on computation affecting a single
// tensor to finish.
//
// Having said that, we still do need to release all pool resources at one
// point per inference run or we will run out of memory otherwise. There is
// no perfect answer to this problem that checks all boxes, which leaves us
// with one of several design decisions:
//
// 1) Use graph mode to gain an understanding of the computation graph,
// itself allowing us to place pool purges intelligently. Best option
// for performance and memory consumption. Not without its downsides if
// flexibility is a top priority.
// 2) If on eager mode, and hence seeing operations one at a time, expose
// this release of resources to the user as a Python / C++ function. This
// makes for suboptimal user experience but is efficient in terms of
// performance.
// 3) If on eager mode, and interested in keeping this bookkeeping transparent
// to the user, release all resources somewhere ... like here. This is
// not ideal since it requires a pipeline flush to make sure these objects
// are not already in use by a workload in flight. Cannot do much better
// within the constraints of this approach. Good for user experience,
// suboptimal for performance.
// 4) If on eager mode, and interested in keeping this bookkeeping transparent
// to the user, and performance does not matter, make CPU and GPU run in
// lockstep. Obviously this is just bad. Mentioned for the sake of
// completeness.
api::context()->flush();
}
else {
TORCH_INTERNAL_ASSERT(false, "Unsupported!");
}
// No queue submission here. All queue submissions must have been handled
// above either explicitly or as a result of calling tensor.host().
return self;
}
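
For readability, the two host-access patterns used in copy_() above reduce to roughly the following. This is a condensed sketch assuming the vTensor::host(command_buffer) overloads added in this diff; the helper names write_from_cpu and read_to_cpu are hypothetical, and error handling and dtype checks are omitted:

// CPU -> Vulkan: write-only host access records WAR/WAW barriers into
// command_buffer if needed but never waits on the GPU.
void write_from_cpu(
    vTensor& v_dst,
    const Tensor& cpu_src,
    api::Command::Buffer& command_buffer) {
  using Future = vTensor::Future<void, vTensor::Access::Write>;
  Future future = v_dst.host<void, vTensor::Access::Write>(command_buffer);
  Future::Payload payload = future.wait();  // no-op unless a hazard was detected
  memcpy(
      payload.get(),
      cpu_src.contiguous().data_ptr<float>(),
      std::min(cpu_src.nbytes(), v_dst.nbytes()));
}

// Vulkan -> CPU: read access may trigger staging/buffer/image sync, and the
// wait() below blocks while the GPU catches up with previously submitted work.
void read_to_cpu(
    const vTensor& v_src,
    Tensor& cpu_dst,
    api::Command::Buffer& command_buffer) {
  using Future = vTensor::Future<const void, vTensor::Access::Read>;
  const Future future = v_src.host<const void>(command_buffer);
  Future::Payload payload = future.wait();
  memcpy(
      cpu_dst.data_ptr<float>(),
      payload.get(),
      std::min(v_src.nbytes(), cpu_dst.nbytes()));
}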

View File

@ -52,11 +52,11 @@ Tensor mean(
v_input.options(),
};
api::Command::Buffer command_buffer = context->command().pool.allocate();
command_buffer.begin();
api::Command::Pool& command_pool = context->command().pool;
api::Command::Buffer& command_buffer = command_pool.stream();
{
if (v_input.has_image()) {
const struct {
if C10_LIKELY(v_input.has_image()) {
const struct Block final {
uvec3 extents;
int32_t range;
ivec2 iextents;
@ -71,63 +71,35 @@ Tensor mean(
},
};
if (keepdim) {
context->dispatch(
command_buffer,
{
VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
},
VK_KERNEL(mean),
v_output.extents(),
// Write-only access bypasses synchronization but inserts appropriate
// barriers if necessary.
v_output.image(
command_buffer,
vTensor::Stage::Compute,
vTensor::Access::Write),
// Read-only access is implied on const tensors and triggers an async
// synchronization if necessary.
v_input.image(
command_buffer,
vTensor::Stage::Compute),
// Object lifetime is managed by the resource pool.
// It is OK not to keep track of the handle.
context->resource().pool.uniform(block).object);
}
else {
context->dispatch(
command_buffer,
{
VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
},
VK_KERNEL(mean2d),
v_output.extents(),
// Write-only access bypasses synchronization but inserts appropriate
// barriers if necessary.
v_output.image(
command_buffer,
vTensor::Stage::Compute,
vTensor::Access::Write),
// Read-only access is implied on const tensors and triggers an async
// synchronization if necessary.
v_input.image(
command_buffer,
vTensor::Stage::Compute),
// Object lifetime is managed by the resource pool.
// It is OK not to keep track of the handle.
context->resource().pool.uniform(block).object);
}
context->dispatch(
command_buffer,
{
VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
},
keepdim ? VK_KERNEL(mean) : VK_KERNEL(mean2d),
v_output.extents(),
// Write-only access bypasses synchronization but inserts appropriate
// barriers if necessary.
v_output.image(
command_buffer,
vTensor::Stage::Compute,
vTensor::Access::Write),
// Read-only access is implied on const tensors and triggers an async
// synchronization if necessary.
v_input.image(
command_buffer,
vTensor::Stage::Compute),
// Object lifetime is managed by the resource pool.
// It is OK not to keep track of the handle.
context->resource().pool.uniform(block).object);
}
else {
TORCH_CHECK(false, "Not implemented!");
}
}
command_buffer.end();
command_buffer.submit(context->gpu().queue);
command_pool.submit(context->gpu().queue, command_buffer);
return convert(v_output);
}

View File

@ -10,18 +10,21 @@ namespace {
using namespace api::utils;
vTensor pack_weights(
api::Resource::Pool& pool,
const Tensor& weight_arg) {
api::Resource::Pool& pool,
const Tensor& weight_arg) {
if (weight_arg.is_vulkan()) {
return convert(weight_arg);
}
api::Context* const context = api::context();
api::Command::Buffer& command_buffer = context->command().pool.stream();
const Tensor weight = weight_arg.contiguous();
const IntArrayRef w_sizes = weight.sizes();
const float* const src_weight_ptr = weight.data_ptr<float>();
vTensor v_weight{
api::context(),
context,
&pool,
w_sizes,
weight.options(),
@ -29,7 +32,7 @@ vTensor pack_weights(
{
using Future = vTensor::Future<void, vTensor::Access::Write>;
Future v_weight_future = v_weight.host<void, vTensor::Access::Write>();
Future v_weight_future = v_weight.host<void, vTensor::Access::Write>(command_buffer);
Future::Payload v_weight_payload = v_weight_future.wait();
memcpy(
@ -49,16 +52,21 @@ vTensor pack_biases(
return convert(*bias_arg);
}
api::Context* const context = api::context();
api::Command::Buffer& command_buffer = context->command().pool.stream();
vTensor v_bias{
api::context(),
context,
&pool,
{weight_arg.sizes()[Layout::Parameter::width]},
{
weight_arg.size(Layout::Parameter::width),
},
weight_arg.options(),
};
{
using Future = vTensor::Future<void, vTensor::Access::Write>;
Future v_bias_future = v_bias.host<void, vTensor::Access::Write>();
Future v_bias_future = v_bias.host<void, vTensor::Access::Write>(command_buffer);
Future::Payload v_bias_payload = v_bias_future.wait();
if (bias_arg) {
@ -66,7 +74,8 @@ vTensor pack_biases(
v_bias_payload.get(),
bias_arg->contiguous().data_ptr<float>(),
std::min(bias_arg->nbytes(), v_bias.nbytes()));
} else {
}
else {
memset(
v_bias_payload.get(),
// 2's complement integers and IEEE-754 floating point numbers both
@ -162,11 +171,11 @@ Tensor mm(
mat1.options(),
};
api::Command::Buffer command_buffer = context->command().pool.allocate();
command_buffer.begin();
api::Command::Pool& command_pool = context->command().pool;
api::Command::Buffer& command_buffer = command_pool.stream();
{
if (v_mat1.has_image() && v_mat2.has_image()) {
const struct {
if C10_LIKELY(v_mat1.has_image() && v_mat2.has_image()) {
const struct Block final {
uvec3 size;
int32_t K;
} block {
@ -203,12 +212,12 @@ Tensor mm(
// Object lifetime is managed by the resource pool.
// It is OK not to keep track of the handle.
context->resource().pool.uniform(block).object);
} else {
}
else {
TORCH_CHECK(false, "Not implemented!");
}
}
command_buffer.end();
command_buffer.submit(context->gpu().queue);
command_pool.submit(context->gpu().queue, command_buffer);
return convert(v_output);
}
@ -281,14 +290,15 @@ Tensor LinearOpContext::run(
input.options(),
};
api::Command::Buffer command_buffer = context->command().pool.allocate();
command_buffer.begin();
api::Command::Pool& command_pool = context->command().pool;
api::Command::Buffer& command_buffer = command_pool.stream();
{
if (v_output.has_image() &&
if C10_LIKELY(
v_output.has_image() &&
v_input.has_image() &&
packed_.v_weight.has_image() &&
packed_.v_bias.has_image()) {
const struct {
const struct Block final {
uvec3 size;
int32_t K;
vec2 multiplier;
@ -341,8 +351,7 @@ Tensor LinearOpContext::run(
TORCH_CHECK(false, "Not implemented!");
}
}
command_buffer.end();
command_buffer.submit(context->gpu().queue);
command_pool.submit(context->gpu().queue, command_buffer);
return convert(v_output);
}

View File

@ -23,11 +23,11 @@ Tensor mul_scalar(
v_self.options(),
};
api::Command::Buffer command_buffer = context->command().pool.allocate();
command_buffer.begin();
api::Command::Pool& command_pool = context->command().pool;
api::Command::Buffer& command_buffer = command_pool.stream();
{
if (v_output.has_image() && v_self.has_image()) {
const struct {
if C10_LIKELY(v_output.has_image() && v_self.has_image()) {
const struct Block final {
uvec3 extents;
float other;
} block {
@ -63,8 +63,7 @@ Tensor mul_scalar(
TORCH_CHECK(false, "Not implemented!");
}
}
command_buffer.end();
command_buffer.submit(context->gpu().queue);
command_pool.submit(context->gpu().queue, command_buffer);
return convert(v_output);
}
@ -80,11 +79,11 @@ Tensor& mul_scalar_(
vTensor& v_self = convert(self);
api::Command::Buffer command_buffer = context->command().pool.allocate();
command_buffer.begin();
api::Command::Pool& command_pool = context->command().pool;
api::Command::Buffer& command_buffer = command_pool.stream();
{
if (v_self.has_image()) {
const struct {
if C10_LIKELY(v_self.has_image()) {
const struct Block final {
uvec3 extents;
float other;
} block {
@ -114,8 +113,7 @@ Tensor& mul_scalar_(
TORCH_CHECK(false, "Not implemented!");
}
}
command_buffer.end();
command_buffer.submit(context->gpu().queue);
command_pool.submit(context->gpu().queue, command_buffer);
return self;
}

View File

@ -33,10 +33,10 @@ Tensor adaptive_avg_pool2d(
v_self.options(),
};
api::Command::Buffer command_buffer = context->command().pool.allocate();
command_buffer.begin();
api::Command::Pool& command_pool = context->command().pool;
api::Command::Buffer& command_buffer = command_pool.stream();
{
if (v_self.has_image()) {
if C10_LIKELY(v_self.has_image()) {
const uvec3 v_output_size = v_output.extents();
const uvec3 v_self_size = v_self.extents();
@ -45,7 +45,7 @@ Tensor adaptive_avg_pool2d(
static_cast<float>(v_self_size.data[1u]) / v_output_size.data[1u],
};
const struct {
const struct Block final {
uvec3 size;
uint32_t _;
vec2 stride;
@ -88,8 +88,7 @@ Tensor adaptive_avg_pool2d(
TORCH_CHECK(false, "Not implemented!");
}
}
command_buffer.end();
command_buffer.submit(context->gpu().queue);
command_pool.submit(context->gpu().queue, command_buffer);
return convert(v_output);
}
@ -171,13 +170,11 @@ Tensor avg_pool2d(
v_self.options(),
};
api::Command::Buffer command_buffer = context->command().pool.allocate();
command_buffer.begin();
api::Command::Pool& command_pool = context->command().pool;
api::Command::Buffer& command_buffer = command_pool.stream();
{
using namespace utils;
if (v_self.has_image()) {
const struct {
if C10_LIKELY(v_self.has_image()) {
const struct Block final {
uvec3 extents;
int32_t range;
ivec2 iextents;
@ -235,8 +232,7 @@ Tensor avg_pool2d(
TORCH_CHECK(false, "Not implemented!");
}
}
command_buffer.end();
command_buffer.submit(context->gpu().queue);
command_pool.submit(context->gpu().queue, command_buffer);
return convert(v_output);
}

View File

@ -21,8 +21,8 @@ Tensor view(
self.options(),
};
api::Command::Buffer command_buffer = context->command().pool.allocate();
command_buffer.begin();
api::Command::Pool& command_pool = context->command().pool;
api::Command::Buffer& command_buffer = command_pool.stream();
{
command_buffer.copy(
// Read-only access is implied on const tensors and triggers an async
@ -37,8 +37,7 @@ Tensor view(
vTensor::Stage::Transfer,
vTensor::Access::Write));
}
command_buffer.end();
command_buffer.submit(context->gpu().queue);
command_pool.submit(context->gpu().queue, command_buffer);
return convert(v_output);
}

View File

@ -419,31 +419,19 @@ vTensor::vTensor(
}) {
}
const vTensor* vTensor::host() const {
view_->staging(Stage::Host, Access::Read);
const vTensor* vTensor::host(
api::Command::Buffer& command_buffer) const {
view_->staging(command_buffer, Stage::Host, Access::Read);
return this;
}
vTensor* vTensor::host(const Access::Flags access) {
view_->staging(Stage::Host, access);
vTensor* vTensor::host(
api::Command::Buffer& command_buffer,
const Access::Flags access) {
view_->staging(command_buffer, Stage::Host, access);
return this;
}
vTensor::Buffer::Object vTensor::buffer(
const Stage::Flags stage) const & {
return view_->buffer(
stage,
Access::Read).object;
}
vTensor::Buffer::Object vTensor::buffer(
const Stage::Flags stage,
const Access::Flags access) & {
return view_->buffer(
stage,
access).object;
}
vTensor::Buffer::Object vTensor::buffer(
api::Command::Buffer& command_buffer,
const Stage::Flags stage) const & {
@ -463,21 +451,6 @@ vTensor::Buffer::Object vTensor::buffer(
access).object;
}
vTensor::Image::Object vTensor::image(
const Stage::Flags stage) const & {
return view_->image(
stage,
Access::Read).object;
}
vTensor::Image::Object vTensor::image(
const Stage::Flags stage,
const Access::Flags access) & {
return view_->image(
stage,
access).object;
}
vTensor::Image::Object vTensor::image(
api::Command::Buffer& command_buffer,
const Stage::Flags stage) const & {
@ -535,16 +508,8 @@ vTensor::View::View(
ops::verify(options);
}
// We typically do not know whether we need a command buffer to service a request
// until we have performed a bunch of checks in nested logic, and even then we
// may end up with the always-issued state transition optimized away under
// certain conditions, which makes a policy of always allocating a command buffer
// up front, only to end up using it some of the time, a wasteful approach. This class
// answers that need.
class vTensor::View::CMD final {
public:
explicit CMD(const View&);
CMD(const View&, api::Command::Buffer&);
CMD(const CMD&) = delete;
CMD& operator=(const CMD&) = delete;
@ -578,60 +543,18 @@ class vTensor::View::CMD final {
const Image::Object& image,
Buffer::Object& buffer);
void submit(Fence fence = {});
private:
api::Command::Buffer& command_buffer();
void submit(Fence fence);
private:
const View& view_;
enum class Type {
Internal,
External,
} type;
union _ final {
api::Command::Buffer internal;
api::Command::Buffer* external;
~_() {}
} command_buffer_;
api::Command::Buffer& command_buffer_;
};
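
Read together, the hunks above strip CMD down to a thin wrapper over an externally provided command buffer; the lazy internal-allocation path and its union bookkeeping are gone now that every caller supplies the pool's stream. Roughly, as a condensed reading of this diff rather than the verbatim header:

class vTensor::View::CMD final {
 public:
  CMD(const View&, api::Command::Buffer&);
  CMD(const CMD&) = delete;
  CMD& operator=(const CMD&) = delete;

  // The barrier(), copy_*(), and dispatch helpers (signatures unchanged,
  // elided here) now record straight into command_buffer_.
  void submit(Fence fence);

 private:
  const View& view_;
  api::Command::Buffer& command_buffer_;
};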
vTensor::View::CMD::CMD(
const View& view)
: view_(view),
type(Type::Internal),
command_buffer_{} {
}
vTensor::View::CMD::CMD(
const View& view,
api::Command::Buffer& external)
api::Command::Buffer& command_buffer)
: view_(view),
type(Type::External),
command_buffer_{
.external = &external,
} {
}
api::Command::Buffer& vTensor::View::CMD::command_buffer() {
switch (type) {
case Type::Internal:
if (!command_buffer_.internal) {
command_buffer_.internal = view_.context_->command().pool.allocate();
command_buffer_.internal.begin();
}
return command_buffer_.internal;
case Type::External:
return *(command_buffer_.external);
default:
TORCH_INTERNAL_ASSERT(false, "Unknown command buffer type!");
break;
}
command_buffer_(command_buffer) {
}
void vTensor::View::CMD::barrier(State::Transition transition) {
@ -761,7 +684,7 @@ void vTensor::View::CMD::barrier(State::Transition transition) {
barrier.stage.src = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT;
}
command_buffer().barrier(barrier);
command_buffer_.barrier(barrier);
}
}
@ -789,7 +712,7 @@ void vTensor::View::CMD::copy_buffer_to_staging(
{},
}));
command_buffer().copy(buffer, staging);
command_buffer_.copy(buffer, staging);
}
void vTensor::View::CMD::copy_staging_to_buffer(
@ -816,7 +739,7 @@ void vTensor::View::CMD::copy_staging_to_buffer(
{},
}));
command_buffer().copy(staging, buffer);
command_buffer_.copy(staging, buffer);
}
void vTensor::View::CMD::copy_buffer_to_image(
@ -847,7 +770,7 @@ void vTensor::View::CMD::copy_buffer_to_image(
const uvec3 extents = view_.extents();
const uint32_t plane = extents.data[0u] * extents.data[1u];
const struct {
const struct Block final {
uvec3 extents;
uint32_t block;
uvec4 offset;
@ -863,7 +786,7 @@ void vTensor::View::CMD::copy_buffer_to_image(
};
view_.context_->dispatch(
command_buffer(),
command_buffer_,
{
VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
@ -904,7 +827,7 @@ void vTensor::View::CMD::copy_image_to_buffer(
const uvec3 extents = view_.extents();
const uint32_t plane = extents.data[0u] * extents.data[1u];
const struct {
const struct Block final {
uvec3 extents;
uint32_t block;
uvec4 offset;
@ -920,7 +843,7 @@ void vTensor::View::CMD::copy_image_to_buffer(
};
view_.context_->dispatch(
command_buffer(),
command_buffer_,
{
VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
@ -934,10 +857,10 @@ void vTensor::View::CMD::copy_image_to_buffer(
}
void vTensor::View::CMD::submit(const api::Resource::Fence fence) {
if ((Type::Internal == type) && command_buffer_.internal) {
command_buffer_.internal.end();
command_buffer_.internal.submit(view_.context_->gpu().queue, fence);
}
view_.context_->command().pool.submit(
view_.context_->gpu().queue,
command_buffer_,
fence);
}
vTensor::Buffer& vTensor::View::buffer() const {
@ -953,38 +876,28 @@ vTensor::Buffer& vTensor::View::buffer() const {
}
vTensor::Buffer& vTensor::View::buffer(
api::Command::Buffer& command_buffer,
const Stage::Flags stage,
const Access::Flags access) const {
CMD command_buffer(*this);
Buffer& buffer = this->buffer(command_buffer, stage, access);
command_buffer.submit();
return buffer;
CMD cmd(*this, command_buffer);
return buffer(cmd, stage, access);
}
vTensor::Buffer& vTensor::View::buffer(
api::Command::Buffer& command_buffer_,
const Stage::Flags stage,
const Access::Flags access) const {
CMD command_buffer(*this, command_buffer_);
return buffer(command_buffer, stage, access);
}
vTensor::Buffer& vTensor::View::buffer(
CMD& command_buffer,
CMD& cmd,
const Stage::Flags stage,
const Access::Flags access) const {
if ((access & Access::Read) && state_.is_dirty(Component::Buffer)) {
if (state_.is_clean(Component::Staging)) {
command_buffer.copy_staging_to_buffer(
cmd.copy_staging_to_buffer(
state_,
staging(command_buffer, Stage::Transfer, Access::Read).object,
staging(cmd, Stage::Transfer, Access::Read).object,
buffer().object);
}
else if (state_.is_clean(Component::Image)) {
command_buffer.copy_image_to_buffer(
cmd.copy_image_to_buffer(
state_,
image(command_buffer, Stage::Compute, Access::Read).object,
image(cmd, Stage::Compute, Access::Read).object,
buffer().object);
}
else {
@ -994,7 +907,7 @@ vTensor::Buffer& vTensor::View::buffer(
}
}
command_buffer.barrier(
cmd.barrier(
state_.transition({
// Staging
{},
@ -1028,35 +941,25 @@ vTensor::Image& vTensor::View::image() const {
}
vTensor::Image& vTensor::View::image(
api::Command::Buffer& command_buffer,
const Stage::Flags stage,
const Access::Flags access) const {
CMD command_buffer(*this);
Image& image = this->image(command_buffer, stage, access);
command_buffer.submit();
return image;
CMD cmd(*this, command_buffer);
return image(cmd, stage, access);
}
vTensor::Image& vTensor::View::image(
api::Command::Buffer& command_buffer_,
const Stage::Flags stage,
const Access::Flags access) const {
CMD command_buffer(*this, command_buffer_);
return image(command_buffer, stage, access);
}
vTensor::Image& vTensor::View::image(
CMD& command_buffer,
CMD& cmd,
const Stage::Flags stage,
const Access::Flags access) const {
if ((access & Access::Read) && state_.is_dirty(Component::Image)) {
command_buffer.copy_buffer_to_image(
cmd.copy_buffer_to_image(
state_,
buffer(command_buffer, stage, Access::Read).object,
buffer(cmd, stage, Access::Read).object,
image().object);
}
command_buffer.barrier(
cmd.barrier(
state_.transition({
// Staging
{},
@ -1096,27 +999,28 @@ vTensor::Buffer& vTensor::View::staging() const {
}
vTensor::Buffer& vTensor::View::staging(
api::Command::Buffer& command_buffer,
const Stage::Flags stage,
const Access::Flags access) const {
CMD command_buffer(*this);
Buffer& staging = this->staging(command_buffer, stage, access);
command_buffer.submit(fence());
CMD cmd(*this, command_buffer);
Buffer& staging = this->staging(cmd, stage, access);
cmd.submit(fence(access));
return staging;
}
vTensor::Buffer& vTensor::View::staging(
CMD& command_buffer,
CMD& cmd,
const Stage::Flags stage,
const Access::Flags access) const {
if ((access & Access::Read) && state_.is_dirty(Component::Staging)) {
command_buffer.copy_buffer_to_staging(
cmd.copy_buffer_to_staging(
state_,
buffer(command_buffer, Stage::Transfer, Access::Read).object,
buffer(cmd, Stage::Transfer, Access::Read).object,
staging().object);
}
command_buffer.barrier(
cmd.barrier(
state_.transition({
// Staging
{
@ -1138,6 +1042,14 @@ vTensor::Buffer& vTensor::View::staging(
return staging();
}
vTensor::Fence& vTensor::View::fence(const Access::Flags access) const {
if (access & Access::Read) {
fence_ = allocate_fence(&context_->resource().pool);
}
return fence_;
}
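
The fence() -> fence(access) change means a new fence is only allocated for host reads; write-only host access submits without allocating one, so its wait() generally does not block the CPU on the GPU. A hedged, usage-level sketch (v_tensor and command_buffer are illustrative, with types as used elsewhere in this diff):

// Write-only host access: no new fence is allocated; wait() typically returns
// immediately, apart from any hazard barriers recorded earlier.
auto w_future = v_tensor.host<float, vTensor::Access::Write>(command_buffer);
auto w_payload = w_future.wait();

// Read access (const path): staging() submits with a fence, and wait() blocks
// until the GPU has actually produced the data.
const vTensor& cv_tensor = v_tensor;
auto r_future = cv_tensor.host<const float>(command_buffer);
auto r_payload = r_future.wait();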
vTensor::Memory& vTensor::View::wait() const {
if (fence_) {
fence_.wait();
@ -1146,10 +1058,6 @@ vTensor::Memory& vTensor::View::wait() const {
return staging().memory;
}
vTensor::Fence& vTensor::View::fence() const {
return (fence_ = allocate_fence(pool_));
}
void vTensor::View::verify() const {
TORCH_INTERNAL_ASSERT(!image_ || state_.is_available(Component::Image));
TORCH_INTERNAL_ASSERT(!staging_ || state_.is_discrete());

View File

@ -157,10 +157,10 @@ class vTensor final {
*/
template<typename Type>
Future<Type, Access::Read> host() const &;
Future<Type, Access::Read> host(api::Command::Buffer&) const &;
template<typename Type, Access::Flags kAccess>
Future<Type, kAccess> host() &;
Future<Type, kAccess> host(api::Command::Buffer&) &;
/*
Device access - these functions will be expensive if they trigger a buffer
@ -178,14 +178,10 @@ class vTensor final {
predictability of usage and efficiency.
*/
Buffer::Object buffer(Stage::Flags) const &;
Buffer::Object buffer(Stage::Flags, Access::Flags) &;
Buffer::Object buffer(api::Command::Buffer&, Stage::Flags) const &;
Buffer::Object buffer(api::Command::Buffer&, Stage::Flags, Access::Flags) &;
bool has_image() const;
Image::Object image(Stage::Flags) const &;
Image::Object image(Stage::Flags, Access::Flags) &;
Image::Object image(api::Command::Buffer&, Stage::Flags) const &;
Image::Object image(api::Command::Buffer&, Stage::Flags, Access::Flags) &;
@ -210,26 +206,22 @@ class vTensor final {
Host
*/
const vTensor* host() const;
vTensor* host(Access::Flags access);
const vTensor* host(api::Command::Buffer&) const;
vTensor* host(api::Command::Buffer&, Access::Flags);
template<typename Type>
Future<Type, Access::Read> host() const && = delete;
Future<Type, Access::Read> host(api::Command::Buffer&) const && = delete;
template<typename Type, Access::Flags kAccess>
Future<Type, kAccess> host() && = delete;
Future<Type, kAccess> host(api::Command::Buffer&) && = delete;
/*
Device
*/
Buffer::Object buffer(Stage::Flags) const && = delete;
Buffer::Object buffer(Stage::Flags, Access::Flags) && = delete;
Buffer::Object buffer(api::Command::Buffer&, Stage::Flags) const && = delete;
Buffer::Object buffer(api::Command::Buffer&, Stage::Flags, Access::Flags) && = delete;
Image::Object image(Stage::Flags) const && = delete;
Image::Object image(Stage::Flags, Access::Flags) && = delete;
Image::Object image(api::Command::Buffer&, Stage::Flags) const && = delete;
Image::Object image(api::Command::Buffer&, Stage::Flags, Access::Flags) && = delete;
@ -249,21 +241,22 @@ class vTensor final {
~View() = default;
/*
Device
Buffer
*/
Buffer& buffer(Stage::Flags, Access::Flags) const;
Buffer& buffer(api::Command::Buffer&, Stage::Flags, Access::Flags) const;
/*
Image
*/
bool has_image() const;
Image& image(Stage::Flags, Access::Flags) const;
Image& image(api::Command::Buffer&, Stage::Flags, Access::Flags) const;
/*
Host
*/
Buffer& staging(Stage::Flags, Access::Flags) const;
Buffer& staging(api::Command::Buffer&, Stage::Flags, Access::Flags) const;
vTensor::Memory& wait() const;
@ -343,7 +336,7 @@ class vTensor final {
Image& image(CMD&, Stage::Flags, Access::Flags) const;
Buffer& staging() const;
Buffer& staging(CMD&, Stage::Flags, Access::Flags) const;
Fence& fence() const;
Fence& fence(Access::Flags) const;
// Validation
void verify() const;
@ -485,13 +478,15 @@ vTensor::Future<Type, kAccess>::wait() const & {
}
template<typename Type>
inline vTensor::Future<Type, vTensor::Access::Read> vTensor::host() const & {
return Future<Type, vTensor::Access::Read>(host());
inline vTensor::Future<Type, vTensor::Access::Read>
vTensor::host(api::Command::Buffer& command_buffer) const & {
return Future<Type, vTensor::Access::Read>(host(command_buffer));
}
template<typename Type, vTensor::Access::Flags kAccess>
inline vTensor::Future<Type, kAccess> vTensor::host() & {
return Future<Type, kAccess>(host(kAccess));
inline vTensor::Future<Type, kAccess>
vTensor::host(api::Command::Buffer& command_buffer) & {
return Future<Type, kAccess>(host(command_buffer, kAccess));
}
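
The header changes above remove every accessor overload that did not take a command buffer; call sites migrate roughly as follows (an illustrative sketch assuming the signatures shown in this diff):

// Before: the tensor could allocate and submit its own command buffer internally.
//   auto future = v_tensor.host<float, vTensor::Access::Write>();
//
// After: the caller supplies the command buffer that any sync work is recorded
// into, typically the command pool's shared stream.
api::Command::Buffer& command_buffer = api::context()->command().pool.stream();
auto future = v_tensor.host<float, vTensor::Access::Write>(command_buffer);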
inline bool vTensor::has_image() const {

View File

@ -36,11 +36,11 @@ Tensor upsample_nearest2d(
input.options(),
};
api::Command::Buffer command_buffer = context->command().pool.allocate();
command_buffer.begin();
api::Command::Pool& command_pool = context->command().pool;
api::Command::Buffer& command_buffer = command_pool.stream();
{
if (v_input.has_image()) {
const struct {
if C10_LIKELY(v_input.has_image()) {
const struct Block final {
uvec3 extents;
uint32_t _;
ivec2 iextents;
@ -92,8 +92,7 @@ Tensor upsample_nearest2d(
TORCH_CHECK(false, "Not implemented!");
}
}
command_buffer.end();
command_buffer.submit(context->gpu().queue);
command_pool.submit(context->gpu().queue, command_buffer);
return convert(v_output);
}

View File

@ -10,7 +10,7 @@ namespace vulkan {
namespace ops {
namespace utils {
int64_t normalize(
inline int64_t normalize(
const int64_t dimension,
const int64_t n) {
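// Maps any (possibly negative) dimension index into [0, n); for example,
// normalize(-1, 4) == 3 and normalize(5, 4) == 1 (assumes n > 0).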
return (dimension % n + n) % n;