Optimize Vulkan command buffer submission rate. (#49112)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49112

Differential Revision: D25729889

Test Plan: Imported from OSS

Reviewed By: SS-JIA

Pulled By: AshkanAliabadi

fbshipit-source-id: c4ab470fdcf3f83745971986f3a44a3dff69287f

This commit is contained in:
parent aa18d17455
commit 1c12cbea90
@@ -207,7 +207,7 @@ cmake_dependent_option(
USE_VALGRIND "Use Valgrind. Only available on Linux." ON
"LINUX" OFF)
option(USE_VULKAN "Use Vulkan GPU backend" OFF)
option(USE_VULKAN_FP16_INFERENCE "Vulkan - Use fp16 inference even on fp32 tensors" ON)
option(USE_VULKAN_FP16_INFERENCE "Vulkan - Use fp16 inference even on fp32 tensors" OFF)
option(USE_VULKAN_RELAXED_PRECISION "Vulkan - Use relaxed precision math in the kernels (mediump)" OFF)
option(USE_VULKAN_SHADERC_RUNTIME "Vulkan - Use runtime shader compilation (needs libshaderc)" OFF)
option(USE_VULKAN_WRAPPER "Vulkan - Dynamically load Vulkan functions" ON)
@@ -62,6 +62,10 @@ class Cache final {
Factory factory_;
};

//
// Impl
//

template<typename Factory>
inline Cache<Factory>::Cache(Factory factory)
: factory_(std::move(factory)) {
@@ -76,6 +76,25 @@ Command::Buffer::Buffer(const VkCommandBuffer command_buffer)
"Invalid Vulkan command buffer!");
}

Command::Buffer::Buffer(Buffer&& buffer)
: command_buffer_(std::move(buffer.command_buffer_)),
bound_(std::move(buffer.bound_)),
barriers_(std::move(buffer.barriers_)) {
buffer.invalidate();
}

Command::Buffer& Command::Buffer::operator=(Buffer&& buffer) {
if (&buffer != this) {
command_buffer_ = std::move(buffer.command_buffer_);
bound_ = std::move(buffer.bound_);
barriers_ = std::move(buffer.barriers_);

buffer.invalidate();
};

return *this;
}

void Command::Buffer::Buffer::begin() {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
command_buffer_,
@@ -107,69 +126,6 @@ void Command::Buffer::Buffer::end() {
VK_CHECK(vkEndCommandBuffer(command_buffer_));
}

void Command::Buffer::barrier() {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
command_buffer_,
"This command buffer is in an invalid state! "
"Potential reason: This command buffer is moved from.");

if (barriers_.stage) {
c10::SmallVector<VkBufferMemoryBarrier, 4u> buffer_memory_barriers;

for (const Resource::Buffer::Barrier& barrier : barriers_.buffers) {
buffer_memory_barriers.push_back({
VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
nullptr,
barrier.memory.src,
barrier.memory.dst,
VK_QUEUE_FAMILY_IGNORED,
VK_QUEUE_FAMILY_IGNORED,
barrier.object.handle,
barrier.object.offset,
barrier.object.range,
});
}

c10::SmallVector<VkImageMemoryBarrier, 4u> image_memory_barriers;

for (const Resource::Image::Barrier& barrier : barriers_.images) {
image_memory_barriers.push_back({
VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
nullptr,
barrier.memory.src,
barrier.memory.dst,
barrier.layout.src,
barrier.layout.dst,
VK_QUEUE_FAMILY_IGNORED,
VK_QUEUE_FAMILY_IGNORED,
barrier.object.handle,
{
VK_IMAGE_ASPECT_COLOR_BIT,
0u,
VK_REMAINING_MIP_LEVELS,
0u,
VK_REMAINING_ARRAY_LAYERS,
},
});
}

vkCmdPipelineBarrier(
command_buffer_,
barriers_.stage.src,
barriers_.stage.dst,
0u,
0u,
nullptr,
buffer_memory_barriers.size(),
buffer_memory_barriers.data(),
image_memory_barriers.size(),
image_memory_barriers.data());
}

// Reset
barriers_.reset();
}

void Command::Buffer::barrier(const Pipeline::Barrier& barrier) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
command_buffer_,
@@ -291,31 +247,86 @@ void Command::Buffer::dispatch(
bound_.pipeline.local_work_group.data[2u]));
}

void Command::Buffer::submit(
const VkQueue queue,
const Resource::Fence fence) {
void Command::Buffer::barrier() {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
command_buffer_,
"This command buffer is in an invalid state! "
"Potential reason: This command buffer is moved from.");

TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
queue,
"Invalid Vulkan queue!");
if (barriers_.stage) {
c10::SmallVector<VkBufferMemoryBarrier, 4u> buffer_memory_barriers;

const VkSubmitInfo submit_info{
VK_STRUCTURE_TYPE_SUBMIT_INFO,
nullptr,
0u,
nullptr,
nullptr,
1u,
&command_buffer_,
0u,
nullptr,
};
for (const Resource::Buffer::Barrier& barrier : barriers_.buffers) {
buffer_memory_barriers.push_back({
VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
nullptr,
barrier.memory.src,
barrier.memory.dst,
VK_QUEUE_FAMILY_IGNORED,
VK_QUEUE_FAMILY_IGNORED,
barrier.object.handle,
barrier.object.offset,
barrier.object.range,
});
}

VK_CHECK(vkQueueSubmit(queue, 1u, &submit_info, fence.handle()));
c10::SmallVector<VkImageMemoryBarrier, 4u> image_memory_barriers;

for (const Resource::Image::Barrier& barrier : barriers_.images) {
image_memory_barriers.push_back({
VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
nullptr,
barrier.memory.src,
barrier.memory.dst,
barrier.layout.src,
barrier.layout.dst,
VK_QUEUE_FAMILY_IGNORED,
VK_QUEUE_FAMILY_IGNORED,
barrier.object.handle,
{
VK_IMAGE_ASPECT_COLOR_BIT,
0u,
VK_REMAINING_MIP_LEVELS,
0u,
VK_REMAINING_ARRAY_LAYERS,
},
});
}

vkCmdPipelineBarrier(
command_buffer_,
barriers_.stage.src,
barriers_.stage.dst,
0u,
0u,
nullptr,
buffer_memory_barriers.size(),
buffer_memory_barriers.data(),
image_memory_barriers.size(),
image_memory_barriers.data());
}

// Reset
barriers_.reset();
}

void Command::Buffer::invalidate() {
command_buffer_ = VK_NULL_HANDLE;
}

inline void Command::Buffer::Bound::reset() {
pipeline = {};
descriptor_set = VK_NULL_HANDLE;
}

inline Command::Buffer::Barrier::Stage::operator bool() const {
return (0u != src) || (0u != dst);
}

inline void Command::Buffer::Barrier::reset() {
stage = {};
buffers.clear();
images.clear();
}

Command::Pool::Pool(const GPU& gpu)
@@ -338,8 +349,9 @@ Command::Pool::Pool(const GPU& gpu)
Command::Pool::Pool(Pool&& pool)
: device_(std::move(pool.device_)),
command_pool_(std::move(pool.command_pool_)),
buffer_(std::move(pool.buffer_)) {
pool.device_ = VK_NULL_HANDLE;
buffer_(std::move(pool.buffer_)),
stream_(std::move(pool.stream_)) {
pool.invalidate();
}

Command::Pool& Command::Pool::operator=(Pool&& pool) {
@@ -347,8 +359,9 @@ Command::Pool& Command::Pool::operator=(Pool&& pool) {
device_ = std::move(pool.device_);
command_pool_ = std::move(pool.command_pool_);
buffer_ = std::move(pool.buffer_);
stream_ = std::move(pool.stream_);

pool.device_ = VK_NULL_HANDLE;
pool.invalidate();
};

return *this;
@@ -383,25 +396,109 @@ Command::Buffer Command::Pool::allocate() {
Configuration::kQuantum);

allocate_command_buffers(
device_,
command_pool_.get(),
buffer_.pool.data() + buffer_.in_use,
Configuration::kQuantum);
device_,
command_pool_.get(),
buffer_.pool.data() + buffer_.in_use,
Configuration::kQuantum);
}

return Buffer(buffer_.pool[buffer_.in_use++]);
}

Command::Buffer& Command::Pool::stream() {
if (!stream_.buffer) {
stream_.buffer = allocate();
stream_.buffer.begin();
stream_.counter = 0u;
}

return stream_.buffer;
}

void Command::Pool::purge() {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
device_ && command_pool_,
"This command pool is in an invalid state! "
"Potential reason: This command pool is moved from.");

TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
!stream_.buffer,
"Pending command buffer detected. Make sure all command buffers are "
"submitted to the queue for execution prior to reclaiming pool memory.");

buffer_.in_use = 0u;
VK_CHECK(vkResetCommandPool(device_, command_pool_.get(), 0u));
}

void Command::Pool::submit(
const VkQueue queue,
const c10::ArrayRef<const Buffer> buffers,
const Resource::Fence fence) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
device_ && command_pool_,
"This command pool is in an invalid state! "
"Potential reason: This command pool is moved from.");

TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
queue,
"Invalid Vulkan queue!");

c10::SmallVector<VkCommandBuffer, Configuration::kReserve> command_buffers;
command_buffers.reserve(buffers.size());

for (const Buffer& buffer : buffers) {
VkCommandBuffer command_buffer = buffer.handle();

TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
command_buffer,
"Invalid Vulkan command buffer!");

// Are we submitting our one and only command stream, or a regular command
// buffer whose scope is manually maintained by the user? Automatically
// maintain state and submission rate if the former.

if (stream_.buffer.handle() == command_buffer) {
// Hand the stream off to the driver if:
// - The user has implictly signaled interest in the results via a fence.
// - We are over the submission cutoff. We don't want to starve the GPU.

if (fence || (stream_.counter++ > Configuration::kSubmit)) {
stream_.buffer.end();
stream_.buffer.invalidate();
}
// Skip - Accumulate more calls prior to submission.
else {
command_buffer = VK_NULL_HANDLE;
}
}

if (command_buffer) {
command_buffers.push_back(command_buffer);
}
}

if (!command_buffers.empty()) {
const VkSubmitInfo submit_info{
VK_STRUCTURE_TYPE_SUBMIT_INFO,
nullptr,
0u,
nullptr,
nullptr,
command_buffers.size(),
command_buffers.data(),
0u,
nullptr,
};

VK_CHECK(vkQueueSubmit(queue, 1u, &submit_info, fence.handle()));
}
}

void Command::Pool::invalidate() {
device_ = VK_NULL_HANDLE;
command_pool_.reset();
}

} // namespace api
} // namespace vulkan
} // namespace native
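The hunk above is the heart of the change: operators now record work into a single per-pool command stream, and Pool::submit only hands that stream to vkQueueSubmit when the caller attaches a fence (signalling interest in the results) or the deferred-submission counter passes Configuration::kSubmit. A minimal, self-contained sketch of just that throttling decision follows; StreamState and should_flush are illustrative stand-ins, not names from this diff.

#include <cstdint>

// Minimal sketch of the submission-rate decision implemented above.
// kSubmit mirrors Command::Pool::Configuration::kSubmit (10u in this diff);
// everything else here is a simplified stand-in, not the actual API.
struct StreamState {
  uint32_t counter = 0u;  // streamed submissions accumulated so far
};

// Returns true when the accumulated stream should be handed to the driver:
// either the caller attached a fence (it wants the results), or the number of
// deferred submissions exceeded the cutoff so the GPU is not starved.
bool should_flush(StreamState& stream, bool has_fence, uint32_t kSubmit = 10u) {
  return has_fence || (stream.counter++ > kSubmit);
}

int main() {
  StreamState stream;
  int flushes = 0;
  // 25 fence-less ops funneled through the stream trigger only a couple of
  // real vkQueueSubmit calls instead of 25.
  for (int op = 0; op < 25; ++op) {
    if (should_flush(stream, /*has_fence=*/false)) {
      ++flushes;
      stream.counter = 0u;  // a fresh stream starts with a reset counter
    }
  }
  return flushes;  // small relative to 25
}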
@@ -7,6 +7,7 @@
#include <ATen/native/vulkan/api/Pipeline.h>
#include <ATen/native/vulkan/api/Resource.h>
#include <ATen/native/vulkan/api/Shader.h>
#include <c10/util/ArrayRef.h>

namespace at {
namespace native {
@@ -14,13 +15,15 @@ namespace vulkan {
namespace api {

struct Command final {
class Pool;

//
// Buffer
//

class Buffer final {
public:
Buffer(VkCommandBuffer command_buffer = VK_NULL_HANDLE);
explicit Buffer(VkCommandBuffer command_buffer = VK_NULL_HANDLE);
Buffer(const Buffer&) = delete;
Buffer& operator=(const Buffer&) = delete;
Buffer(Buffer&&);
@@ -28,18 +31,22 @@ struct Command final {
~Buffer() = default;

operator bool() const;
VkCommandBuffer handle() const;

void begin();
void end();

void barrier(const Pipeline::Barrier& barrier);
void bind(const Pipeline::Object& pipeline);
void bind(const Descriptor::Set& set);
void copy(Resource::Buffer::Object source, Resource::Buffer::Object destination);
void dispatch(const Shader::WorkGroup& global_work_group);
void submit(VkQueue queue, Resource::Fence fence = {});

private:
friend class Pool;

void barrier();
void invalidate();

private:
VkCommandBuffer command_buffer_;
@@ -80,12 +87,22 @@ struct Command final {
~Pool();

Buffer allocate();
Buffer& stream();
void purge();

void submit(
VkQueue queue,
c10::ArrayRef<const Buffer> buffers,
Resource::Fence fence = {});

private:
void invalidate();

private:
struct Configuration final {
static constexpr uint32_t kQuantum = 64u;
static constexpr uint32_t kReserve = 1024u;
static constexpr uint32_t kQuantum = 4u;
static constexpr uint32_t kReserve = 16u;
static constexpr uint32_t kSubmit = 10u;
};

VkDevice device_;
@@ -95,6 +112,11 @@ struct Command final {
std::vector<VkCommandBuffer> pool;
size_t in_use;
} buffer_;

struct {
Buffer buffer;
uint32_t counter;
} stream_;
} pool /* [thread_count] */;

explicit Command(const GPU& gpu)
@@ -106,43 +128,12 @@ struct Command final {
// Impl
//

inline Command::Buffer::Buffer(Buffer&& buffer)
: command_buffer_(std::move(buffer.command_buffer_)),
bound_(std::move(buffer.bound_)),
barriers_(std::move(buffer.barriers_)) {
buffer.command_buffer_ = VK_NULL_HANDLE;
}

inline Command::Buffer& Command::Buffer::operator=(Buffer&& buffer) {
if (&buffer != this) {
command_buffer_ = std::move(buffer.command_buffer_);
bound_ = std::move(buffer.bound_);
barriers_ = std::move(buffer.barriers_);

buffer.command_buffer_ = VK_NULL_HANDLE;
};

return *this;
}

inline Command::Buffer::operator bool() const {
return VK_NULL_HANDLE != command_buffer_;
}

inline void Command::Buffer::Bound::reset() {
pipeline = {};
descriptor_set = VK_NULL_HANDLE;
}

inline Command::Buffer::Barrier::Stage::operator bool() const {
return (0u != src) ||
(0u != dst);
}

inline void Command::Buffer::Barrier::reset() {
stage = {};
buffers.clear();
images.clear();
inline VkCommandBuffer Command::Buffer::handle() const {
return command_buffer_;
}

} // namespace api
@@ -6,10 +6,17 @@

#ifdef USE_VULKAN_SHADERC_RUNTIME
#include <ATen/native/vulkan/glsl.h>
#define VK_KERNEL(name) { name##_glsl, }
#define VK_KERNEL(name) \
::at::native::vulkan::api::Shader::Descriptor{ \
name##_glsl, \
}
#else
#include <ATen/native/vulkan/spv.h>
#define VK_KERNEL(name) { name##_spv, name##_spv_len, }
#define VK_KERNEL(name) \
::at::native::vulkan::api::Shader::Descriptor{ \
name##_spv, \
name##_spv_len, \
}
#endif /* USE_VULKAN_SHADERC_RUNTIME */

#ifdef USE_VULKAN_WRAPPER
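VK_KERNEL now expands to a fully qualified Shader::Descriptor temporary instead of a bare brace list, so the expansion carries its own type and can be used wherever a value is expected. A self-contained analog of the before/after behaviour, using a stand-in Descriptor type and fake *_spv symbols (none of these names come from the diff):

#include <cstdint>

// Self-contained analog of the change, with a stand-in Descriptor type.
namespace demo {
struct Descriptor {
  const uint32_t* spirv;
  uint32_t size;
};
}  // namespace demo

// Old style: the macro produced only a brace list, usable solely in positions
// where a demo::Descriptor is already being initialized.
#define VK_KERNEL_OLD(name) { name##_spv, name##_spv_len, }

// New style: the macro names the type, so the expansion is a typed temporary.
#define VK_KERNEL_NEW(name) \
  demo::Descriptor { \
    name##_spv, \
    name##_spv_len, \
  }

// Stand-ins for the symbols that the generated spv.h header would provide.
static const uint32_t add_spv[] = {0x07230203u};
static const uint32_t add_spv_len = 1u;

int main() {
  demo::Descriptor a = VK_KERNEL_OLD(add);  // fine: target type is explicit
  auto b = VK_KERNEL_NEW(add);              // also fine: expansion carries its own type
  return (a.size == b.size) ? 0 : 1;
}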
@@ -43,6 +43,40 @@ VkDevice create_device(
&queue_priorities,
};

uint32_t device_extension_properties_count = 0;
VK_CHECK(vkEnumerateDeviceExtensionProperties(
physical_device,
nullptr,
&device_extension_properties_count,
nullptr));

std::vector<VkExtensionProperties> device_extension_properties(
device_extension_properties_count);

VK_CHECK(vkEnumerateDeviceExtensionProperties(
physical_device,
nullptr,
&device_extension_properties_count,
device_extension_properties.data()));

constexpr const char* const requested_device_extensions[]{
#ifdef VK_KHR_portability_subset
// https://vulkan.lunarg.com/doc/view/1.2.162.0/mac/1.2-extensions/vkspec.html#VUID-VkDeviceCreateInfo-pProperties-04451
VK_KHR_PORTABILITY_SUBSET_EXTENSION_NAME,
#endif
};

std::vector<const char*> enabled_device_extensions;

for (const auto& requested_device_extension : requested_device_extensions) {
for (const auto& extension : device_extension_properties) {
if (strcmp(requested_device_extension, extension.extensionName) == 0) {
enabled_device_extensions.push_back(requested_device_extension);
break;
}
}
}

const VkDeviceCreateInfo device_create_info{
VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO,
nullptr,
@@ -51,7 +85,8 @@ VkDevice create_device(
&device_queue_create_info,
0u,
nullptr,
0u,
static_cast<uint32_t>(enabled_device_extensions.size()),
enabled_device_extensions.data(),
nullptr,
};
@@ -128,27 +128,25 @@ Descriptor::Set::Set(
"Invalid Vulkan descriptor set!");
}

void Descriptor::Set::update(const Item& item) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
device_ && descriptor_set_,
"This descriptor set is in an invalid state! "
"Potential reason: This descriptor set is moved from.");
Descriptor::Set::Set(Set&& set)
: device_(std::move(set.device_)),
descriptor_set_(std::move(set.descriptor_set_)),
shader_layout_signature_(std::move(set.shader_layout_signature_)),
bindings_(std::move(set.bindings_)) {
set.invalidate();
}

const auto items_itr = std::find_if(
bindings_.items.begin(),
bindings_.items.end(),
[binding = item.binding](const Item& other) {
return other.binding == binding;
});
Descriptor::Set& Descriptor::Set::operator=(Set&& set) {
if (&set != this) {
device_ = std::move(set.device_);
descriptor_set_ = std::move(set.descriptor_set_);
shader_layout_signature_ = std::move(set.shader_layout_signature_);
bindings_ = std::move(set.bindings_);

if (bindings_.items.end() == items_itr) {
bindings_.items.emplace_back(item);
}
else {
*items_itr = item;
}
set.invalidate();
};

bindings_.dirty = true;
return *this;
}

Descriptor::Set& Descriptor::Set::bind(
@@ -276,12 +274,39 @@ VkDescriptorSet Descriptor::Set::handle() const {
return descriptor_set_;
}

void Descriptor::Set::invalidate() {
device_ = VK_NULL_HANDLE;
descriptor_set_ = VK_NULL_HANDLE;
}

void Descriptor::Set::update(const Item& item) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
device_ && descriptor_set_,
"This descriptor set is in an invalid state! "
"Potential reason: This descriptor set is moved from.");

const auto items_itr = std::find_if(
bindings_.items.begin(),
bindings_.items.end(),
[binding = item.binding](const Item& other) {
return other.binding == binding;
});

if (bindings_.items.end() == items_itr) {
bindings_.items.emplace_back(item);
}
else {
*items_itr = item;
}

bindings_.dirty = true;
}

Descriptor::Pool::Pool(const GPU& gpu)
: device_(gpu.device),
descriptor_pool_(
create_descriptor_pool(gpu.device),
VK_DELETER(DescriptorPool)(device_)),
set_{} {
VK_DELETER(DescriptorPool)(device_)) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
device_,
"Invalid Vulkan device!");
@@ -295,7 +320,7 @@ Descriptor::Pool::Pool(Pool&& pool)
: device_(std::move(pool.device_)),
descriptor_pool_(std::move(pool.descriptor_pool_)),
set_(std::move(pool.set_)) {
pool.device_ = VK_NULL_HANDLE;
pool.invalidate();
}

Descriptor::Pool& Descriptor::Pool::operator=(Pool&& pool) {
@@ -304,7 +329,7 @@ Descriptor::Pool& Descriptor::Pool::operator=(Pool&& pool) {
descriptor_pool_ = std::move(pool.descriptor_pool_);
set_ = std::move(pool.set_);

pool.device_ = VK_NULL_HANDLE;
pool.invalidate();
};

return *this;
@@ -371,8 +396,13 @@ void Descriptor::Pool::purge() {
"This descriptor pool is in an invalid state! "
"Potential reason: This descriptor pool is moved from.");

set_.layouts.clear();
VK_CHECK(vkResetDescriptorPool(device_, descriptor_pool_.get(), 0u));
set_.layouts.clear();
}

void Descriptor::Pool::invalidate() {
device_ = VK_NULL_HANDLE;
descriptor_pool_.reset();
}

} // namespace api
@@ -73,6 +73,9 @@ struct Descriptor final {

VkDescriptorSet handle() const;

private:
void invalidate();

private:
struct Item final {
uint32_t binding;
@@ -113,6 +116,9 @@ struct Descriptor final {
Set allocate(const Shader::Layout::Object& shader_layout);
void purge();

private:
void invalidate();

private:
struct Configuration final {
static constexpr uint32_t kQuantum = 16u;
@@ -137,33 +143,6 @@ struct Descriptor final {
}
};

//
// Impl
//

inline Descriptor::Set::Set(Set&& set)
: device_(std::move(set.device_)),
descriptor_set_(std::move(set.descriptor_set_)),
shader_layout_signature_(std::move(set.shader_layout_signature_)),
bindings_(std::move(set.bindings_)) {
set.device_ = VK_NULL_HANDLE;
set.descriptor_set_ = VK_NULL_HANDLE;
}

inline Descriptor::Set& Descriptor::Set::operator=(Set&& set) {
if (&set != this) {
device_ = std::move(set.device_);
descriptor_set_ = std::move(set.descriptor_set_);
shader_layout_signature_ = std::move(set.shader_layout_signature_);
bindings_ = std::move(set.bindings_);

set.device_ = VK_NULL_HANDLE;
set.descriptor_set_ = VK_NULL_HANDLE;
};

return *this;
}

} // namespace api
} // namespace vulkan
} // namespace native
@@ -169,6 +169,10 @@ Pipeline::Cache::Cache(Factory factory)
: cache_(std::move(factory)) {
}

void Pipeline::Cache::purge() {
cache_.purge();
}

} // namespace api
} // namespace vulkan
} // namespace native
@@ -196,7 +196,11 @@ inline Pipeline::Barrier::operator bool() const {
inline bool operator==(
const Pipeline::Layout::Descriptor& _1,
const Pipeline::Layout::Descriptor& _2) {
return (_1.descriptor_set_layout == _2.descriptor_set_layout);
static_assert(
std::is_trivially_copyable<Pipeline::Layout::Descriptor>::value,
"This implementation is no longer valid!");

return (0 == memcmp(&_1, &_2, sizeof(Pipeline::Layout::Descriptor)));
}

inline size_t Pipeline::Layout::Factory::Hasher::operator()(
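This hunk, and the matching ones further down in Pipeline.h, Resource.h, and Shader.h, replace memberwise comparison with a single memcmp over the whole struct, guarded by a static_assert that the type stays trivially copyable. The guard documents the assumption the memcmp relies on: every byte of the object is meaningful and copyable bit-for-bit. A small self-contained illustration of the pattern (Key is a made-up stand-in, not a type from the diff):

#include <cstdint>
#include <cstring>
#include <type_traits>

// Stand-in for descriptor-style cache keys: trivially copyable, with field
// choices that leave no padding surprises on common ABIs.
struct Key {
  uint32_t binding;
  uint32_t type;
  uint32_t count;
};

inline bool operator==(const Key& a, const Key& b) {
  // If someone later adds a member with a nontrivial copy (or equality that
  // must ignore some field), compilation fails here instead of the comparison
  // silently misbehaving.
  static_assert(
      std::is_trivially_copyable<Key>::value,
      "This implementation is no longer valid!");

  return 0 == std::memcmp(&a, &b, sizeof(Key));
}

int main() {
  const Key a{0u, 7u, 1u};
  const Key b{0u, 7u, 1u};
  return (a == b) ? 0 : 1;
}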
@@ -207,9 +211,11 @@ inline size_t Pipeline::Layout::Factory::Hasher::operator()(
inline bool operator==(
const Pipeline::Descriptor& _1,
const Pipeline::Descriptor& _2) {
return (_1.pipeline_layout == _2.pipeline_layout) &&
(_1.shader_module == _2.shader_module) &&
(_1.local_work_group == _2.local_work_group);
static_assert(
std::is_trivially_copyable<Pipeline::Descriptor>::value,
"This implementation is no longer valid!");

return (0 == memcmp(&_1, &_2, sizeof(Pipeline::Descriptor)));
}

inline size_t Pipeline::Factory::Hasher::operator()(
@@ -236,10 +242,6 @@ inline Pipeline::Object Pipeline::Cache::retrieve(
};
}

inline void Pipeline::Cache::purge() {
cache_.purge();
}

} // namespace api
} // namespace vulkan
} // namespace native
@@ -192,6 +192,11 @@ VkFence Resource::Fence::handle(const bool add_to_waitlist) const {
"Invalid Vulkan fence!");

const VkFence fence = pool->fence_.pool[id].get();

TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
fence,
"Invalid Vulkan fence!");

if (add_to_waitlist) {
pool->fence_.waitlist.push_back(fence);
}
@@ -360,14 +365,13 @@ Resource::Pool::Pool(
: device_(gpu.device),
allocator_(
create_allocator(
gpu.adapter->runtime->instance(),
gpu.adapter->handle,
device_),
gpu.adapter->runtime->instance(),
gpu.adapter->handle,
device_),
vmaDestroyAllocator),
memory_{
std::move(policy),
},
buffer_{},
image_{
.sampler = Image::Sampler{gpu},
},
@@ -377,6 +381,31 @@ Resource::Pool::Pool(
fence_.pool.reserve(Configuration::kReserve);
}

Resource::Pool::Pool(Pool&& pool)
: device_(std::move(pool.device_)),
allocator_(std::move(pool.allocator_)),
memory_(std::move(pool.memory_)),
buffer_(std::move(pool.buffer_)),
image_(std::move(pool.image_)),
fence_(std::move(pool.fence_)) {
pool.invalidate();
}

Resource::Pool& Resource::Pool::operator=(Pool&& pool) {
if (&pool != this) {
device_ = std::move(pool.device_);
allocator_ = std::move(pool.allocator_);
memory_ = std::move(pool.memory_);
buffer_ = std::move(pool.buffer_);
image_ = std::move(pool.image_);
fence_ = std::move(pool.fence_);

pool.invalidate();
};

return *this;
}

Resource::Pool::~Pool() {
try {
if (device_ && allocator_) {
@@ -394,31 +423,6 @@ Resource::Pool::~Pool() {
}
}

Resource::Pool::Pool(Pool&& pool)
: device_(std::move(pool.device_)),
allocator_(std::move(pool.allocator_)),
memory_(std::move(pool.memory_)),
buffer_(std::move(pool.buffer_)),
image_(std::move(pool.image_)),
fence_(std::move(pool.fence_)) {
pool.device_ = VK_NULL_HANDLE;
}

Resource::Pool& Resource::Pool::operator=(Pool&& pool) {
if (&pool != this) {
device_ = std::move(pool.device_);
allocator_ = std::move(pool.allocator_);
memory_ = std::move(pool.memory_);
buffer_ = std::move(pool.buffer_);
image_ = std::move(pool.image_);
fence_ = std::move(pool.fence_);

pool.device_ = VK_NULL_HANDLE;
};

return *this;
}

Resource::Buffer Resource::Pool::buffer(
const Buffer::Descriptor& descriptor) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
|
|||
buffer_.pool.clear();
|
||||
}
|
||||
|
||||
void Resource::Pool::invalidate() {
|
||||
device_ = VK_NULL_HANDLE;
|
||||
allocator_.reset();
|
||||
}
|
||||
|
||||
} // namespace api
|
||||
} // namespace vulkan
|
||||
} // namespace native
|
||||
|
|
|
|||
|
|
@@ -20,15 +20,6 @@ struct Resource final {
//

struct Memory final {
/*
Barrier
*/

struct Barrier final {
VkAccessFlags src;
VkAccessFlags dst;
};

/*
Descriptor
*/
@@ -39,8 +30,18 @@ struct Resource final {
VkMemoryPropertyFlags /* optional */ preferred;
};

VmaAllocator allocator;
VmaAllocation allocation;
/*
Barrier
*/

struct Barrier final {
VkAccessFlags src;
VkAccessFlags dst;
};

/*
Access
*/

struct Access final {
typedef uint8_t Flags;
@@ -74,6 +75,9 @@ struct Resource final {
typename Pointer = Access::Pointer<Type, kAccess>>
Handle<Pointer> map() &;

VmaAllocator allocator;
VmaAllocation allocation;

private:
// Intentionally disabed to ensure memory access is always properly
// encapsualted in a scoped map-unmap region. Allowing below overloads
@@ -299,6 +303,8 @@ struct Resource final {
private:
friend struct Fence;

void invalidate();

private:
struct Configuration final {
static constexpr uint32_t kReserve = 256u;
@@ -353,7 +359,8 @@ class Resource::Memory::Scope final {

template<typename, typename Pointer>
inline Resource::Memory::Handle<Pointer> Resource::Memory::map() const & {
void* map(const Memory& memory, Access::Flags);
// Forward declaration
void* map(const Memory&, Access::Flags);

return Handle<Pointer>{
reinterpret_cast<Pointer>(map(*this, Access::Read)),
@@ -363,7 +370,8 @@ inline Resource::Memory::Handle<Pointer> Resource::Memory::map() const & {

template<typename, Resource::Memory::Access::Flags kAccess, typename Pointer>
inline Resource::Memory::Handle<Pointer> Resource::Memory::map() & {
void* map(const Memory& memory, Access::Flags);
// Forward declaration
void* map(const Memory&, Access::Flags);

static_assert(
(kAccess == Access::Read) ||
@@ -388,10 +396,11 @@ inline Resource::Buffer::operator bool() const {
inline bool operator==(
const Resource::Image::Sampler::Descriptor& _1,
const Resource::Image::Sampler::Descriptor& _2) {
return (_1.filter == _2.filter) &&
(_1.mipmap_mode == _2.mipmap_mode) &&
(_1.address_mode == _2.address_mode) &&
(_1.border == _2.border);
static_assert(
std::is_trivially_copyable<Resource::Image::Sampler::Descriptor>::value,
"This implementation is no longer valid!");

return (0 == memcmp(&_1, &_2, sizeof(Resource::Image::Sampler::Descriptor)));
}

inline size_t Resource::Image::Sampler::Factory::Hasher::operator()(
@@ -86,7 +86,9 @@ VkInstance create_instance(const Runtime::Type type) {
nullptr, &instance_extension_count, instance_extension_properties.data()));

constexpr const char* const requested_instance_extensions[]{
#ifdef VK_EXT_debug_report
VK_EXT_DEBUG_REPORT_EXTENSION_NAME,
#endif
};

for (const auto& requested_instance_extension : requested_instance_extensions) {
@@ -33,10 +33,7 @@ class Runtime final {
Runtime& operator=(Runtime&&) = default;
~Runtime() = default;

inline VkInstance instance() const {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(instance_);
return instance_.get();
}
VkInstance instance() const;

typedef std::function<bool (const Adapter&)> Selector;
Adapter select(const Selector& selector);
@@ -59,6 +56,15 @@ class Runtime final {

Runtime* runtime();

//
// Impl
//

inline VkInstance Runtime::instance() const {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(instance_);
return instance_.get();
}

} // namespace api
} // namespace vulkan
} // namespace native
@@ -60,6 +60,10 @@ Shader::Layout::Cache::Cache(Factory factory)
: cache_(std::move(factory)) {
}

void Shader::Layout::Cache::purge() {
cache_.purge();
}

#ifdef USE_VULKAN_SHADERC_RUNTIME

struct Shader::Factory::Compiler final {
@@ -218,16 +218,14 @@ inline Shader::Layout::Object Shader::Layout::Cache::retrieve(
};
}

inline void Shader::Layout::Cache::purge() {
cache_.purge();
}

inline bool operator==(
const Shader::WorkGroup& _1,
const Shader::WorkGroup& _2) {
return (_1.data[0u] == _2.data[0u]) &&
(_1.data[1u] == _2.data[1u]) &&
(_1.data[2u] == _2.data[2u]);
static_assert(
std::is_trivially_copyable<Shader::WorkGroup>::value,
"This implementation is no longer valid!");

return (0 == memcmp(&_1, &_2, sizeof(Shader::WorkGroup)));
}

inline Shader::Descriptor::Descriptor(const char* const glsl)
@@ -258,12 +256,10 @@ inline bool operator==(
const Shader::Descriptor& _1,
const Shader::Descriptor& _2) {
static_assert(
sizeof(Shader::Descriptor::shader.source) == sizeof(Shader::Descriptor::shader.binary),
"This implementation requires sizeof(Source) to be equal to sizeof(Binary).");
std::is_trivially_copyable<Shader::Descriptor>::value,
"This implementation is no longer valid!");

return (_1.type == _2.type) &&
(_1.shader.binary.spirv == _2.shader.binary.spirv) &&
(_1.shader.binary.size == _2.shader.binary.size);
return (0 == memcmp(&_1, &_2, sizeof(Shader::Descriptor)));
}

inline size_t Shader::Factory::Hasher::operator()(
@@ -286,11 +282,11 @@ inline size_t Shader::Factory::Hasher::operator()(
inline bool operator==(
const VkDescriptorSetLayoutBinding& _1,
const VkDescriptorSetLayoutBinding& _2) {
return (_1.binding == _2.binding) &&
(_1.descriptorType == _2.descriptorType) &&
(_1.descriptorCount == _2.descriptorCount) &&
(_1.stageFlags == _2.stageFlags) &&
(_1.pImmutableSamplers == _2.pImmutableSamplers);
static_assert(
std::is_trivially_copyable<VkDescriptorSetLayoutBinding>::value,
"This implementation is no longer valid!");

return (0 == memcmp(&_1, &_2, sizeof(VkDescriptorSetLayoutBinding)));
}

#endif /* USE_VULKAN_API */
@@ -24,11 +24,11 @@ Tensor add_scalar(
v_self.options(),
};

api::Command::Buffer command_buffer = context->command().pool.allocate();
command_buffer.begin();
api::Command::Pool& command_pool = context->command().pool;
api::Command::Buffer& command_buffer = command_pool.stream();
{
if (v_output.has_image() && v_self.has_image()) {
const struct {
if C10_LIKELY(v_output.has_image() && v_self.has_image()) {
const struct Block final {
uvec3 extents;
float other;
} block {
|
|||
TORCH_CHECK(false, "Not implemented!");
|
||||
}
|
||||
}
|
||||
command_buffer.end();
|
||||
command_buffer.submit(context->gpu().queue);
|
||||
command_pool.submit(context->gpu().queue, command_buffer);
|
||||
|
||||
return convert(v_output);
|
||||
}
|
||||
|
|
@ -82,11 +81,11 @@ Tensor& add_scalar_(
|
|||
|
||||
vTensor& v_self = convert(self);
|
||||
|
||||
api::Command::Buffer command_buffer = context->command().pool.allocate();
|
||||
command_buffer.begin();
|
||||
api::Command::Pool& command_pool = context->command().pool;
|
||||
api::Command::Buffer& command_buffer = command_pool.stream();
|
||||
{
|
||||
if (v_self.has_image()) {
|
||||
const struct {
|
||||
if C10_LIKELY(v_self.has_image()) {
|
||||
const struct Block final {
|
||||
uvec3 extents;
|
||||
float other;
|
||||
} block {
|
||||
|
|
@ -116,8 +115,7 @@ Tensor& add_scalar_(
|
|||
TORCH_CHECK(false, "Not implemented!");
|
||||
}
|
||||
}
|
||||
command_buffer.end();
|
||||
command_buffer.submit(context->gpu().queue);
|
||||
command_pool.submit(context->gpu().queue, command_buffer);
|
||||
|
||||
return self;
|
||||
}
|
||||
|
|
@@ -140,11 +138,11 @@ Tensor add_tensor(
v_self.options(),
};

api::Command::Buffer command_buffer = context->command().pool.allocate();
command_buffer.begin();
api::Command::Pool& command_pool = context->command().pool;
api::Command::Buffer& command_buffer = command_pool.stream();
{
if (v_self.has_image() && v_other.has_image()) {
const struct {
if C10_LIKELY(v_self.has_image() && v_other.has_image()) {
const struct Block final {
uvec3 extents;
float alpha;
} block {
@@ -186,8 +184,7 @@ Tensor add_tensor(
TORCH_CHECK(false, "Not implemented!");
}
}
command_buffer.end();
command_buffer.submit(context->gpu().queue);
command_pool.submit(context->gpu().queue, command_buffer);

return convert(v_output);
}
@@ -207,11 +204,11 @@ Tensor& add_tensor_(
const Tensor other = other_arg.is_vulkan() ? other_arg : other_arg.vulkan();
const vTensor& v_other = convert(other);

api::Command::Buffer command_buffer = context->command().pool.allocate();
command_buffer.begin();
api::Command::Pool& command_pool = context->command().pool;
api::Command::Buffer& command_buffer = command_pool.stream();
{
if (v_self.has_image() && v_other.has_image() && !self.is_same(other)) {
const struct {
if C10_LIKELY(v_self.has_image() && v_other.has_image() && !self.is_same(other)) {
const struct Block final {
uvec3 extents;
float alpha;
} block {
@@ -247,8 +244,7 @@ Tensor& add_tensor_(
TORCH_CHECK(false, "Not implemented!");
}
}
command_buffer.end();
command_buffer.submit(context->gpu().queue);
command_pool.submit(context->gpu().queue, command_buffer);

return self;
}
@@ -28,11 +28,11 @@ Tensor clamp(
v_self.options(),
};

api::Command::Buffer command_buffer = context->command().pool.allocate();
command_buffer.begin();
api::Command::Pool& command_pool = context->command().pool;
api::Command::Buffer& command_buffer = command_pool.stream();
{
if (v_output.has_image() && v_self.has_image()) {
const struct {
if C10_LIKELY(v_output.has_image() && v_self.has_image()) {
const struct Block final {
uvec3 extents;
uint32_t _;
vec2 clamp;
@@ -73,8 +73,7 @@ Tensor clamp(
TORCH_CHECK(false, "Not implemented!");
}
}
command_buffer.end();
command_buffer.submit(context->gpu().queue);
command_pool.submit(context->gpu().queue, command_buffer);

return convert(v_output);
}
@@ -95,11 +94,11 @@ Tensor& clamp_(

vTensor& v_self = convert(self);

api::Command::Buffer command_buffer = context->command().pool.allocate();
command_buffer.begin();
api::Command::Pool& command_pool = context->command().pool;
api::Command::Buffer& command_buffer = command_pool.stream();
{
if (v_self.has_image()) {
const struct {
if C10_LIKELY(v_self.has_image()) {
const struct Block final {
uvec3 extents;
uint32_t _;
vec2 clamp;
@@ -134,8 +133,7 @@ Tensor& clamp_(
TORCH_CHECK(false, "Not implemented!");
}
}
command_buffer.end();
command_buffer.submit(context->gpu().queue);
command_pool.submit(context->gpu().queue, command_buffer);

return self;
}
@@ -35,14 +35,6 @@ struct Layout final {
};
};

struct Experimentation {
static constexpr bool kUseConv2dOldApi = false;
};

struct ConvPrepackLimits final {
static constexpr int64_t maxStackDepth = 2048*4;
};

} // namespace ops
} // namespace vulkan
} // namespace native
@@ -1,8 +1,8 @@
#include <ATen/native/vulkan/ops/Convolution.h>
#include <ATen/native/vulkan/api/Utils.h>
#include <ATen/native/ConvUtils.h>
#include <ATen/native/utils/ParamUtils.h>
#include <ATen/native/vulkan/ops/Persistent.h>
#include <ATen/native/vulkan/api/Utils.h>

namespace at {
namespace native {
@@ -12,6 +12,10 @@ namespace {

using namespace api::utils;

struct Experimentation final {
static constexpr bool kUseConv2dOldApi = false;
};

inline bool is_depthwise(
const IntArrayRef filter,
const int64_t groups) {
@@ -26,47 +30,103 @@ inline bool is_pointwise(const IntArrayRef filter) {
}

vTensor pack_weights_dw(
api::Context* const context,
api::Command::Buffer& command_buffer,
api::Resource::Pool& pool,
const Tensor& weight_arg,
const int64_t groups) {
if (weight_arg.is_vulkan()) {
return convert(weight_arg);
}

const Tensor& weight) {
/* Source */

const Tensor weight = weight_arg.contiguous();
const IntArrayRef src_filter = weight.sizes();
const float* const src_weight_ptr = weight.data_ptr<float>();

const int64_t src_kw_sz = src_filter[Layout::Filter::width];
const int64_t src_kh_sz = src_filter[Layout::Filter::height];
const int64_t src_kernel_sz = src_kw_sz * src_kh_sz;
const int64_t src_block_sz = src_kernel_sz * src_filter[Layout::Filter::input];
const int64_t num_stacks = div_up(src_filter[Layout::Filter::output], INT64_C(4));

/* Destination */
const int64_t dst_kw_sz = src_kernel_sz;
const int64_t dst_kh_sz = num_stacks;
const int64_t dst_kernel_sz = dst_kw_sz * dst_kh_sz;

vTensor v_weight{
api::context(),
context,
&pool,
{
4,
num_stacks,
src_kw_sz * src_kh_sz,
dst_kh_sz,
dst_kw_sz,
},
weight.options(),
};

using Future = vTensor::Future<float, vTensor::Access::Write>;
Future v_weight_future = v_weight.host<float, vTensor::Access::Write>();
Future v_weight_future = v_weight.host<float, vTensor::Access::Write>(command_buffer);
Future::Payload v_weight_payload = v_weight_future.wait();

float* const dst_weight_ptr = v_weight_payload.get();
memset(dst_weight_ptr, 0, v_weight.nbytes());

for (int64_t src_oc = 0; src_oc < src_filter[Layout::Filter::output]; ++src_oc) {
/* Source */
const float* const src_weight_oc_ptr = src_weight_ptr + src_oc * src_block_sz;

/* Destination */
const int64_t dst_oh = src_oc / 4;
const int64_t dst_c = src_oc % 4;

float* const dst_weight_c_ptr = dst_weight_ptr +
dst_c * dst_kernel_sz +
dst_oh * dst_kw_sz;

for (int64_t src_ih = 0; src_ih < src_filter[Layout::Filter::height]; ++src_ih) {
memcpy(
dst_weight_c_ptr + src_ih * src_kw_sz,
src_weight_oc_ptr + src_ih * src_kw_sz,
sizeof(float) * src_kw_sz);
}
}

return v_weight;
}

vTensor pack_weights_2d(
api::Context* const context,
api::Command::Buffer& command_buffer,
api::Resource::Pool& pool,
const Tensor& weight) {
/* Source */
const IntArrayRef src_filter = weight.sizes();
const float* const src_weight_ptr = weight.data_ptr<float>();

const int64_t src_kw_sz = src_filter[Layout::Filter::width];
const int64_t src_kh_sz = src_filter[Layout::Filter::height];
const int64_t src_kernel_sz = src_kw_sz * src_kh_sz;
const int64_t src_block_sz =
src_kernel_sz * src_filter[Layout::Filter::input];
const int64_t src_block_sz = src_kernel_sz * src_filter[Layout::Filter::input];

const int64_t num_stacks = div_up(src_filter[Layout::Filter::output], INT64_C(4));
const int64_t stack_depth = api::utils::align_up(src_filter[Layout::Filter::input], INT64_C(4));

/* Destination */
const int64_t dst_kw_sz = src_kw_sz * src_kh_sz;
const int64_t dst_kh_sz = num_stacks;
const int64_t dst_kw_sz = src_kw_sz * stack_depth;
const int64_t dst_kh_sz = src_kh_sz * num_stacks;
const int64_t dst_kernel_sz = dst_kw_sz * dst_kh_sz;

vTensor v_weight{
context,
&pool,
{
4,
dst_kh_sz,
dst_kw_sz,
},
weight.options(),
};

using Future = vTensor::Future<float, vTensor::Access::Write>;
Future v_weight_future = v_weight.host<float, vTensor::Access::Write>(command_buffer);
Future::Payload v_weight_payload = v_weight_future.wait();

float* const dst_weight_ptr = v_weight_payload.get();
memset(dst_weight_ptr, 0, v_weight.nbytes());
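To make the new pack_weights_2d layout concrete: output channels are grouped into stacks of four (one per texel channel) and input channels are padded up to a multiple of four, so the packed texture ends up 4 x (src_kh * ceil(OC/4)) x (src_kw * align4(IC)). A standalone check of just that size arithmetic, with local div_up/align_up helpers standing in for api::utils and purely illustrative filter sizes:

#include <cstdint>
#include <iostream>

// Local stand-ins for the api::utils::div_up / align_up helpers used above.
int64_t div_up(int64_t n, int64_t d) { return (n + d - 1) / d; }
int64_t align_up(int64_t n, int64_t m) { return div_up(n, m) * m; }

int main() {
  // Example 3x3 convolution filter: OC=10, IC=6, KH=KW=3 (illustrative sizes).
  const int64_t oc = 10, ic = 6, kh = 3, kw = 3;

  const int64_t num_stacks = div_up(oc, INT64_C(4));    // 3 stacks of 4 output channels
  const int64_t stack_depth = align_up(ic, INT64_C(4)); // input channels padded to 8

  // Mirrors the destination extents computed in the diff above.
  const int64_t dst_kw_sz = kw * stack_depth;  // 3 * 8 = 24
  const int64_t dst_kh_sz = kh * num_stacks;   // 3 * 3 = 9
  std::cout << "packed texture: 4 x " << dst_kh_sz << " x " << dst_kw_sz << "\n";
  return 0;
}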
@@ -80,26 +140,29 @@ vTensor pack_weights_dw(

float* const dst_weight_c_ptr = dst_weight_ptr + dst_c * dst_kernel_sz;

for (int64_t src_ih = 0; src_ih < src_filter[Layout::Filter::height]; ++src_ih) {
memcpy(
dst_weight_c_ptr + dst_oh * dst_kw_sz + src_ih * src_kw_sz,
src_weight_oc_ptr + src_ih * src_kw_sz,
sizeof(float) * src_kw_sz);
for (int64_t src_ic = 0; src_ic < src_filter[Layout::Filter::input]; ++src_ic) {
const int64_t dst_ic4 = src_ic / 4;

for (int64_t src_ih = 0; src_ih < src_kh_sz; ++src_ih) {
for (int64_t src_iw = 0; src_iw < src_kw_sz; ++src_iw) {
memcpy(
dst_weight_c_ptr + (dst_oh * src_kh_sz + src_ih) * dst_kw_sz +
dst_ic4 * src_kw_sz * 4 + src_iw * 4 + src_ic % 4,
src_weight_oc_ptr + src_ic * src_kernel_sz + src_ih * src_kw_sz + src_iw,
sizeof(float));
}
}
}
}

return v_weight;
}

vTensor pack_weights_old(
vTensor pack_weights_2d_old(
api::Context* const context,
api::Command::Buffer& command_buffer,
api::Resource::Pool& pool,
const Tensor& weight_arg,
const int64_t groups) {
if (weight_arg.is_vulkan()) {
return convert(weight_arg);
}

const Tensor weight = weight_arg.contiguous();
const Tensor& weight) {
const IntArrayRef src_filter = weight.sizes();
const float* const src_weight_ptr = weight.data_ptr<float>();
@@ -111,7 +174,7 @@ vTensor pack_weights_old(
const uint32_t KW = src_filter[Layout::Filter::width];

vTensor v_weight{
api::context(),
context,
&pool,
{
1,
@@ -123,13 +186,13 @@ vTensor pack_weights_old(
};

using Future = vTensor::Future<float, vTensor::Access::Write>;
Future v_weight_future = v_weight.host<float, vTensor::Access::Write>();
Future v_weight_future = v_weight.host<float, vTensor::Access::Write>(command_buffer);
Future::Payload v_weight_payload = v_weight_future.wait();

float* const dst_weight_ptr = v_weight_payload.get();
memset(dst_weight_ptr, 0, v_weight.nbytes());

const float* src = src_weight_ptr;
const float* const src = src_weight_ptr;
float* const dst = dst_weight_ptr;

{
@@ -162,7 +225,7 @@ vTensor pack_weights_old(
dim0_ = dim0;
dim1_ = dim1;
dim2_ = dim2;
data_ = new float[dim0 * dim1 * dim2 * 4];
data_ = new float[dim0 * dim1 * dim2 * 4]; // TODO: memory leak
memset(data_, 0.f, dim0 * dim1 * dim2 * 4 * sizeof(float));
}
@@ -211,7 +274,7 @@ vTensor pack_weights_old(
return v_weight;
}

vTensor pack_weights_2d(
vTensor pack_weights(
api::Resource::Pool& pool,
const Tensor& weight_arg,
const int64_t groups) {
@@ -219,81 +282,32 @@ vTensor pack_weights_2d(
return convert(weight_arg);
}

api::Context* const context = api::context();
api::Command::Buffer& command_buffer = context->command().pool.stream();

const Tensor weight = weight_arg.contiguous();
const IntArrayRef src_filter = weight.sizes();
const float* const src_weight_ptr = weight.data_ptr<float>();

const int64_t src_kw_sz = src_filter[Layout::Filter::width];
const int64_t src_kh_sz = src_filter[Layout::Filter::height];
const int64_t num_stacks = div_up(src_filter[Layout::Filter::output], INT64_C(4));
const int64_t stack_depth = api::utils::align_up(src_filter[Layout::Filter::input], INT64_C(4));
vTensor v_weight{
api::context(),
&pool,
{
4,
src_kh_sz * num_stacks,
src_kw_sz * stack_depth,
},
weight.options(),
};

using Future = vTensor::Future<float, vTensor::Access::Write>;
Future v_weight_future = v_weight.host<float, vTensor::Access::Write>();
Future::Payload v_weight_payload = v_weight_future.wait();

/* Source */
const int64_t src_kernel_sz = src_kw_sz * src_kh_sz;
const int64_t src_block_sz =
src_kernel_sz * src_filter[Layout::Filter::input];

/* Destination */
const int64_t dst_kw_sz = src_kw_sz * stack_depth;
const int64_t dst_kh_sz = src_kh_sz * num_stacks;
const int64_t dst_kernel_sz = dst_kw_sz * dst_kh_sz;

float* const dst_weight_ptr = v_weight_payload.get();
memset(dst_weight_ptr, 0, v_weight.nbytes());

for (int64_t src_oc = 0; src_oc < src_filter[Layout::Filter::output]; ++src_oc) {
/* Source */
const float* const src_weight_oc_ptr = src_weight_ptr + src_oc * src_block_sz;

/* Destination */
const int64_t dst_oh = src_oc / 4;
const int64_t dst_c = src_oc % 4;

float* const dst_weight_c_ptr = dst_weight_ptr + dst_c * dst_kernel_sz;

for (int64_t src_ic = 0; src_ic < src_filter[Layout::Filter::input]; ++src_ic) {
const int64_t dst_ic4 = src_ic/4;
for (int64_t src_ih = 0; src_ih < src_kh_sz; ++src_ih) {
for (int64_t src_iw = 0; src_iw < src_kw_sz; ++src_iw) {
memcpy(
dst_weight_c_ptr + (dst_oh * src_kh_sz + src_ih) * dst_kw_sz +
dst_ic4 * src_kw_sz * 4 + src_iw * 4 + src_ic % 4,
src_weight_oc_ptr + src_ic * src_kernel_sz + src_ih * src_kw_sz + src_iw,
sizeof(float));
}
}
}
}

return v_weight;
}

vTensor pack_weights(
api::Resource::Pool& pool,
const Tensor& weight_arg,
const int64_t groups) {
if (is_depthwise(weight_arg.sizes(), groups)) {
return pack_weights_dw(pool, weight_arg, groups);
if (is_depthwise(weight.sizes(), groups)) {
return pack_weights_dw(
context,
command_buffer,
pool,
weight);
}

if (Experimentation::kUseConv2dOldApi) {
return pack_weights_old(pool, weight_arg, groups);
return pack_weights_2d_old(
context,
command_buffer,
pool,
weight);
}
return pack_weights_2d(pool, weight_arg, groups);

return pack_weights_2d(
context,
command_buffer,
pool,
weight);
}

vTensor pack_biases(
@@ -304,8 +318,11 @@ vTensor pack_biases(
return convert(*bias);
}

api::Context* const context = api::context();
api::Command::Buffer& command_buffer = context->command().pool.stream();

vTensor v_bias{
api::context(),
context,
&pool,
{
// 1D
@@ -316,7 +333,7 @@ vTensor pack_biases(

{
using Future = vTensor::Future<void, vTensor::Access::Write>;
Future v_bias_future = v_bias.host<void, vTensor::Access::Write>();
Future v_bias_future = v_bias.host<void, vTensor::Access::Write>(command_buffer);
Future::Payload v_bias_payload = v_bias_future.wait();

if (bias) {
@@ -394,7 +411,8 @@ bool available(
(c10::DeviceType::Vulkan == bias->device().type())) &&
(kFloat == bias->scalar_type()) &&
(transposed ? false /* to be addded in the future */
: (weight.size(Layout::Filter::output) == bias->size(Layout::Filter::output))))
: (weight.size(Layout::Filter::output) ==
bias->size(Layout::Filter::output))))
: true) &&
// Stride
(stride[Layout::Parameter::height] > 0) &&
@@ -432,7 +450,7 @@ bool usable(const Tensor& input) {
true;
}

void conv2d_depthwise(
void conv2d_dw(
api::Context* const context,
api::Command::Buffer& command_buffer,
vTensor& v_output,
@@ -446,27 +464,39 @@ void conv2d_depthwise(
const IntArrayRef dilation,
const float output_min,
const float output_max) {
if (v_output.has_image() && v_input.has_image() && v_weight.has_image()) {
const struct {
int32_t kernel_x, kernel_y;
int32_t stride_x, stride_y;
int32_t padding_x, padding_y;
int32_t dilate_x, dilate_y;
float clamp_x, clamp_y;
int32_t src_filter_w, src_filter_h;
if C10_LIKELY(v_output.has_image() && v_input.has_image() && v_weight.has_image()) {
const struct Block final {
ivec2 kernel;
ivec2 stride;
ivec2 padding;
ivec2 dilate;
vec2 clamp;
ivec2 src_filter;
} block {
safe_downcast<int32_t>(filter[Layout::Filter::width]),
safe_downcast<int32_t>(filter[Layout::Filter::height]),
safe_downcast<int32_t>(stride[Layout::Parameter::width]),
safe_downcast<int32_t>(stride[Layout::Parameter::height]),
safe_downcast<int32_t>(padding[Layout::Parameter::width]),
safe_downcast<int32_t>(padding[Layout::Parameter::height]),
safe_downcast<int32_t>(dilation[Layout::Parameter::width]),
safe_downcast<int32_t>(dilation[Layout::Parameter::height]),
output_min,
output_max,
safe_downcast<int32_t>(src_filter[Layout::Filter::width]),
safe_downcast<int32_t>(src_filter[Layout::Filter::height]),
{
safe_downcast<int32_t>(filter[Layout::Filter::width]),
safe_downcast<int32_t>(filter[Layout::Filter::height]),
},
{
safe_downcast<int32_t>(stride[Layout::Parameter::width]),
safe_downcast<int32_t>(stride[Layout::Parameter::height]),
},
{
safe_downcast<int32_t>(padding[Layout::Parameter::width]),
safe_downcast<int32_t>(padding[Layout::Parameter::height]),
},
{
safe_downcast<int32_t>(dilation[Layout::Parameter::width]),
safe_downcast<int32_t>(dilation[Layout::Parameter::height]),
},
{
output_min,
output_max,
},
{
safe_downcast<int32_t>(src_filter[Layout::Filter::width]),
safe_downcast<int32_t>(src_filter[Layout::Filter::height]),
},
};

context->dispatch(
@@ -510,7 +540,7 @@ void conv2d_depthwise(
}
}

void conv2d_pointwise(
void conv2d_pw(
api::Context* const context,
api::Command::Buffer& command_buffer,
vTensor& v_output,
@@ -522,22 +552,29 @@ void conv2d_pointwise(
const IntArrayRef padding,
const float output_min,
const float output_max) {
if (v_output.has_image() && v_input.has_image() && v_weight.has_image()) {

const struct {
int32_t kernel_ic, kernel_oc;
int32_t stride_x, stride_y;
int32_t padding_x, padding_y;
float clamp_x, clamp_y;
if C10_LIKELY(v_output.has_image() && v_input.has_image() && v_weight.has_image()) {
const struct Block final {
ivec2 kernel;
ivec2 stride;
ivec2 padding;
vec2 clamp;
} block {
safe_downcast<int32_t>(filter[Layout::Filter::input]),
safe_downcast<int32_t>(filter[Layout::Filter::output]),
safe_downcast<int32_t>(stride[Layout::Parameter::width]),
safe_downcast<int32_t>(stride[Layout::Parameter::height]),
safe_downcast<int32_t>(padding[Layout::Parameter::width]),
safe_downcast<int32_t>(padding[Layout::Parameter::height]),
output_min,
output_max,
{
safe_downcast<int32_t>(filter[Layout::Filter::input]),
safe_downcast<int32_t>(filter[Layout::Filter::output]),
},
{
safe_downcast<int32_t>(stride[Layout::Parameter::width]),
safe_downcast<int32_t>(stride[Layout::Parameter::height]),
},
{
safe_downcast<int32_t>(padding[Layout::Parameter::width]),
safe_downcast<int32_t>(padding[Layout::Parameter::height]),
},
{
output_min,
output_max,
},
};

context->dispatch(
@@ -595,30 +632,43 @@ void conv2d(
const IntArrayRef dilation,
const float output_min,
const float output_max) {
if (v_output.has_image() && v_input.has_image() && v_weight.has_image()) {
const struct {
int32_t kernel_x, kernel_y, kernel_ic, kernel_oc;
int32_t stride_x, stride_y;
int32_t padding_x, padding_y;
int32_t dilate_x, dilate_y;
float clamp_x, clamp_y;
int32_t src_filter_w, src_filter_h, src_filter_w4;
if C10_LIKELY(v_output.has_image() && v_input.has_image() && v_weight.has_image()) {
const struct Block final {
ivec4 kernel;
ivec2 stride;
ivec2 padding;
ivec2 dilate;
vec2 clamp;
ivec4 src_filter;
} block {
safe_downcast<int32_t>(filter[Layout::Filter::width]),
safe_downcast<int32_t>(filter[Layout::Filter::height]),
safe_downcast<int32_t>(filter[Layout::Filter::input]),
safe_downcast<int32_t>(filter[Layout::Filter::output]),
safe_downcast<int32_t>(stride[Layout::Parameter::width]),
safe_downcast<int32_t>(stride[Layout::Parameter::height]),
safe_downcast<int32_t>(padding[Layout::Parameter::width]),
safe_downcast<int32_t>(padding[Layout::Parameter::height]),
safe_downcast<int32_t>(dilation[Layout::Parameter::width]),
safe_downcast<int32_t>(dilation[Layout::Parameter::height]),
output_min,
output_max,
safe_downcast<int32_t>(src_filter[Layout::Filter::width]),
safe_downcast<int32_t>(src_filter[Layout::Filter::height]),
safe_downcast<int32_t>(src_filter[Layout::Filter::width]*4),
{
safe_downcast<int32_t>(filter[Layout::Filter::width]),
safe_downcast<int32_t>(filter[Layout::Filter::height]),
safe_downcast<int32_t>(filter[Layout::Filter::input]),
safe_downcast<int32_t>(filter[Layout::Filter::output]),
},
{
safe_downcast<int32_t>(stride[Layout::Parameter::width]),
safe_downcast<int32_t>(stride[Layout::Parameter::height]),
},
{
safe_downcast<int32_t>(padding[Layout::Parameter::width]),
safe_downcast<int32_t>(padding[Layout::Parameter::height]),
},
{
safe_downcast<int32_t>(dilation[Layout::Parameter::width]),
safe_downcast<int32_t>(dilation[Layout::Parameter::height]),
},
{
output_min,
output_max,
},
{
safe_downcast<int32_t>(src_filter[Layout::Filter::width]),
safe_downcast<int32_t>(src_filter[Layout::Filter::height]),
safe_downcast<int32_t>(src_filter[Layout::Filter::width] * 4),
0,
},
};

context->dispatch(
@@ -662,6 +712,98 @@ void conv2d(
}
}

void conv2d_old(
api::Context* const context,
api::Command::Buffer& command_buffer,
vTensor& v_output,
const vTensor& v_input,
const vTensor& v_weight,
const vTensor& v_bias,
const IntArrayRef filter,
const IntArrayRef stride,
const IntArrayRef padding,
const IntArrayRef dilation,
const float output_min,
const float output_max) {
using namespace api::utils;

if (v_output.has_image() && v_input.has_image() && v_weight.has_image()) {
const int32_t W = v_input.extents().data[0];
const int32_t H = v_input.extents().data[1];
const int32_t C_4 = v_input.extents().data[2];
const int32_t C = 4 * C_4;

const int32_t OW = v_output.extents().data[0];
const int32_t OH = v_output.extents().data[1];
const int32_t OC_4 = v_output.extents().data[2];
const int32_t OC = 4 * OC_4;

const struct Block final {
int32_t padding_x, padding_y;
int32_t kernel_x, kernel_y;
int32_t stride_x, stride_y;
int32_t dilate_x, dilate_y;
int32_t outputSize[4];
int32_t inputSize[4];
float outputMin;
float outputMax;
} block {
safe_downcast<int32_t>(padding[Layout::Parameter::width]),
safe_downcast<int32_t>(padding[Layout::Parameter::height]),
safe_downcast<int32_t>(filter[Layout::Filter::width]),
safe_downcast<int32_t>(filter[Layout::Filter::height]),
safe_downcast<int32_t>(stride[Layout::Parameter::width]),
safe_downcast<int32_t>(stride[Layout::Parameter::height]),
safe_downcast<int32_t>(dilation[Layout::Parameter::width]),
safe_downcast<int32_t>(dilation[Layout::Parameter::height]),
{ OW, OH, OC_4, OC },
{ W, H, C_4, C },
output_min,
output_max,
};

context->dispatch(
command_buffer,
{
VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
},
VK_KERNEL(conv2d_nogroup_clamp),
//VK_KERNEL(conv2d_nogroup_clamp_1x),
v_output.extents(),
// Write-only access bypasses synchronization but inserts appropriate
// barriers if necessary.
v_output.image(
command_buffer,
vTensor::Stage::Compute,
vTensor::Access::Write),
// Read-only access is implied on const tensors and triggers an async
// synchronization if necessary.
v_input.image(
command_buffer,
vTensor::Stage::Compute),
// Read-only access is implied on const tensors and triggers an async
// synchronization if necessary.
v_weight.image(
command_buffer,
vTensor::Stage::Compute),
// Read-only access is implied on const tensors and triggers an async
// synchronization if necessary.
v_bias.buffer(
command_buffer,
vTensor::Stage::Compute),
// Object lifetime is managed by the resource pool.
// It is OK not to keep track of the handle.
context->resource().pool.uniform(block).object);
}
else {
TORCH_CHECK(false, "Not implemented!");
}
}

Tensor convolution(
const Tensor& input,
const Tensor& weight,
@@ -781,99 +923,6 @@ Conv2dOpContext Conv2dOpContext::create(
};
}

void conv2d_old(
api::Context* const context,
api::Command::Buffer& command_buffer,
vTensor& v_output,
const vTensor& v_input,
const vTensor& v_weight,
const vTensor& v_bias,
const IntArrayRef filter,
const IntArrayRef stride,
const IntArrayRef padding,
const IntArrayRef dilation,
const float output_min,
const float output_max) {

using namespace api::utils;

if (v_output.has_image() && v_input.has_image() && v_weight.has_image()) {
const int32_t W = v_input.extents().data[0];
const int32_t H = v_input.extents().data[1];
const int32_t C_4 = v_input.extents().data[2];
const int32_t C = 4 * C_4;

const int32_t OW = v_output.extents().data[0];
const int32_t OH = v_output.extents().data[1];
const int32_t OC_4 = v_output.extents().data[2];
const int32_t OC = 4 * OC_4;

const struct {
int32_t padding_x, padding_y;
int32_t kernel_x, kernel_y;
int32_t stride_x, stride_y;
int32_t dilate_x, dilate_y;
int32_t outputSize[4];
int32_t inputSize[4];
float outputMin;
float outputMax;
} block {
safe_downcast<int32_t>(padding[Layout::Parameter::width]),
safe_downcast<int32_t>(padding[Layout::Parameter::height]),
safe_downcast<int32_t>(filter[Layout::Filter::width]),
safe_downcast<int32_t>(filter[Layout::Filter::height]),
safe_downcast<int32_t>(stride[Layout::Parameter::width]),
safe_downcast<int32_t>(stride[Layout::Parameter::height]),
safe_downcast<int32_t>(dilation[Layout::Parameter::width]),
safe_downcast<int32_t>(dilation[Layout::Parameter::height]),
{ OW, OH, OC_4, OC },
{ W, H, C_4, C },
output_min,
output_max,
};

context->dispatch(
command_buffer,
{
VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
},
VK_KERNEL(conv2d_nogroup_clamp),
//VK_KERNEL(conv2d_nogroup_clamp_1x),
v_output.extents(),
// Write-only access bypasses synchronization but inserts appropriate
// barriers if necessary.
v_output.image(
command_buffer,
vTensor::Stage::Compute,
vTensor::Access::Write),
// Read-only access is implied on const tensors and triggers an async
// synchronization if necessary.
v_input.image(
command_buffer,
vTensor::Stage::Compute),
// Read-only access is implied on const tensors and triggers an async
// synchronization if necessary.
v_weight.image(
command_buffer,
vTensor::Stage::Compute),
// Read-only access is implied on const tensors and triggers an async
// synchronization if necessary.
v_bias.buffer(
command_buffer,
vTensor::Stage::Compute),
// Object lifetime is managed by the resource pool.
// It is OK not to keep track of the handle.
context->resource().pool.uniform(block).object);
}
else {
TORCH_CHECK(false, "Not implemented!");
}
}

Tensor Conv2dOpContext::run(const Tensor& input_arg) const {
api::Context* const context = api::context();

@@ -896,11 +945,11 @@ Tensor Conv2dOpContext::run(const Tensor& input_arg) const {
input.options(),
};

api::Command::Buffer command_buffer = context->command().pool.allocate();
command_buffer.begin();
api::Command::Pool& command_pool = context->command().pool;
api::Command::Buffer& command_buffer = command_pool.stream();
{
if (is_depthwise(unpacked_.filter, unpacked_.groups)) {
conv2d_depthwise(
conv2d_dw(
context,
command_buffer,
v_output,

@@ -932,7 +981,7 @@ Tensor Conv2dOpContext::run(const Tensor& input_arg) const {
packed_.output_max);
} else {
if (is_pointwise(unpacked_.filter)) {
conv2d_pointwise(
conv2d_pw(
context,
command_buffer,
v_output,

@@ -964,8 +1013,7 @@ Tensor Conv2dOpContext::run(const Tensor& input_arg) const {
}
}
}
command_buffer.end();
command_buffer.submit(context->gpu().queue);
command_pool.submit(context->gpu().queue, command_buffer);

return convert(v_output);
}
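The hunks above all make the same structural change: instead of allocating, beginning, ending, and submitting a short-lived command buffer per operator, each op now records into the command pool's shared stream buffer and lets the pool decide when work is actually submitted. A minimal sketch of that pattern, using only the interfaces visible in this diff; the tensor names, the Block contents, and VK_KERNEL(example) are illustrative placeholders, not part of the commit:

// Illustrative sketch only - mirrors the op skeleton above.
api::Context* const context = api::context();
api::Command::Pool& command_pool = context->command().pool;
// Borrow the pool's long-lived streaming command buffer rather than
// allocating, beginning, ending, and submitting a fresh one per op.
api::Command::Buffer& command_buffer = command_pool.stream();
{
  context->dispatch(
      command_buffer,
      {
        VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
        VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
        VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
      },
      VK_KERNEL(example),
      v_output.extents(),
      // Write-only access; barriers are inserted only if hazards require them.
      v_output.image(command_buffer, vTensor::Stage::Compute, vTensor::Access::Write),
      // Read-only access is implied on const tensors.
      v_input.image(command_buffer, vTensor::Stage::Compute),
      context->resource().pool.uniform(block).object);
}
// The pool batches recorded work and controls the queue-submission rate.
command_pool.submit(context->gpu().queue, command_buffer);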
@@ -6,87 +6,96 @@ namespace vulkan {
namespace ops {

Tensor& copy_(Tensor& self, const Tensor& src) {
// X -> Vulkan
if (at::kVulkan == self.device().type()) {
vTensor& v_self = convert(self);
api::Context* const context = api::context();

// CPU -> Vulkan
if (at::kCPU == src.device().type()) {
// Requesting write-only host access to the tensor never triggers a sync
// as the contents will be overwritten regardless. Having said that,
// appropriate barriers are inserted automatically if WAR or WAW hazards
// are detected. Examples of such scenario for instance are if any of
// these async operations are on going in the background on 'self':
// - On discrete systems:
// * buffer-to-staging transfers
// * staging-to-buffer transfers
// - On UMA buffer is an alias for staging and accessible both on host
// and device. Consequently:
// * buffer-to-image NHWC -> NC4HW packing
// * image-to-buffer NC4HW -> NHWC unpacking
api::Command::Pool& command_pool = context->command().pool;
api::Command::Buffer& command_buffer = command_pool.stream();
{
// X -> Vulkan
if (at::kVulkan == self.device().type()) {
vTensor& v_self = convert(self);

using Future = vTensor::Future<void, vTensor::Access::Write>;
Future v_self_future = v_self.host<void, vTensor::Access::Write>();
// Vulkan -> Vulkan
if (at::kVulkan == src.device().type()) {
command_buffer.copy(
// - Read-only access is implied on const tensors. Memory barriers
// are automatically inserted if a RAW hazard is detected.
// - Recording any potential pending sync operations into the same
// command buffer prevents an expensive queue submission.
convert(src).buffer(
command_buffer,
vTensor::Stage::Transfer),
// - Write-only access never triggers a sync as the contents will be
// overwritten regardless. Having said that, appropriate barriers
// are inserted automatically if WAR or WAW hazards are detected.
// - Recording pending sync operations into the same command buffer
// prevents an expensive queue submission.
v_self.buffer(
command_buffer,
vTensor::Stage::Transfer,
vTensor::Access::Write));

// This wait() will be a no-op if no hazards are detected, including the
// obvious, yet important, special case of 'self' being an empty tensor.
command_pool.submit(context->gpu().queue, command_buffer);
}
// CPU -> Vulkan
else {
const Tensor cpu_src = src.device().is_cpu() ? src : src.cpu();

Future::Payload v_self_payload = v_self_future.wait();
// Requesting write-only host access to the tensor never triggers a sync
// as the contents will be overwritten regardless. Having said that,
// appropriate barriers are inserted automatically if WAR or WAW hazards
// are detected. Examples of such scenario for instance are if any of
// these async operations are on going in the background on 'self':
// - On discrete systems:
// * buffer-to-staging transfers
// * staging-to-buffer transfers
// - On UMA buffer is an alias for staging and accessible both on host
// and device. Consequently:
// * buffer-to-image NHWC -> NC4HW packing
// * image-to-buffer NC4HW -> NHWC unpacking

memcpy(
v_self_payload.get(),
src.contiguous().data_ptr<float>(),
std::min(src.nbytes(), self.nbytes()));
using Future = vTensor::Future<void, vTensor::Access::Write>;
Future v_self_future = v_self.host<void, vTensor::Access::Write>(command_buffer);

// Ideally we would have been able to put as much distance between
// requesting the data - a call to host() - and accessing the data
// - a call to wait() - but a local view of the computation graph
// in eager mode makes that optimization non-trivial.

// This wait() will be a no-op if no hazards are detected, including the
// obvious, yet important, special case of 'self' being an empty tensor.

Future::Payload v_self_payload = v_self_future.wait();

memcpy(
v_self_payload.get(),
cpu_src.contiguous().data_ptr<float>(),
std::min(src.nbytes(), self.nbytes()));
}
}
// Vulkan -> Vulkan
// Vulkan -> X
else if (at::kVulkan == src.device().type()) {
api::Command::Buffer command_buffer = api::context()->command().pool.allocate();
command_buffer.begin();

command_buffer.copy(
// - Read-only access is implied on const tensors. Memory barriers
// are automatically inserted if a RAW hazard is detected.
// - Recording any potential pending sync operations into the same
// command buffer prevents an expensive queue submission.
convert(src).buffer(
command_buffer,
vTensor::Stage::Transfer),
// - Write-only access never triggers a sync as the contents will be
// overwritten regardless. Having said that, appropriate barriers
// are inserted automatically if WAR or WAW hazards are detected.
// - Recording pending sync operations into the same command buffer
// prevents an expensive queue submission.
v_self.buffer(
command_buffer,
vTensor::Stage::Transfer,
vTensor::Access::Write));

command_buffer.end();
command_buffer.submit(api::context()->gpu().queue);
}
else {
TORCH_INTERNAL_ASSERT(false, "Unsupported!");
}
}
// Vulkan -> X
else if (at::kVulkan == src.device().type()) {
const vTensor& v_src = convert(src);

{
// Similar notes as above applies, with the additional consideration of
// potential syncs on read accesses. Namely,
// - on discrete systems, if the (staging, buffer, image) trio, or
// - on UMA, if the (buffer, image) duo
// have gone out of sync as a result of one processor writing to one
// resource which is then either accessed as an another resource type on
// the same or another processor. Same considerations regarding hazard
// avoidance as above applies.

using Future = vTensor::Future<const void, vTensor::Access::Read>;
const Future v_src_future = v_src.host<const void>();
const vTensor& v_src = convert(src);

// Vulkan -> CPU
if (at::kCPU == self.device().type()) {
if (self.device().is_cpu()) {
// Similar notes as above applies, with the additional consideration of
// potential syncs on read accesses. Namely,
// - on discrete systems, if the (staging, buffer, image) trio, or
// - on UMA, if the (buffer, image) duo
// have gone out of sync as a result of one processor writing to one
// resource which is then either accessed as an another resource type on
// the same or another processor. Same considerations regarding hazard
// avoidance as above applies.

using Future = vTensor::Future<const void, vTensor::Access::Read>;
const Future v_src_future = v_src.host<const void>(command_buffer);

// Ideally we would have been able to put as much distance between
// requesting the data - a call to host() - and accessing the data
// - a call to wait() - but a local view of the computation graph
// in eager mode makes that optimization non-trivial.

// This wait() is a no-op if data is not out of sync. More often than
// not though, waits here are expected as the GPU catches up with
// compute submitted from CPU.
@@ -99,51 +108,56 @@ Tensor& copy_(Tensor& self, const Tensor& src) {
std::min(src.nbytes(), self.nbytes()));
}
else {
TORCH_INTERNAL_ASSERT(false, "Unsupported!");
TORCH_CHECK(false, "Unsupported!");
}

//
// WARNING
//

// This is not great. We almost never want to flush the GPU pipeline as
// that has far reaching consequences, especially if PyTorch is not the only
// process accessing the GPU. If we have done our job properly, above
// synchronization mechanisms should be enough to ensure correctness at a more
// modest cost, as there is no need to flush the entirety of jobs in flight
// if one is only interested on waiting on computation affecting one single
// tensor to finish.
//
// Having said that, we still do need to release all pool resources at one
// point per inference run or we will run out of memory otherwise. There is
// no perfect answer to this problem that checks all boxes, which leaves us
// with one of several design decisions:
//
// 1) Use graph mode to gain an understanding of the computation graph,
// itself allowing us to place pool purges intelligently. Best option
// for performance and memory consumption. Not without its downsides if
// flexibility is a top priority.
// 2) If on eager mode, and hence are seeing operations one at a time, expose
// this release of resources to the user as a Python / C++ function. This
// makes for suboptimal user experience but is efficient in terms of
// performance.
// 3) If on eager mode, and interested in keeping this bookkeeping transparent
// to the user, release all resources somewhere ... like here. This is
// not ideal since it requires a pipeline flush to make sure these objects
// are not already in use by a workload in flight. Cannot do much better
// within the constraints of this approach. Good for user experience,
// suboptimal for performance.
// 4) If on eager mode, and interested in keeping this bookkeeping transparent
// to the user, and performance does not matter, make CPU and GPU run in
// lockstep. Obviously this is just bad. Mentioned for the sake of
// completeness.

context->flush();
}
else {
TORCH_INTERNAL_ASSERT(
false,
"Invalid code path taken! Either the source or the destination tensor "
"was expected to be Vulkan a tensor! Incorrect dispatch?");
}

//
// WARNING
//

// This is not great. We almost never want to flush the GPU pipeline as
// that has far reaching consequences, especially if PyTorch is not the only
// process accessing the GPU. If we have done our job properly, above
// synchronization mechanisms should be enough to ensure correctness at a more
// modest cost, as there is no need to flush the entirety of jobs in flight
// if one is only interested on waiting on computation affecting one single
// tensor to finish.
//
// Having said that, we still do need to release all pool resources at one
// point per inference run or we will run out of memory otherwise. There is
// no perfect answer to this problem that checks all boxes, which leaves us
// with one of several design decisions:
//
// 1) Use graph mode to gain an understanding of the computation graph,
// itself allowing us to place pool purges intelligently. Best option
// for performance and memory consumption. Not without its downsides if
// flexibility is a top priority.
// 2) If on eager mode, and hence are seeing operations one at a time, expose
// this release of resources to the user as a Python / C++ function. This
// makes for suboptimal user experience but is efficient in terms of
// performance.
// 3) If on eager mode, and interested in keeping this bookkeeping transparent
// to the user, release all resources somewhere ... like here. This is
// not ideal since it requires a pipeline flush to make sure these objects
// are not already in use by a workload in flight. Cannot do much better
// within the constraints of this approach. Good for user experience,
// suboptimal for performance.
// 4) If on eager mode, and interested in keeping this bookkeeping transparent
// to the user, and performance does not matter, make CPU and GPU run in
// lockstep. Obviously this is just bad. Mentioned for the sake of
// completeness.

api::context()->flush();
}
else {
TORCH_INTERNAL_ASSERT(false, "Unsupported!");
}
// No queue submission here. All queue submissions must have been handled
// above either explicitly or as a result of calling tensor.host().

return self;
}
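A sketch of the Vulkan-to-CPU read path after this change, under the same assumptions as the code above; 'cpu_dst' is a hypothetical destination tensor (the real operator writes into 'self'):

// Sketch only: mirrors the copy_ read path above with placeholder names.
api::Context* const context = api::context();
api::Command::Buffer& command_buffer = context->command().pool.stream();

const vTensor& v_src = convert(src);
using Future = vTensor::Future<const void, vTensor::Access::Read>;
// host() now records any pending unpacking/transfer work into the shared
// command buffer rather than allocating and submitting one of its own.
const Future v_src_future = v_src.host<const void>(command_buffer);
// wait() submits (with a fence) only if the staging data is out of sync.
const Future::Payload v_src_payload = v_src_future.wait();

memcpy(
    cpu_dst.data_ptr<float>(),
    v_src_payload.get(),
    std::min(src.nbytes(), cpu_dst.nbytes()));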
@@ -52,11 +52,11 @@ Tensor mean(
v_input.options(),
};

api::Command::Buffer command_buffer = context->command().pool.allocate();
command_buffer.begin();
api::Command::Pool& command_pool = context->command().pool;
api::Command::Buffer& command_buffer = command_pool.stream();
{
if (v_input.has_image()) {
const struct {
if C10_LIKELY(v_input.has_image()) {
const struct Block final {
uvec3 extents;
int32_t range;
ivec2 iextents;

@@ -71,63 +71,35 @@ Tensor mean(
},
};

if (keepdim) {
context->dispatch(
command_buffer,
{
VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
},
VK_KERNEL(mean),
v_output.extents(),
// Write-only access bypasses synchronization but inserts appropriate
// barriers if necessary.
v_output.image(
command_buffer,
vTensor::Stage::Compute,
vTensor::Access::Write),
// Read-only access is implied on const tensors and triggers an async
// synchronization if necessary.
v_input.image(
command_buffer,
vTensor::Stage::Compute),
// Object lifetime is managed by the resource pool.
// It is OK not to keep track of the handle.
context->resource().pool.uniform(block).object);
}
else {
context->dispatch(
command_buffer,
{
VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
},
VK_KERNEL(mean2d),
v_output.extents(),
// Write-only access bypasses synchronization but inserts appropriate
// barriers if necessary.
v_output.image(
command_buffer,
vTensor::Stage::Compute,
vTensor::Access::Write),
// Read-only access is implied on const tensors and triggers an async
// synchronization if necessary.
v_input.image(
command_buffer,
vTensor::Stage::Compute),
// Object lifetime is managed by the resource pool.
// It is OK not to keep track of the handle.
context->resource().pool.uniform(block).object);
}
context->dispatch(
command_buffer,
{
VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
},
keepdim ? VK_KERNEL(mean) : VK_KERNEL(mean2d),
v_output.extents(),
// Write-only access bypasses synchronization but inserts appropriate
// barriers if necessary.
v_output.image(
command_buffer,
vTensor::Stage::Compute,
vTensor::Access::Write),
// Read-only access is implied on const tensors and triggers an async
// synchronization if necessary.
v_input.image(
command_buffer,
vTensor::Stage::Compute),
// Object lifetime is managed by the resource pool.
// It is OK not to keep track of the handle.
context->resource().pool.uniform(block).object);
}
else {
TORCH_CHECK(false, "Not implemented!");
}
}
command_buffer.end();
command_buffer.submit(context->gpu().queue);
command_pool.submit(context->gpu().queue, command_buffer);

return convert(v_output);
}
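The mean() change above is largely a de-duplication: the keepdim and non-keepdim branches issued identical dispatches except for the shader, so they collapse into a single call with the kernel chosen inline. A sketch of the resulting shape only, using the same names as above:

// Sketch: one dispatch, shader selected by keepdim, everything else shared.
context->dispatch(
    command_buffer,
    {
      VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
      VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
      VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
    },
    keepdim ? VK_KERNEL(mean) : VK_KERNEL(mean2d),
    v_output.extents(),
    v_output.image(command_buffer, vTensor::Stage::Compute, vTensor::Access::Write),
    v_input.image(command_buffer, vTensor::Stage::Compute),
    context->resource().pool.uniform(block).object);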
@@ -10,18 +10,21 @@ namespace {
using namespace api::utils;

vTensor pack_weights(
api::Resource::Pool& pool,
const Tensor& weight_arg) {
api::Resource::Pool& pool,
const Tensor& weight_arg) {
if (weight_arg.is_vulkan()) {
return convert(weight_arg);
}

api::Context* const context = api::context();
api::Command::Buffer& command_buffer = context->command().pool.stream();

const Tensor weight = weight_arg.contiguous();
const IntArrayRef w_sizes = weight.sizes();
const float* const src_weight_ptr = weight.data_ptr<float>();

vTensor v_weight{
api::context(),
context,
&pool,
w_sizes,
weight.options(),

@@ -29,7 +32,7 @@ vTensor pack_weights(

{
using Future = vTensor::Future<void, vTensor::Access::Write>;
Future v_weight_future = v_weight.host<void, vTensor::Access::Write>();
Future v_weight_future = v_weight.host<void, vTensor::Access::Write>(command_buffer);
Future::Payload v_weight_payload = v_weight_future.wait();

memcpy(

@@ -49,16 +52,21 @@ vTensor pack_biases(
return convert(*bias_arg);
}

api::Context* const context = api::context();
api::Command::Buffer& command_buffer = context->command().pool.stream();

vTensor v_bias{
api::context(),
context,
&pool,
{weight_arg.sizes()[Layout::Parameter::width]},
{
weight_arg.size(Layout::Parameter::width),
},
weight_arg.options(),
};

{
using Future = vTensor::Future<void, vTensor::Access::Write>;
Future v_bias_future = v_bias.host<void, vTensor::Access::Write>();
Future v_bias_future = v_bias.host<void, vTensor::Access::Write>(command_buffer);
Future::Payload v_bias_payload = v_bias_future.wait();

if (bias_arg) {

@@ -66,7 +74,8 @@ vTensor pack_biases(
v_bias_payload.get(),
bias_arg->contiguous().data_ptr<float>(),
std::min(bias_arg->nbytes(), v_bias.nbytes()));
} else {
}
else {
memset(
v_bias_payload.get(),
// 2's complement integers and IEEE-754 floating point numbers both

@@ -162,11 +171,11 @@ Tensor mm(
mat1.options(),
};

api::Command::Buffer command_buffer = context->command().pool.allocate();
command_buffer.begin();
api::Command::Pool& command_pool = context->command().pool;
api::Command::Buffer& command_buffer = command_pool.stream();
{
if (v_mat1.has_image() && v_mat2.has_image()) {
const struct {
if C10_LIKELY(v_mat1.has_image() && v_mat2.has_image()) {
const struct Block final {
uvec3 size;
int32_t K;
} block {

@@ -203,12 +212,12 @@ Tensor mm(
// Object lifetime is managed by the resource pool.
// It is OK not to keep track of the handle.
context->resource().pool.uniform(block).object);
} else {
}
else {
TORCH_CHECK(false, "Not implemented!");
}
}
command_buffer.end();
command_buffer.submit(context->gpu().queue);
command_pool.submit(context->gpu().queue, command_buffer);

return convert(v_output);
}

@@ -281,14 +290,15 @@ Tensor LinearOpContext::run(
input.options(),
};

api::Command::Buffer command_buffer = context->command().pool.allocate();
command_buffer.begin();
api::Command::Pool& command_pool = context->command().pool;
api::Command::Buffer& command_buffer = command_pool.stream();
{
if (v_output.has_image() &&
if C10_LIKELY(
v_output.has_image() &&
v_input.has_image() &&
packed_.v_weight.has_image() &&
packed_.v_bias.has_image()) {
const struct {
const struct Block final {
uvec3 size;
int32_t K;
vec2 multiplier;

@@ -341,8 +351,7 @@ Tensor LinearOpContext::run(
TORCH_CHECK(false, "Not implemented!");
}
}
command_buffer.end();
command_buffer.submit(context->gpu().queue);
command_pool.submit(context->gpu().queue, command_buffer);

return convert(v_output);
}
@@ -23,11 +23,11 @@ Tensor mul_scalar(
v_self.options(),
};

api::Command::Buffer command_buffer = context->command().pool.allocate();
command_buffer.begin();
api::Command::Pool& command_pool = context->command().pool;
api::Command::Buffer& command_buffer = command_pool.stream();
{
if (v_output.has_image() && v_self.has_image()) {
const struct {
if C10_LIKELY(v_output.has_image() && v_self.has_image()) {
const struct Block final {
uvec3 extents;
float other;
} block {

@@ -63,8 +63,7 @@ Tensor mul_scalar(
TORCH_CHECK(false, "Not implemented!");
}
}
command_buffer.end();
command_buffer.submit(context->gpu().queue);
command_pool.submit(context->gpu().queue, command_buffer);

return convert(v_output);
}

@@ -80,11 +79,11 @@ Tensor& mul_scalar_(

vTensor& v_self = convert(self);

api::Command::Buffer command_buffer = context->command().pool.allocate();
command_buffer.begin();
api::Command::Pool& command_pool = context->command().pool;
api::Command::Buffer& command_buffer = command_pool.stream();
{
if (v_self.has_image()) {
const struct {
if C10_LIKELY(v_self.has_image()) {
const struct Block final {
uvec3 extents;
float other;
} block {

@@ -114,8 +113,7 @@ Tensor& mul_scalar_(
TORCH_CHECK(false, "Not implemented!");
}
}
command_buffer.end();
command_buffer.submit(context->gpu().queue);
command_pool.submit(context->gpu().queue, command_buffer);

return self;
}
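Several of the ops in this diff also wrap their fast-path checks in C10_LIKELY. A minimal, hedged sketch of the macro in use; C10_LIKELY comes from c10/macros/Macros.h and expands to a branch-prediction hint where the compiler supports one, and the helper function below is hypothetical, shown only to illustrate the form:

#include <c10/macros/Macros.h>

// Hypothetical helper illustrating the `if C10_LIKELY(...)` form used
// throughout this diff. The macro parenthesizes its argument, so it can
// follow `if` directly without an extra set of parentheses.
bool has_fast_path(const bool output_ok, const bool input_ok) {
  if C10_LIKELY(output_ok && input_ok) {
    return true;   // expected, image-backed case
  }
  return false;    // rare fallback; the real ops raise TORCH_CHECK here
}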
@@ -33,10 +33,10 @@ Tensor adaptive_avg_pool2d(
v_self.options(),
};

api::Command::Buffer command_buffer = context->command().pool.allocate();
command_buffer.begin();
api::Command::Pool& command_pool = context->command().pool;
api::Command::Buffer& command_buffer = command_pool.stream();
{
if (v_self.has_image()) {
if C10_LIKELY(v_self.has_image()) {
const uvec3 v_output_size = v_output.extents();
const uvec3 v_self_size = v_self.extents();

@@ -45,7 +45,7 @@ Tensor adaptive_avg_pool2d(
static_cast<float>(v_self_size.data[1u]) / v_output_size.data[1u],
};

const struct {
const struct Block final {
uvec3 size;
uint32_t _;
vec2 stride;

@@ -88,8 +88,7 @@ Tensor adaptive_avg_pool2d(
TORCH_CHECK(false, "Not implemented!");
}
}
command_buffer.end();
command_buffer.submit(context->gpu().queue);
command_pool.submit(context->gpu().queue, command_buffer);

return convert(v_output);
}

@@ -171,13 +170,11 @@ Tensor avg_pool2d(
v_self.options(),
};

api::Command::Buffer command_buffer = context->command().pool.allocate();
command_buffer.begin();
api::Command::Pool& command_pool = context->command().pool;
api::Command::Buffer& command_buffer = command_pool.stream();
{
using namespace utils;

if (v_self.has_image()) {
const struct {
if C10_LIKELY(v_self.has_image()) {
const struct Block final {
uvec3 extents;
int32_t range;
ivec2 iextents;

@@ -235,8 +232,7 @@ Tensor avg_pool2d(
TORCH_CHECK(false, "Not implemented!");
}
}
command_buffer.end();
command_buffer.submit(context->gpu().queue);
command_pool.submit(context->gpu().queue, command_buffer);

return convert(v_output);
}
@@ -21,8 +21,8 @@ Tensor view(
self.options(),
};

api::Command::Buffer command_buffer = context->command().pool.allocate();
command_buffer.begin();
api::Command::Pool& command_pool = context->command().pool;
api::Command::Buffer& command_buffer = command_pool.stream();
{
command_buffer.copy(
// Read-only access is implied on const tensors and triggers an async

@@ -37,8 +37,7 @@ Tensor view(
vTensor::Stage::Transfer,
vTensor::Access::Write));
}
command_buffer.end();
command_buffer.submit(context->gpu().queue);
command_pool.submit(context->gpu().queue, command_buffer);

return convert(v_output);
}
@@ -419,31 +419,19 @@ vTensor::vTensor(
}) {
}

const vTensor* vTensor::host() const {
view_->staging(Stage::Host, Access::Read);
const vTensor* vTensor::host(
api::Command::Buffer& command_buffer) const {
view_->staging(command_buffer, Stage::Host, Access::Read);
return this;
}

vTensor* vTensor::host(const Access::Flags access) {
view_->staging(Stage::Host, access);
vTensor* vTensor::host(
api::Command::Buffer& command_buffer,
const Access::Flags access) {
view_->staging(command_buffer, Stage::Host, access);
return this;
}

vTensor::Buffer::Object vTensor::buffer(
const Stage::Flags stage) const & {
return view_->buffer(
stage,
Access::Read).object;
}

vTensor::Buffer::Object vTensor::buffer(
const Stage::Flags stage,
const Access::Flags access) & {
return view_->buffer(
stage,
access).object;
}

vTensor::Buffer::Object vTensor::buffer(
api::Command::Buffer& command_buffer,
const Stage::Flags stage) const & {

@@ -463,21 +451,6 @@ vTensor::Buffer::Object vTensor::buffer(
access).object;
}

vTensor::Image::Object vTensor::image(
const Stage::Flags stage) const & {
return view_->image(
stage,
Access::Read).object;
}

vTensor::Image::Object vTensor::image(
const Stage::Flags stage,
const Access::Flags access) & {
return view_->image(
stage,
access).object;
}

vTensor::Image::Object vTensor::image(
api::Command::Buffer& command_buffer,
const Stage::Flags stage) const & {
@@ -535,16 +508,8 @@ vTensor::View::View(
ops::verify(options);
}

// We typically do not know whether we need a command buffer to service a request
// until we have perfomed a bunch of checks in nested logic, and even then we
// may end up with the always issued state transition optimized away under
// certain conditions, which makes a policy of always allocating a command buffer
// up front, only to end up using it at times, a wasteful approach. This class
// answers that need.

class vTensor::View::CMD final {
public:
explicit CMD(const View&);
CMD(const View&, api::Command::Buffer&);
CMD(const CMD&) = delete;
CMD& operator=(const CMD&) = delete;

@@ -578,60 +543,18 @@ class vTensor::View::CMD final {
const Image::Object& image,
Buffer::Object& buffer);

void submit(Fence fence = {});

private:
api::Command::Buffer& command_buffer();
void submit(Fence fence);

private:
const View& view_;

enum class Type {
Internal,
External,
} type;

union _ final {
api::Command::Buffer internal;
api::Command::Buffer* external;
~_() {}
} command_buffer_;
api::Command::Buffer& command_buffer_;
};

vTensor::View::CMD::CMD(
const View& view)
: view_(view),
type(Type::Internal),
command_buffer_{} {
}

vTensor::View::CMD::CMD(
const View& view,
api::Command::Buffer& external)
api::Command::Buffer& command_buffer)
: view_(view),
type(Type::External),
command_buffer_{
.external = &external,
} {
}

api::Command::Buffer& vTensor::View::CMD::command_buffer() {
switch (type) {
case Type::Internal:
if (!command_buffer_.internal) {
command_buffer_.internal = view_.context_->command().pool.allocate();
command_buffer_.internal.begin();
}

return command_buffer_.internal;

case Type::External:
return *(command_buffer_.external);

default:
TORCH_INTERNAL_ASSERT(false, "Unknown command buffer type!");
break;
}
command_buffer_(command_buffer) {
}

void vTensor::View::CMD::barrier(State::Transition transition) {

@@ -761,7 +684,7 @@ void vTensor::View::CMD::barrier(State::Transition transition) {
barrier.stage.src = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT;
}

command_buffer().barrier(barrier);
command_buffer_.barrier(barrier);
}
}

@@ -789,7 +712,7 @@ void vTensor::View::CMD::copy_buffer_to_staging(
{},
}));

command_buffer().copy(buffer, staging);
command_buffer_.copy(buffer, staging);
}

void vTensor::View::CMD::copy_staging_to_buffer(

@@ -816,7 +739,7 @@ void vTensor::View::CMD::copy_staging_to_buffer(
{},
}));

command_buffer().copy(staging, buffer);
command_buffer_.copy(staging, buffer);
}

void vTensor::View::CMD::copy_buffer_to_image(

@@ -847,7 +770,7 @@ void vTensor::View::CMD::copy_buffer_to_image(
const uvec3 extents = view_.extents();
const uint32_t plane = extents.data[0u] * extents.data[1u];

const struct {
const struct Block final {
uvec3 extents;
uint32_t block;
uvec4 offset;

@@ -863,7 +786,7 @@ void vTensor::View::CMD::copy_buffer_to_image(
};

view_.context_->dispatch(
command_buffer(),
command_buffer_,
{
VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,

@@ -904,7 +827,7 @@ void vTensor::View::CMD::copy_image_to_buffer(
const uvec3 extents = view_.extents();
const uint32_t plane = extents.data[0u] * extents.data[1u];

const struct {
const struct Block final {
uvec3 extents;
uint32_t block;
uvec4 offset;

@@ -920,7 +843,7 @@ void vTensor::View::CMD::copy_image_to_buffer(
};

view_.context_->dispatch(
command_buffer(),
command_buffer_,
{
VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,

@@ -934,10 +857,10 @@ void vTensor::View::CMD::copy_image_to_buffer(
}

void vTensor::View::CMD::submit(const api::Resource::Fence fence) {
if ((Type::Internal == type) && command_buffer_.internal) {
command_buffer_.internal.end();
command_buffer_.internal.submit(view_.context_->gpu().queue, fence);
}
view_.context_->command().pool.submit(
view_.context_->gpu().queue,
command_buffer_,
fence);
}

vTensor::Buffer& vTensor::View::buffer() const {
@@ -953,38 +876,28 @@ vTensor::Buffer& vTensor::View::buffer() const {
}

vTensor::Buffer& vTensor::View::buffer(
api::Command::Buffer& command_buffer,
const Stage::Flags stage,
const Access::Flags access) const {
CMD command_buffer(*this);
Buffer& buffer = this->buffer(command_buffer, stage, access);
command_buffer.submit();

return buffer;
CMD cmd(*this, command_buffer);
return buffer(cmd, stage, access);
}

vTensor::Buffer& vTensor::View::buffer(
api::Command::Buffer& command_buffer_,
const Stage::Flags stage,
const Access::Flags access) const {
CMD command_buffer(*this, command_buffer_);
return buffer(command_buffer, stage, access);
}

vTensor::Buffer& vTensor::View::buffer(
CMD& command_buffer,
CMD& cmd,
const Stage::Flags stage,
const Access::Flags access) const {
if ((access & Access::Read) && state_.is_dirty(Component::Buffer)) {
if (state_.is_clean(Component::Staging)) {
command_buffer.copy_staging_to_buffer(
cmd.copy_staging_to_buffer(
state_,
staging(command_buffer, Stage::Transfer, Access::Read).object,
staging(cmd, Stage::Transfer, Access::Read).object,
buffer().object);
}
else if (state_.is_clean(Component::Image)) {
command_buffer.copy_image_to_buffer(
cmd.copy_image_to_buffer(
state_,
image(command_buffer, Stage::Compute, Access::Read).object,
image(cmd, Stage::Compute, Access::Read).object,
buffer().object);
}
else {

@@ -994,7 +907,7 @@ vTensor::Buffer& vTensor::View::buffer(
}
}

command_buffer.barrier(
cmd.barrier(
state_.transition({
// Staging
{},

@@ -1028,35 +941,25 @@ vTensor::Image& vTensor::View::image() const {
}

vTensor::Image& vTensor::View::image(
api::Command::Buffer& command_buffer,
const Stage::Flags stage,
const Access::Flags access) const {
CMD command_buffer(*this);
Image& image = this->image(command_buffer, stage, access);
command_buffer.submit();

return image;
CMD cmd(*this, command_buffer);
return image(cmd, stage, access);
}

vTensor::Image& vTensor::View::image(
api::Command::Buffer& command_buffer_,
const Stage::Flags stage,
const Access::Flags access) const {
CMD command_buffer(*this, command_buffer_);
return image(command_buffer, stage, access);
}

vTensor::Image& vTensor::View::image(
CMD& command_buffer,
CMD& cmd,
const Stage::Flags stage,
const Access::Flags access) const {
if ((access & Access::Read) && state_.is_dirty(Component::Image)) {
command_buffer.copy_buffer_to_image(
cmd.copy_buffer_to_image(
state_,
buffer(command_buffer, stage, Access::Read).object,
buffer(cmd, stage, Access::Read).object,
image().object);
}

command_buffer.barrier(
cmd.barrier(
state_.transition({
// Staging
{},

@@ -1096,27 +999,28 @@ vTensor::Buffer& vTensor::View::staging() const {
}

vTensor::Buffer& vTensor::View::staging(
api::Command::Buffer& command_buffer,
const Stage::Flags stage,
const Access::Flags access) const {
CMD command_buffer(*this);
Buffer& staging = this->staging(command_buffer, stage, access);
command_buffer.submit(fence());
CMD cmd(*this, command_buffer);
Buffer& staging = this->staging(cmd, stage, access);
cmd.submit(fence(access));

return staging;
}

vTensor::Buffer& vTensor::View::staging(
CMD& command_buffer,
CMD& cmd,
const Stage::Flags stage,
const Access::Flags access) const {
if ((access & Access::Read) && state_.is_dirty(Component::Staging)) {
command_buffer.copy_buffer_to_staging(
cmd.copy_buffer_to_staging(
state_,
buffer(command_buffer, Stage::Transfer, Access::Read).object,
buffer(cmd, Stage::Transfer, Access::Read).object,
staging().object);
}

command_buffer.barrier(
cmd.barrier(
state_.transition({
// Staging
{

@@ -1138,6 +1042,14 @@ vTensor::Buffer& vTensor::View::staging(
return staging();
}

vTensor::Fence& vTensor::View::fence(const Access::Flags access) const {
if (access & Access::Read) {
fence_ = allocate_fence(&context_->resource().pool);
}

return fence_;
}

vTensor::Memory& vTensor::View::wait() const {
if (fence_) {
fence_.wait();

@@ -1146,10 +1058,6 @@ vTensor::Memory& vTensor::View::wait() const {
return staging().memory;
}

vTensor::Fence& vTensor::View::fence() const {
return (fence_ = allocate_fence(pool_));
}

void vTensor::View::verify() const {
TORCH_INTERNAL_ASSERT(!image_ || state_.is_available(Component::Image));
TORCH_INTERNAL_ASSERT(!staging_ || state_.is_discrete());
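Since every call site now hands vTensor::View a command buffer, the CMD helper above no longer needs the internal/external union, the lazy allocation, or the type switch: it simply borrows a reference and routes submission through the pool. A condensed, illustrative view of the post-change wrapper (barrier and copy members omitted; not additional code in the commit):

// Condensed sketch of the post-change CMD wrapper shown above.
class vTensor::View::CMD final {
 public:
  CMD(const View& view, api::Command::Buffer& command_buffer)
    : view_(view),
      command_buffer_(command_buffer) {  // borrowed, never owned
  }
  CMD(const CMD&) = delete;
  CMD& operator=(const CMD&) = delete;

  void submit(const Fence fence) {
    // Submission is routed through the command pool so it can batch work
    // and control the actual queue-submission rate.
    view_.context_->command().pool.submit(
        view_.context_->gpu().queue,
        command_buffer_,
        fence);
  }

 private:
  const View& view_;
  api::Command::Buffer& command_buffer_;
};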
@@ -157,10 +157,10 @@ class vTensor final {
*/

template<typename Type>
Future<Type, Access::Read> host() const &;
Future<Type, Access::Read> host(api::Command::Buffer&) const &;

template<typename Type, Access::Flags kAccess>
Future<Type, kAccess> host() &;
Future<Type, kAccess> host(api::Command::Buffer&) &;

/*
Device access - these functions will be expensive if they trigger a buffer

@@ -178,14 +178,10 @@ class vTensor final {
predictability of usage and efficiency.
*/

Buffer::Object buffer(Stage::Flags) const &;
Buffer::Object buffer(Stage::Flags, Access::Flags) &;
Buffer::Object buffer(api::Command::Buffer&, Stage::Flags) const &;
Buffer::Object buffer(api::Command::Buffer&, Stage::Flags, Access::Flags) &;

bool has_image() const;
Image::Object image(Stage::Flags) const &;
Image::Object image(Stage::Flags, Access::Flags) &;
Image::Object image(api::Command::Buffer&, Stage::Flags) const &;
Image::Object image(api::Command::Buffer&, Stage::Flags, Access::Flags) &;

@@ -210,26 +206,22 @@ class vTensor final {
Host
*/

const vTensor* host() const;
vTensor* host(Access::Flags access);
const vTensor* host(api::Command::Buffer&) const;
vTensor* host(api::Command::Buffer&, Access::Flags);

template<typename Type>
Future<Type, Access::Read> host() const && = delete;
Future<Type, Access::Read> host(api::Command::Buffer&) const && = delete;

template<typename Type, Access::Flags kAccess>
Future<Type, kAccess> host() && = delete;
Future<Type, kAccess> host(api::Command::Buffer&) && = delete;

/*
Device
*/

Buffer::Object buffer(Stage::Flags) const && = delete;
Buffer::Object buffer(Stage::Flags, Access::Flags) && = delete;
Buffer::Object buffer(api::Command::Buffer&, Stage::Flags) const && = delete;
Buffer::Object buffer(api::Command::Buffer&, Stage::Flags, Access::Flags) && = delete;

Image::Object image(Stage::Flags) const && = delete;
Image::Object image(Stage::Flags, Access::Flags) && = delete;
Image::Object image(api::Command::Buffer&, Stage::Flags) const && = delete;
Image::Object image(api::Command::Buffer&, Stage::Flags, Access::Flags) && = delete;

@@ -249,21 +241,22 @@ class vTensor final {
~View() = default;

/*
Device
Buffer
*/

Buffer& buffer(Stage::Flags, Access::Flags) const;
Buffer& buffer(api::Command::Buffer&, Stage::Flags, Access::Flags) const;

/*
Image
*/

bool has_image() const;
Image& image(Stage::Flags, Access::Flags) const;
Image& image(api::Command::Buffer&, Stage::Flags, Access::Flags) const;

/*
Host
*/

Buffer& staging(Stage::Flags, Access::Flags) const;
Buffer& staging(api::Command::Buffer&, Stage::Flags, Access::Flags) const;
vTensor::Memory& wait() const;

@@ -343,7 +336,7 @@ class vTensor final {
Image& image(CMD&, Stage::Flags, Access::Flags) const;
Buffer& staging() const;
Buffer& staging(CMD&, Stage::Flags, Access::Flags) const;
Fence& fence() const;
Fence& fence(Access::Flags) const;

// Validation
void verify() const;

@@ -485,13 +478,15 @@ vTensor::Future<Type, kAccess>::wait() const & {
}

template<typename Type>
inline vTensor::Future<Type, vTensor::Access::Read> vTensor::host() const & {
return Future<Type, vTensor::Access::Read>(host());
inline vTensor::Future<Type, vTensor::Access::Read>
vTensor::host(api::Command::Buffer& command_buffer) const & {
return Future<Type, vTensor::Access::Read>(host(command_buffer));
}

template<typename Type, vTensor::Access::Flags kAccess>
inline vTensor::Future<Type, kAccess> vTensor::host() & {
return Future<Type, kAccess>(host(kAccess));
inline vTensor::Future<Type, kAccess>
vTensor::host(api::Command::Buffer& command_buffer) & {
return Future<Type, kAccess>(host(command_buffer, kAccess));
}

inline bool vTensor::has_image() const {
@@ -36,11 +36,11 @@ Tensor upsample_nearest2d(
input.options(),
};

api::Command::Buffer command_buffer = context->command().pool.allocate();
command_buffer.begin();
api::Command::Pool& command_pool = context->command().pool;
api::Command::Buffer& command_buffer = command_pool.stream();
{
if (v_input.has_image()) {
const struct {
if C10_LIKELY(v_input.has_image()) {
const struct Block final {
uvec3 extents;
uint32_t _;
ivec2 iextents;

@@ -92,8 +92,7 @@ Tensor upsample_nearest2d(
TORCH_CHECK(false, "Not implemented!");
}
}
command_buffer.end();
command_buffer.submit(context->gpu().queue);
command_pool.submit(context->gpu().queue, command_buffer);

return convert(v_output);
}
@@ -10,7 +10,7 @@ namespace vulkan {
namespace ops {
namespace utils {

int64_t normalize(
inline int64_t normalize(
const int64_t dimension,
const int64_t n) {
return (dimension % n + n) % n;