Optimize Vulkan command buffer submission rate. (#49112)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49112

Differential Revision: D25729889

Test Plan: Imported from OSS

Reviewed By: SS-JIA

Pulled By: AshkanAliabadi

fbshipit-source-id: c4ab470fdcf3f83745971986f3a44a3dff69287f
Ashkan Aliabadi 2021-01-08 16:36:28 -08:00 committed by Facebook GitHub Bot
parent aa18d17455
commit 1c12cbea90
30 changed files with 1060 additions and 961 deletions
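
The hunks below move the Vulkan ops from one queue submission per operator (allocate, begin, record, end, submit) to recording into a pool-owned command stream that is flushed only when a fence is requested or a per-stream counter crosses Configuration::kSubmit. The following standalone sketch models just that flush policy; StreamState, should_flush, and kSubmit are illustrative stand-ins, not the actual at::native::vulkan::api types.

// Simplified model of the submission-rate policy introduced by this commit.
// StreamState and should_flush() are illustrative, not the PyTorch API.
#include <cstdint>
#include <iostream>

struct StreamState {
  bool has_open_buffer;  // a command buffer is currently being recorded into
  uint32_t counter;      // submissions requested since the last flush
};

constexpr uint32_t kSubmit = 10u;  // mirrors Command::Pool::Configuration::kSubmit

// Decide whether the accumulated work should be handed to the driver now.
bool should_flush(StreamState& stream, const bool fence_requested) {
  if (!stream.has_open_buffer) {
    return false;  // nothing recorded yet
  }
  // Flush if the caller needs the results (fence), or once past the cutoff
  // so the GPU is not starved of work.
  if (fence_requested || (stream.counter++ > kSubmit)) {
    stream.has_open_buffer = false;
    stream.counter = 0u;
    return true;
  }
  return false;  // keep accumulating commands into the same buffer
}

int main() {
  StreamState stream{true, 0u};
  for (int op = 0; op < 15; ++op) {
    if (should_flush(stream, /*fence_requested=*/false)) {
      std::cout << "flush after op " << op << "\n";
      stream.has_open_buffer = true;  // a fresh stream buffer is begun lazily
    }
  }
  // A fence (e.g. the host reads results back) forces an immediate flush.
  std::cout << "fence flush: " << should_flush(stream, true) << "\n";
  return 0;
}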

View File

@ -207,7 +207,7 @@ cmake_dependent_option(
USE_VALGRIND "Use Valgrind. Only available on Linux." ON
"LINUX" OFF)
option(USE_VULKAN "Use Vulkan GPU backend" OFF)
option(USE_VULKAN_FP16_INFERENCE "Vulkan - Use fp16 inference even on fp32 tensors" ON)
option(USE_VULKAN_FP16_INFERENCE "Vulkan - Use fp16 inference even on fp32 tensors" OFF)
option(USE_VULKAN_RELAXED_PRECISION "Vulkan - Use relaxed precision math in the kernels (mediump)" OFF)
option(USE_VULKAN_SHADERC_RUNTIME "Vulkan - Use runtime shader compilation (needs libshaderc)" OFF)
option(USE_VULKAN_WRAPPER "Vulkan - Dynamically load Vulkan functions" ON)

View File

@ -62,6 +62,10 @@ class Cache final {
Factory factory_;
};
//
// Impl
//
template<typename Factory>
inline Cache<Factory>::Cache(Factory factory)
: factory_(std::move(factory)) {

View File

@ -76,6 +76,25 @@ Command::Buffer::Buffer(const VkCommandBuffer command_buffer)
"Invalid Vulkan command buffer!");
}
Command::Buffer::Buffer(Buffer&& buffer)
: command_buffer_(std::move(buffer.command_buffer_)),
bound_(std::move(buffer.bound_)),
barriers_(std::move(buffer.barriers_)) {
buffer.invalidate();
}
Command::Buffer& Command::Buffer::operator=(Buffer&& buffer) {
if (&buffer != this) {
command_buffer_ = std::move(buffer.command_buffer_);
bound_ = std::move(buffer.bound_);
barriers_ = std::move(buffer.barriers_);
buffer.invalidate();
};
return *this;
}
void Command::Buffer::Buffer::begin() {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
command_buffer_,
@ -107,69 +126,6 @@ void Command::Buffer::Buffer::end() {
VK_CHECK(vkEndCommandBuffer(command_buffer_));
}
void Command::Buffer::barrier() {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
command_buffer_,
"This command buffer is in an invalid state! "
"Potential reason: This command buffer is moved from.");
if (barriers_.stage) {
c10::SmallVector<VkBufferMemoryBarrier, 4u> buffer_memory_barriers;
for (const Resource::Buffer::Barrier& barrier : barriers_.buffers) {
buffer_memory_barriers.push_back({
VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
nullptr,
barrier.memory.src,
barrier.memory.dst,
VK_QUEUE_FAMILY_IGNORED,
VK_QUEUE_FAMILY_IGNORED,
barrier.object.handle,
barrier.object.offset,
barrier.object.range,
});
}
c10::SmallVector<VkImageMemoryBarrier, 4u> image_memory_barriers;
for (const Resource::Image::Barrier& barrier : barriers_.images) {
image_memory_barriers.push_back({
VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
nullptr,
barrier.memory.src,
barrier.memory.dst,
barrier.layout.src,
barrier.layout.dst,
VK_QUEUE_FAMILY_IGNORED,
VK_QUEUE_FAMILY_IGNORED,
barrier.object.handle,
{
VK_IMAGE_ASPECT_COLOR_BIT,
0u,
VK_REMAINING_MIP_LEVELS,
0u,
VK_REMAINING_ARRAY_LAYERS,
},
});
}
vkCmdPipelineBarrier(
command_buffer_,
barriers_.stage.src,
barriers_.stage.dst,
0u,
0u,
nullptr,
buffer_memory_barriers.size(),
buffer_memory_barriers.data(),
image_memory_barriers.size(),
image_memory_barriers.data());
}
// Reset
barriers_.reset();
}
void Command::Buffer::barrier(const Pipeline::Barrier& barrier) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
command_buffer_,
@ -291,31 +247,86 @@ void Command::Buffer::dispatch(
bound_.pipeline.local_work_group.data[2u]));
}
void Command::Buffer::submit(
const VkQueue queue,
const Resource::Fence fence) {
void Command::Buffer::barrier() {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
command_buffer_,
"This command buffer is in an invalid state! "
"Potential reason: This command buffer is moved from.");
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
queue,
"Invalid Vulkan queue!");
if (barriers_.stage) {
c10::SmallVector<VkBufferMemoryBarrier, 4u> buffer_memory_barriers;
const VkSubmitInfo submit_info{
VK_STRUCTURE_TYPE_SUBMIT_INFO,
nullptr,
0u,
nullptr,
nullptr,
1u,
&command_buffer_,
0u,
nullptr,
};
for (const Resource::Buffer::Barrier& barrier : barriers_.buffers) {
buffer_memory_barriers.push_back({
VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
nullptr,
barrier.memory.src,
barrier.memory.dst,
VK_QUEUE_FAMILY_IGNORED,
VK_QUEUE_FAMILY_IGNORED,
barrier.object.handle,
barrier.object.offset,
barrier.object.range,
});
}
VK_CHECK(vkQueueSubmit(queue, 1u, &submit_info, fence.handle()));
c10::SmallVector<VkImageMemoryBarrier, 4u> image_memory_barriers;
for (const Resource::Image::Barrier& barrier : barriers_.images) {
image_memory_barriers.push_back({
VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
nullptr,
barrier.memory.src,
barrier.memory.dst,
barrier.layout.src,
barrier.layout.dst,
VK_QUEUE_FAMILY_IGNORED,
VK_QUEUE_FAMILY_IGNORED,
barrier.object.handle,
{
VK_IMAGE_ASPECT_COLOR_BIT,
0u,
VK_REMAINING_MIP_LEVELS,
0u,
VK_REMAINING_ARRAY_LAYERS,
},
});
}
vkCmdPipelineBarrier(
command_buffer_,
barriers_.stage.src,
barriers_.stage.dst,
0u,
0u,
nullptr,
buffer_memory_barriers.size(),
buffer_memory_barriers.data(),
image_memory_barriers.size(),
image_memory_barriers.data());
}
// Reset
barriers_.reset();
}
void Command::Buffer::invalidate() {
command_buffer_ = VK_NULL_HANDLE;
}
inline void Command::Buffer::Bound::reset() {
pipeline = {};
descriptor_set = VK_NULL_HANDLE;
}
inline Command::Buffer::Barrier::Stage::operator bool() const {
return (0u != src) || (0u != dst);
}
inline void Command::Buffer::Barrier::reset() {
stage = {};
buffers.clear();
images.clear();
}
Command::Pool::Pool(const GPU& gpu)
@ -338,8 +349,9 @@ Command::Pool::Pool(const GPU& gpu)
Command::Pool::Pool(Pool&& pool)
: device_(std::move(pool.device_)),
command_pool_(std::move(pool.command_pool_)),
buffer_(std::move(pool.buffer_)) {
pool.device_ = VK_NULL_HANDLE;
buffer_(std::move(pool.buffer_)),
stream_(std::move(pool.stream_)) {
pool.invalidate();
}
Command::Pool& Command::Pool::operator=(Pool&& pool) {
@ -347,8 +359,9 @@ Command::Pool& Command::Pool::operator=(Pool&& pool) {
device_ = std::move(pool.device_);
command_pool_ = std::move(pool.command_pool_);
buffer_ = std::move(pool.buffer_);
stream_ = std::move(pool.stream_);
pool.device_ = VK_NULL_HANDLE;
pool.invalidate();
};
return *this;
@ -383,25 +396,109 @@ Command::Buffer Command::Pool::allocate() {
Configuration::kQuantum);
allocate_command_buffers(
device_,
command_pool_.get(),
buffer_.pool.data() + buffer_.in_use,
Configuration::kQuantum);
device_,
command_pool_.get(),
buffer_.pool.data() + buffer_.in_use,
Configuration::kQuantum);
}
return Buffer(buffer_.pool[buffer_.in_use++]);
}
Command::Buffer& Command::Pool::stream() {
if (!stream_.buffer) {
stream_.buffer = allocate();
stream_.buffer.begin();
stream_.counter = 0u;
}
return stream_.buffer;
}
void Command::Pool::purge() {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
device_ && command_pool_,
"This command pool is in an invalid state! "
"Potential reason: This command pool is moved from.");
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
!stream_.buffer,
"Pending command buffer detected. Make sure all command buffers are "
"submitted to the queue for execution prior to reclaiming pool memory.");
buffer_.in_use = 0u;
VK_CHECK(vkResetCommandPool(device_, command_pool_.get(), 0u));
}
void Command::Pool::submit(
const VkQueue queue,
const c10::ArrayRef<const Buffer> buffers,
const Resource::Fence fence) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
device_ && command_pool_,
"This command pool is in an invalid state! "
"Potential reason: This command pool is moved from.");
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
queue,
"Invalid Vulkan queue!");
c10::SmallVector<VkCommandBuffer, Configuration::kReserve> command_buffers;
command_buffers.reserve(buffers.size());
for (const Buffer& buffer : buffers) {
VkCommandBuffer command_buffer = buffer.handle();
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
command_buffer,
"Invalid Vulkan command buffer!");
// Are we submitting our one and only command stream, or a regular command
// buffer whose scope is manually maintained by the user? Automatically
// maintain state and submission rate if the former.
if (stream_.buffer.handle() == command_buffer) {
// Hand the stream off to the driver if:
// - The user has implicitly signaled interest in the results via a fence.
// - We are over the submission cutoff. We don't want to starve the GPU.
if (fence || (stream_.counter++ > Configuration::kSubmit)) {
stream_.buffer.end();
stream_.buffer.invalidate();
}
// Skip - Accumulate more calls prior to submission.
else {
command_buffer = VK_NULL_HANDLE;
}
}
if (command_buffer) {
command_buffers.push_back(command_buffer);
}
}
if (!command_buffers.empty()) {
const VkSubmitInfo submit_info{
VK_STRUCTURE_TYPE_SUBMIT_INFO,
nullptr,
0u,
nullptr,
nullptr,
command_buffers.size(),
command_buffers.data(),
0u,
nullptr,
};
VK_CHECK(vkQueueSubmit(queue, 1u, &submit_info, fence.handle()));
}
}
void Command::Pool::invalidate() {
device_ = VK_NULL_HANDLE;
command_pool_.reset();
}
} // namespace api
} // namespace vulkan
} // namespace native
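
Command::Pool::submit above folds every pending command buffer into a single VkSubmitInfo, so the driver sees one vkQueueSubmit per flush rather than one per operator. A hedged sketch of that batching pattern in plain Vulkan follows; it assumes a valid queue and already-recorded command buffers, and reduces error handling to the returned VkResult.

// Batching several recorded command buffers into a single vkQueueSubmit call.
#include <vulkan/vulkan.h>
#include <vector>

VkResult submit_batched(
    const VkQueue queue,
    const std::vector<VkCommandBuffer>& command_buffers,
    const VkFence fence /* may be VK_NULL_HANDLE */) {
  if (command_buffers.empty()) {
    return VK_SUCCESS;  // nothing to hand to the driver
  }
  const VkSubmitInfo submit_info{
    VK_STRUCTURE_TYPE_SUBMIT_INFO,
    nullptr,                                        // pNext
    0u, nullptr, nullptr,                           // no wait semaphores
    static_cast<uint32_t>(command_buffers.size()),  // commandBufferCount
    command_buffers.data(),                         // pCommandBuffers
    0u, nullptr,                                    // no signal semaphores
  };
  // One driver round trip for the whole batch instead of one per buffer.
  return vkQueueSubmit(queue, 1u, &submit_info, fence);
}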

View File

@ -7,6 +7,7 @@
#include <ATen/native/vulkan/api/Pipeline.h>
#include <ATen/native/vulkan/api/Resource.h>
#include <ATen/native/vulkan/api/Shader.h>
#include <c10/util/ArrayRef.h>
namespace at {
namespace native {
@ -14,13 +15,15 @@ namespace vulkan {
namespace api {
struct Command final {
class Pool;
//
// Buffer
//
class Buffer final {
public:
Buffer(VkCommandBuffer command_buffer = VK_NULL_HANDLE);
explicit Buffer(VkCommandBuffer command_buffer = VK_NULL_HANDLE);
Buffer(const Buffer&) = delete;
Buffer& operator=(const Buffer&) = delete;
Buffer(Buffer&&);
@ -28,18 +31,22 @@ struct Command final {
~Buffer() = default;
operator bool() const;
VkCommandBuffer handle() const;
void begin();
void end();
void barrier(const Pipeline::Barrier& barrier);
void bind(const Pipeline::Object& pipeline);
void bind(const Descriptor::Set& set);
void copy(Resource::Buffer::Object source, Resource::Buffer::Object destination);
void dispatch(const Shader::WorkGroup& global_work_group);
void submit(VkQueue queue, Resource::Fence fence = {});
private:
friend class Pool;
void barrier();
void invalidate();
private:
VkCommandBuffer command_buffer_;
@ -80,12 +87,22 @@ struct Command final {
~Pool();
Buffer allocate();
Buffer& stream();
void purge();
void submit(
VkQueue queue,
c10::ArrayRef<const Buffer> buffers,
Resource::Fence fence = {});
private:
void invalidate();
private:
struct Configuration final {
static constexpr uint32_t kQuantum = 64u;
static constexpr uint32_t kReserve = 1024u;
static constexpr uint32_t kQuantum = 4u;
static constexpr uint32_t kReserve = 16u;
static constexpr uint32_t kSubmit = 10u;
};
VkDevice device_;
@ -95,6 +112,11 @@ struct Command final {
std::vector<VkCommandBuffer> pool;
size_t in_use;
} buffer_;
struct {
Buffer buffer;
uint32_t counter;
} stream_;
} pool /* [thread_count] */;
explicit Command(const GPU& gpu)
@ -106,43 +128,12 @@ struct Command final {
// Impl
//
inline Command::Buffer::Buffer(Buffer&& buffer)
: command_buffer_(std::move(buffer.command_buffer_)),
bound_(std::move(buffer.bound_)),
barriers_(std::move(buffer.barriers_)) {
buffer.command_buffer_ = VK_NULL_HANDLE;
}
inline Command::Buffer& Command::Buffer::operator=(Buffer&& buffer) {
if (&buffer != this) {
command_buffer_ = std::move(buffer.command_buffer_);
bound_ = std::move(buffer.bound_);
barriers_ = std::move(buffer.barriers_);
buffer.command_buffer_ = VK_NULL_HANDLE;
};
return *this;
}
inline Command::Buffer::operator bool() const {
return VK_NULL_HANDLE != command_buffer_;
}
inline void Command::Buffer::Bound::reset() {
pipeline = {};
descriptor_set = VK_NULL_HANDLE;
}
inline Command::Buffer::Barrier::Stage::operator bool() const {
return (0u != src) ||
(0u != dst);
}
inline void Command::Buffer::Barrier::reset() {
stage = {};
buffers.clear();
images.clear();
inline VkCommandBuffer Command::Buffer::handle() const {
return command_buffer_;
}
} // namespace api

View File

@ -6,10 +6,17 @@
#ifdef USE_VULKAN_SHADERC_RUNTIME
#include <ATen/native/vulkan/glsl.h>
#define VK_KERNEL(name) { name##_glsl, }
#define VK_KERNEL(name) \
::at::native::vulkan::api::Shader::Descriptor{ \
name##_glsl, \
}
#else
#include <ATen/native/vulkan/spv.h>
#define VK_KERNEL(name) { name##_spv, name##_spv_len, }
#define VK_KERNEL(name) \
::at::native::vulkan::api::Shader::Descriptor{ \
name##_spv, \
name##_spv_len, \
}
#endif /* USE_VULKAN_SHADERC_RUNTIME */
#ifdef USE_VULKAN_WRAPPER
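
A plausible reason for spelling out the Shader::Descriptor type inside VK_KERNEL (an inference, not something stated in the diff) is that a bare braced-init-list only works where the destination type is already fixed, while a typed temporary also survives template deduction and auto. The standalone illustration below uses made-up stand-ins; Descriptor, demo_spv, and demo_spv_len are hypothetical, not the PyTorch symbols.

// Why naming the type in the macro matters: a braced list has no type of its
// own, so it cannot be passed through a deduced template parameter.
#include <cstddef>
#include <cstdint>

struct Descriptor final {
  const uint32_t* spirv;
  std::size_t size;
};

#define MAKE_KERNEL_BARE(name)  { name##_spv, name##_spv_len, }
#define MAKE_KERNEL_TYPED(name) Descriptor{ name##_spv, name##_spv_len, }

static const uint32_t demo_spv[] = {0x07230203u};  // SPIR-V magic number only
static const std::size_t demo_spv_len = sizeof(demo_spv);

template <typename T>
std::size_t payload_size(const T& descriptor) {
  return descriptor.size;
}

int main() {
  // payload_size(MAKE_KERNEL_BARE(demo));  // would not compile: no type to deduce
  return (payload_size(MAKE_KERNEL_TYPED(demo)) == sizeof(demo_spv)) ? 0 : 1;
}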

View File

@ -43,6 +43,40 @@ VkDevice create_device(
&queue_priorities,
};
uint32_t device_extension_properties_count = 0;
VK_CHECK(vkEnumerateDeviceExtensionProperties(
physical_device,
nullptr,
&device_extension_properties_count,
nullptr));
std::vector<VkExtensionProperties> device_extension_properties(
device_extension_properties_count);
VK_CHECK(vkEnumerateDeviceExtensionProperties(
physical_device,
nullptr,
&device_extension_properties_count,
device_extension_properties.data()));
constexpr const char* const requested_device_extensions[]{
#ifdef VK_KHR_portability_subset
// https://vulkan.lunarg.com/doc/view/1.2.162.0/mac/1.2-extensions/vkspec.html#VUID-VkDeviceCreateInfo-pProperties-04451
VK_KHR_PORTABILITY_SUBSET_EXTENSION_NAME,
#endif
};
std::vector<const char*> enabled_device_extensions;
for (const auto& requested_device_extension : requested_device_extensions) {
for (const auto& extension : device_extension_properties) {
if (strcmp(requested_device_extension, extension.extensionName) == 0) {
enabled_device_extensions.push_back(requested_device_extension);
break;
}
}
}
const VkDeviceCreateInfo device_create_info{
VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO,
nullptr,
@ -51,7 +85,8 @@ VkDevice create_device(
&device_queue_create_info,
0u,
nullptr,
0u,
static_cast<uint32_t>(enabled_device_extensions.size()),
enabled_device_extensions.data(),
nullptr,
};
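
The new device-extension handshake above follows the usual Vulkan pattern: enumerate what the implementation reports, intersect that with a requested list, and enable only the intersection (here, VK_KHR_portability_subset when the driver exposes it). A minimal standalone version of the filtering step, stripped of Vulkan types so it compiles on its own:

// Enable only the requested names that the driver actually reports.
#include <cstring>
#include <iostream>
#include <vector>

std::vector<const char*> filter_supported(
    const std::vector<const char*>& requested,
    const std::vector<const char*>& available) {
  std::vector<const char*> enabled;
  for (const char* const request : requested) {
    for (const char* const candidate : available) {
      if (0 == std::strcmp(request, candidate)) {
        enabled.push_back(request);  // supported, safe to pass to the driver
        break;
      }
    }
  }
  return enabled;
}

int main() {
  const std::vector<const char*> requested{"VK_KHR_portability_subset"};
  const std::vector<const char*> available{
      "VK_KHR_swapchain", "VK_KHR_portability_subset"};
  for (const char* const name : filter_supported(requested, available)) {
    std::cout << name << "\n";
  }
  return 0;
}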

View File

@ -128,27 +128,25 @@ Descriptor::Set::Set(
"Invalid Vulkan descriptor set!");
}
void Descriptor::Set::update(const Item& item) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
device_ && descriptor_set_,
"This descriptor set is in an invalid state! "
"Potential reason: This descriptor set is moved from.");
Descriptor::Set::Set(Set&& set)
: device_(std::move(set.device_)),
descriptor_set_(std::move(set.descriptor_set_)),
shader_layout_signature_(std::move(set.shader_layout_signature_)),
bindings_(std::move(set.bindings_)) {
set.invalidate();
}
const auto items_itr = std::find_if(
bindings_.items.begin(),
bindings_.items.end(),
[binding = item.binding](const Item& other) {
return other.binding == binding;
});
Descriptor::Set& Descriptor::Set::operator=(Set&& set) {
if (&set != this) {
device_ = std::move(set.device_);
descriptor_set_ = std::move(set.descriptor_set_);
shader_layout_signature_ = std::move(set.shader_layout_signature_);
bindings_ = std::move(set.bindings_);
if (bindings_.items.end() == items_itr) {
bindings_.items.emplace_back(item);
}
else {
*items_itr = item;
}
set.invalidate();
};
bindings_.dirty = true;
return *this;
}
Descriptor::Set& Descriptor::Set::bind(
@ -276,12 +274,39 @@ VkDescriptorSet Descriptor::Set::handle() const {
return descriptor_set_;
}
void Descriptor::Set::invalidate() {
device_ = VK_NULL_HANDLE;
descriptor_set_ = VK_NULL_HANDLE;
}
void Descriptor::Set::update(const Item& item) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
device_ && descriptor_set_,
"This descriptor set is in an invalid state! "
"Potential reason: This descriptor set is moved from.");
const auto items_itr = std::find_if(
bindings_.items.begin(),
bindings_.items.end(),
[binding = item.binding](const Item& other) {
return other.binding == binding;
});
if (bindings_.items.end() == items_itr) {
bindings_.items.emplace_back(item);
}
else {
*items_itr = item;
}
bindings_.dirty = true;
}
Descriptor::Pool::Pool(const GPU& gpu)
: device_(gpu.device),
descriptor_pool_(
create_descriptor_pool(gpu.device),
VK_DELETER(DescriptorPool)(device_)),
set_{} {
VK_DELETER(DescriptorPool)(device_)) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
device_,
"Invalid Vulkan device!");
@ -295,7 +320,7 @@ Descriptor::Pool::Pool(Pool&& pool)
: device_(std::move(pool.device_)),
descriptor_pool_(std::move(pool.descriptor_pool_)),
set_(std::move(pool.set_)) {
pool.device_ = VK_NULL_HANDLE;
pool.invalidate();
}
Descriptor::Pool& Descriptor::Pool::operator=(Pool&& pool) {
@ -304,7 +329,7 @@ Descriptor::Pool& Descriptor::Pool::operator=(Pool&& pool) {
descriptor_pool_ = std::move(pool.descriptor_pool_);
set_ = std::move(pool.set_);
pool.device_ = VK_NULL_HANDLE;
pool.invalidate();
};
return *this;
@ -371,8 +396,13 @@ void Descriptor::Pool::purge() {
"This descriptor pool is in an invalid state! "
"Potential reason: This descriptor pool is moved from.");
set_.layouts.clear();
VK_CHECK(vkResetDescriptorPool(device_, descriptor_pool_.get(), 0u));
set_.layouts.clear();
}
void Descriptor::Pool::invalidate() {
device_ = VK_NULL_HANDLE;
descriptor_pool_.reset();
}
} // namespace api
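
Several wrappers touched by this commit (Command::Buffer, Command::Pool, Descriptor::Set, Descriptor::Pool, Resource::Pool) converge on the same move semantics: the move constructor and move assignment delegate to a private invalidate() so the moved-from object ends up in one well-defined hollow state. A generic sketch of that idiom, using an illustrative Handle alias rather than real Vulkan handle types:

// Invalidate-on-move idiom: teardown code can tell live objects from hollow ones.
#include <utility>

using Handle = void*;  // stand-in for VkDescriptorSet, VkCommandBuffer, ...
constexpr Handle kNullHandle = nullptr;

class Wrapper final {
 public:
  explicit Wrapper(const Handle handle) : handle_(handle) {}
  Wrapper(const Wrapper&) = delete;
  Wrapper& operator=(const Wrapper&) = delete;

  Wrapper(Wrapper&& other) : handle_(std::move(other.handle_)) {
    other.invalidate();  // single place that defines the moved-from state
  }

  Wrapper& operator=(Wrapper&& other) {
    if (&other != this) {
      handle_ = std::move(other.handle_);
      other.invalidate();
    }
    return *this;
  }

  explicit operator bool() const { return kNullHandle != handle_; }

 private:
  void invalidate() { handle_ = kNullHandle; }
  Handle handle_;
};

int main() {
  Wrapper a{reinterpret_cast<Handle>(0x1)};
  Wrapper b{std::move(a)};
  return (!a && b) ? 0 : 1;  // a is hollow, b owns the handle
}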

View File

@ -73,6 +73,9 @@ struct Descriptor final {
VkDescriptorSet handle() const;
private:
void invalidate();
private:
struct Item final {
uint32_t binding;
@ -113,6 +116,9 @@ struct Descriptor final {
Set allocate(const Shader::Layout::Object& shader_layout);
void purge();
private:
void invalidate();
private:
struct Configuration final {
static constexpr uint32_t kQuantum = 16u;
@ -137,33 +143,6 @@ struct Descriptor final {
}
};
//
// Impl
//
inline Descriptor::Set::Set(Set&& set)
: device_(std::move(set.device_)),
descriptor_set_(std::move(set.descriptor_set_)),
shader_layout_signature_(std::move(set.shader_layout_signature_)),
bindings_(std::move(set.bindings_)) {
set.device_ = VK_NULL_HANDLE;
set.descriptor_set_ = VK_NULL_HANDLE;
}
inline Descriptor::Set& Descriptor::Set::operator=(Set&& set) {
if (&set != this) {
device_ = std::move(set.device_);
descriptor_set_ = std::move(set.descriptor_set_);
shader_layout_signature_ = std::move(set.shader_layout_signature_);
bindings_ = std::move(set.bindings_);
set.device_ = VK_NULL_HANDLE;
set.descriptor_set_ = VK_NULL_HANDLE;
};
return *this;
}
} // namespace api
} // namespace vulkan
} // namespace native

View File

@ -169,6 +169,10 @@ Pipeline::Cache::Cache(Factory factory)
: cache_(std::move(factory)) {
}
void Pipeline::Cache::purge() {
cache_.purge();
}
} // namespace api
} // namespace vulkan
} // namespace native

View File

@ -196,7 +196,11 @@ inline Pipeline::Barrier::operator bool() const {
inline bool operator==(
const Pipeline::Layout::Descriptor& _1,
const Pipeline::Layout::Descriptor& _2) {
return (_1.descriptor_set_layout == _2.descriptor_set_layout);
static_assert(
std::is_trivially_copyable<Pipeline::Layout::Descriptor>::value,
"This implementation is no longer valid!");
return (0 == memcmp(&_1, &_2, sizeof(Pipeline::Layout::Descriptor)));
}
inline size_t Pipeline::Layout::Factory::Hasher::operator()(
@ -207,9 +211,11 @@ inline size_t Pipeline::Layout::Factory::Hasher::operator()(
inline bool operator==(
const Pipeline::Descriptor& _1,
const Pipeline::Descriptor& _2) {
return (_1.pipeline_layout == _2.pipeline_layout) &&
(_1.shader_module == _2.shader_module) &&
(_1.local_work_group == _2.local_work_group);
static_assert(
std::is_trivially_copyable<Pipeline::Descriptor>::value,
"This implementation is no longer valid!");
return (0 == memcmp(&_1, &_2, sizeof(Pipeline::Descriptor)));
}
inline size_t Pipeline::Factory::Hasher::operator()(
@ -236,10 +242,6 @@ inline Pipeline::Object Pipeline::Cache::retrieve(
};
}
inline void Pipeline::Cache::purge() {
cache_.purge();
}
} // namespace api
} // namespace vulkan
} // namespace native
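
The equality operators in this header (and in Resource.h and Shader.h below) switch from member-wise comparison to a memcmp over the whole struct, guarded by a static_assert so the shortcut gets revisited if the type ever stops being trivially copyable. A standalone sketch of the pattern with an illustrative, padding-free Key type:

// memcmp-based equality for a trivially copyable cache key.
#include <cstring>
#include <type_traits>

struct Key final {
  int width;
  int height;
  int depth;
};

inline bool operator==(const Key& a, const Key& b) {
  static_assert(
      std::is_trivially_copyable<Key>::value,
      "This implementation is no longer valid!");
  return 0 == std::memcmp(&a, &b, sizeof(Key));
}

int main() {
  const Key a{8, 8, 4};
  const Key b{8, 8, 4};
  return (a == b) ? 0 : 1;
}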

View File

@ -192,6 +192,11 @@ VkFence Resource::Fence::handle(const bool add_to_waitlist) const {
"Invalid Vulkan fence!");
const VkFence fence = pool->fence_.pool[id].get();
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
fence,
"Invalid Vulkan fence!");
if (add_to_waitlist) {
pool->fence_.waitlist.push_back(fence);
}
@ -360,14 +365,13 @@ Resource::Pool::Pool(
: device_(gpu.device),
allocator_(
create_allocator(
gpu.adapter->runtime->instance(),
gpu.adapter->handle,
device_),
gpu.adapter->runtime->instance(),
gpu.adapter->handle,
device_),
vmaDestroyAllocator),
memory_{
std::move(policy),
},
buffer_{},
image_{
.sampler = Image::Sampler{gpu},
},
@ -377,6 +381,31 @@ Resource::Pool::Pool(
fence_.pool.reserve(Configuration::kReserve);
}
Resource::Pool::Pool(Pool&& pool)
: device_(std::move(pool.device_)),
allocator_(std::move(pool.allocator_)),
memory_(std::move(pool.memory_)),
buffer_(std::move(pool.buffer_)),
image_(std::move(pool.image_)),
fence_(std::move(pool.fence_)) {
pool.invalidate();
}
Resource::Pool& Resource::Pool::operator=(Pool&& pool) {
if (&pool != this) {
device_ = std::move(pool.device_);
allocator_ = std::move(pool.allocator_);
memory_ = std::move(pool.memory_);
buffer_ = std::move(pool.buffer_);
image_ = std::move(pool.image_);
fence_ = std::move(pool.fence_);
pool.invalidate();
};
return *this;
}
Resource::Pool::~Pool() {
try {
if (device_ && allocator_) {
@ -394,31 +423,6 @@ Resource::Pool::~Pool() {
}
}
Resource::Pool::Pool(Pool&& pool)
: device_(std::move(pool.device_)),
allocator_(std::move(pool.allocator_)),
memory_(std::move(pool.memory_)),
buffer_(std::move(pool.buffer_)),
image_(std::move(pool.image_)),
fence_(std::move(pool.fence_)) {
pool.device_ = VK_NULL_HANDLE;
}
Resource::Pool& Resource::Pool::operator=(Pool&& pool) {
if (&pool != this) {
device_ = std::move(pool.device_);
allocator_ = std::move(pool.allocator_);
memory_ = std::move(pool.memory_);
buffer_ = std::move(pool.buffer_);
image_ = std::move(pool.image_);
fence_ = std::move(pool.fence_);
pool.device_ = VK_NULL_HANDLE;
};
return *this;
}
Resource::Buffer Resource::Pool::buffer(
const Buffer::Descriptor& descriptor) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
@ -678,6 +682,11 @@ void Resource::Pool::purge() {
buffer_.pool.clear();
}
void Resource::Pool::invalidate() {
device_ = VK_NULL_HANDLE;
allocator_.reset();
}
} // namespace api
} // namespace vulkan
} // namespace native

View File

@ -20,15 +20,6 @@ struct Resource final {
//
struct Memory final {
/*
Barrier
*/
struct Barrier final {
VkAccessFlags src;
VkAccessFlags dst;
};
/*
Descriptor
*/
@ -39,8 +30,18 @@ struct Resource final {
VkMemoryPropertyFlags /* optional */ preferred;
};
VmaAllocator allocator;
VmaAllocation allocation;
/*
Barrier
*/
struct Barrier final {
VkAccessFlags src;
VkAccessFlags dst;
};
/*
Access
*/
struct Access final {
typedef uint8_t Flags;
@ -74,6 +75,9 @@ struct Resource final {
typename Pointer = Access::Pointer<Type, kAccess>>
Handle<Pointer> map() &;
VmaAllocator allocator;
VmaAllocation allocation;
private:
// Intentionally disabled to ensure memory access is always properly
// encapsulated in a scoped map-unmap region. Allowing below overloads
@ -299,6 +303,8 @@ struct Resource final {
private:
friend struct Fence;
void invalidate();
private:
struct Configuration final {
static constexpr uint32_t kReserve = 256u;
@ -353,7 +359,8 @@ class Resource::Memory::Scope final {
template<typename, typename Pointer>
inline Resource::Memory::Handle<Pointer> Resource::Memory::map() const & {
void* map(const Memory& memory, Access::Flags);
// Forward declaration
void* map(const Memory&, Access::Flags);
return Handle<Pointer>{
reinterpret_cast<Pointer>(map(*this, Access::Read)),
@ -363,7 +370,8 @@ inline Resource::Memory::Handle<Pointer> Resource::Memory::map() const & {
template<typename, Resource::Memory::Access::Flags kAccess, typename Pointer>
inline Resource::Memory::Handle<Pointer> Resource::Memory::map() & {
void* map(const Memory& memory, Access::Flags);
// Forward declaration
void* map(const Memory&, Access::Flags);
static_assert(
(kAccess == Access::Read) ||
@ -388,10 +396,11 @@ inline Resource::Buffer::operator bool() const {
inline bool operator==(
const Resource::Image::Sampler::Descriptor& _1,
const Resource::Image::Sampler::Descriptor& _2) {
return (_1.filter == _2.filter) &&
(_1.mipmap_mode == _2.mipmap_mode) &&
(_1.address_mode == _2.address_mode) &&
(_1.border == _2.border);
static_assert(
std::is_trivially_copyable<Resource::Image::Sampler::Descriptor>::value,
"This implementation is no longer valid!");
return (0 == memcmp(&_1, &_2, sizeof(Resource::Image::Sampler::Descriptor)));
}
inline size_t Resource::Image::Sampler::Factory::Hasher::operator()(

View File

@ -86,7 +86,9 @@ VkInstance create_instance(const Runtime::Type type) {
nullptr, &instance_extension_count, instance_extension_properties.data()));
constexpr const char* const requested_instance_extensions[]{
#ifdef VK_EXT_debug_report
VK_EXT_DEBUG_REPORT_EXTENSION_NAME,
#endif
};
for (const auto& requested_instance_extension : requested_instance_extensions) {

View File

@ -33,10 +33,7 @@ class Runtime final {
Runtime& operator=(Runtime&&) = default;
~Runtime() = default;
inline VkInstance instance() const {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(instance_);
return instance_.get();
}
VkInstance instance() const;
typedef std::function<bool (const Adapter&)> Selector;
Adapter select(const Selector& selector);
@ -59,6 +56,15 @@ class Runtime final {
Runtime* runtime();
//
// Impl
//
inline VkInstance Runtime::instance() const {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(instance_);
return instance_.get();
}
} // namespace api
} // namespace vulkan
} // namespace native

View File

@ -60,6 +60,10 @@ Shader::Layout::Cache::Cache(Factory factory)
: cache_(std::move(factory)) {
}
void Shader::Layout::Cache::purge() {
cache_.purge();
}
#ifdef USE_VULKAN_SHADERC_RUNTIME
struct Shader::Factory::Compiler final {

View File

@ -218,16 +218,14 @@ inline Shader::Layout::Object Shader::Layout::Cache::retrieve(
};
}
inline void Shader::Layout::Cache::purge() {
cache_.purge();
}
inline bool operator==(
const Shader::WorkGroup& _1,
const Shader::WorkGroup& _2) {
return (_1.data[0u] == _2.data[0u]) &&
(_1.data[1u] == _2.data[1u]) &&
(_1.data[2u] == _2.data[2u]);
static_assert(
std::is_trivially_copyable<Shader::WorkGroup>::value,
"This implementation is no longer valid!");
return (0 == memcmp(&_1, &_2, sizeof(Shader::WorkGroup)));
}
inline Shader::Descriptor::Descriptor(const char* const glsl)
@ -258,12 +256,10 @@ inline bool operator==(
const Shader::Descriptor& _1,
const Shader::Descriptor& _2) {
static_assert(
sizeof(Shader::Descriptor::shader.source) == sizeof(Shader::Descriptor::shader.binary),
"This implementation requires sizeof(Source) to be equal to sizeof(Binary).");
std::is_trivially_copyable<Shader::Descriptor>::value,
"This implementation is no longer valid!");
return (_1.type == _2.type) &&
(_1.shader.binary.spirv == _2.shader.binary.spirv) &&
(_1.shader.binary.size == _2.shader.binary.size);
return (0 == memcmp(&_1, &_2, sizeof(Shader::Descriptor)));
}
inline size_t Shader::Factory::Hasher::operator()(
@ -286,11 +282,11 @@ inline size_t Shader::Factory::Hasher::operator()(
inline bool operator==(
const VkDescriptorSetLayoutBinding& _1,
const VkDescriptorSetLayoutBinding& _2) {
return (_1.binding == _2.binding) &&
(_1.descriptorType == _2.descriptorType) &&
(_1.descriptorCount == _2.descriptorCount) &&
(_1.stageFlags == _2.stageFlags) &&
(_1.pImmutableSamplers == _2.pImmutableSamplers);
static_assert(
std::is_trivially_copyable<VkDescriptorSetLayoutBinding>::value,
"This implementation is no longer valid!");
return (0 == memcmp(&_1, &_2, sizeof(VkDescriptorSetLayoutBinding)));
}
#endif /* USE_VULKAN_API */

View File

@ -24,11 +24,11 @@ Tensor add_scalar(
v_self.options(),
};
api::Command::Buffer command_buffer = context->command().pool.allocate();
command_buffer.begin();
api::Command::Pool& command_pool = context->command().pool;
api::Command::Buffer& command_buffer = command_pool.stream();
{
if (v_output.has_image() && v_self.has_image()) {
const struct {
if C10_LIKELY(v_output.has_image() && v_self.has_image()) {
const struct Block final {
uvec3 extents;
float other;
} block {
@ -64,8 +64,7 @@ Tensor add_scalar(
TORCH_CHECK(false, "Not implemented!");
}
}
command_buffer.end();
command_buffer.submit(context->gpu().queue);
command_pool.submit(context->gpu().queue, command_buffer);
return convert(v_output);
}
@ -82,11 +81,11 @@ Tensor& add_scalar_(
vTensor& v_self = convert(self);
api::Command::Buffer command_buffer = context->command().pool.allocate();
command_buffer.begin();
api::Command::Pool& command_pool = context->command().pool;
api::Command::Buffer& command_buffer = command_pool.stream();
{
if (v_self.has_image()) {
const struct {
if C10_LIKELY(v_self.has_image()) {
const struct Block final {
uvec3 extents;
float other;
} block {
@ -116,8 +115,7 @@ Tensor& add_scalar_(
TORCH_CHECK(false, "Not implemented!");
}
}
command_buffer.end();
command_buffer.submit(context->gpu().queue);
command_pool.submit(context->gpu().queue, command_buffer);
return self;
}
@ -140,11 +138,11 @@ Tensor add_tensor(
v_self.options(),
};
api::Command::Buffer command_buffer = context->command().pool.allocate();
command_buffer.begin();
api::Command::Pool& command_pool = context->command().pool;
api::Command::Buffer& command_buffer = command_pool.stream();
{
if (v_self.has_image() && v_other.has_image()) {
const struct {
if C10_LIKELY(v_self.has_image() && v_other.has_image()) {
const struct Block final {
uvec3 extents;
float alpha;
} block {
@ -186,8 +184,7 @@ Tensor add_tensor(
TORCH_CHECK(false, "Not implemented!");
}
}
command_buffer.end();
command_buffer.submit(context->gpu().queue);
command_pool.submit(context->gpu().queue, command_buffer);
return convert(v_output);
}
@ -207,11 +204,11 @@ Tensor& add_tensor_(
const Tensor other = other_arg.is_vulkan() ? other_arg : other_arg.vulkan();
const vTensor& v_other = convert(other);
api::Command::Buffer command_buffer = context->command().pool.allocate();
command_buffer.begin();
api::Command::Pool& command_pool = context->command().pool;
api::Command::Buffer& command_buffer = command_pool.stream();
{
if (v_self.has_image() && v_other.has_image() && !self.is_same(other)) {
const struct {
if C10_LIKELY(v_self.has_image() && v_other.has_image() && !self.is_same(other)) {
const struct Block final {
uvec3 extents;
float alpha;
} block {
@ -247,8 +244,7 @@ Tensor& add_tensor_(
TORCH_CHECK(false, "Not implemented!");
}
}
command_buffer.end();
command_buffer.submit(context->gpu().queue);
command_pool.submit(context->gpu().queue, command_buffer);
return self;
}

View File

@ -28,11 +28,11 @@ Tensor clamp(
v_self.options(),
};
api::Command::Buffer command_buffer = context->command().pool.allocate();
command_buffer.begin();
api::Command::Pool& command_pool = context->command().pool;
api::Command::Buffer& command_buffer = command_pool.stream();
{
if (v_output.has_image() && v_self.has_image()) {
const struct {
if C10_LIKELY(v_output.has_image() && v_self.has_image()) {
const struct Block final {
uvec3 extents;
uint32_t _;
vec2 clamp;
@ -73,8 +73,7 @@ Tensor clamp(
TORCH_CHECK(false, "Not implemented!");
}
}
command_buffer.end();
command_buffer.submit(context->gpu().queue);
command_pool.submit(context->gpu().queue, command_buffer);
return convert(v_output);
}
@ -95,11 +94,11 @@ Tensor& clamp_(
vTensor& v_self = convert(self);
api::Command::Buffer command_buffer = context->command().pool.allocate();
command_buffer.begin();
api::Command::Pool& command_pool = context->command().pool;
api::Command::Buffer& command_buffer = command_pool.stream();
{
if (v_self.has_image()) {
const struct {
if C10_LIKELY(v_self.has_image()) {
const struct Block final {
uvec3 extents;
uint32_t _;
vec2 clamp;
@ -134,8 +133,7 @@ Tensor& clamp_(
TORCH_CHECK(false, "Not implemented!");
}
}
command_buffer.end();
command_buffer.submit(context->gpu().queue);
command_pool.submit(context->gpu().queue, command_buffer);
return self;
}

View File

@ -35,14 +35,6 @@ struct Layout final {
};
};
struct Experimentation {
static constexpr bool kUseConv2dOldApi = false;
};
struct ConvPrepackLimits final {
static constexpr int64_t maxStackDepth = 2048*4;
};
} // namespace ops
} // namespace vulkan
} // namespace native

View File

@ -1,8 +1,8 @@
#include <ATen/native/vulkan/ops/Convolution.h>
#include <ATen/native/vulkan/api/Utils.h>
#include <ATen/native/ConvUtils.h>
#include <ATen/native/utils/ParamUtils.h>
#include <ATen/native/vulkan/ops/Persistent.h>
#include <ATen/native/vulkan/api/Utils.h>
namespace at {
namespace native {
@ -12,6 +12,10 @@ namespace {
using namespace api::utils;
struct Experimentation final {
static constexpr bool kUseConv2dOldApi = false;
};
inline bool is_depthwise(
const IntArrayRef filter,
const int64_t groups) {
@ -26,47 +30,103 @@ inline bool is_pointwise(const IntArrayRef filter) {
}
vTensor pack_weights_dw(
api::Context* const context,
api::Command::Buffer& command_buffer,
api::Resource::Pool& pool,
const Tensor& weight_arg,
const int64_t groups) {
if (weight_arg.is_vulkan()) {
return convert(weight_arg);
}
const Tensor& weight) {
/* Source */
const Tensor weight = weight_arg.contiguous();
const IntArrayRef src_filter = weight.sizes();
const float* const src_weight_ptr = weight.data_ptr<float>();
const int64_t src_kw_sz = src_filter[Layout::Filter::width];
const int64_t src_kh_sz = src_filter[Layout::Filter::height];
const int64_t src_kernel_sz = src_kw_sz * src_kh_sz;
const int64_t src_block_sz = src_kernel_sz * src_filter[Layout::Filter::input];
const int64_t num_stacks = div_up(src_filter[Layout::Filter::output], INT64_C(4));
/* Destination */
const int64_t dst_kw_sz = src_kernel_sz;
const int64_t dst_kh_sz = num_stacks;
const int64_t dst_kernel_sz = dst_kw_sz * dst_kh_sz;
vTensor v_weight{
api::context(),
context,
&pool,
{
4,
num_stacks,
src_kw_sz * src_kh_sz,
dst_kh_sz,
dst_kw_sz,
},
weight.options(),
};
using Future = vTensor::Future<float, vTensor::Access::Write>;
Future v_weight_future = v_weight.host<float, vTensor::Access::Write>();
Future v_weight_future = v_weight.host<float, vTensor::Access::Write>(command_buffer);
Future::Payload v_weight_payload = v_weight_future.wait();
float* const dst_weight_ptr = v_weight_payload.get();
memset(dst_weight_ptr, 0, v_weight.nbytes());
for (int64_t src_oc = 0; src_oc < src_filter[Layout::Filter::output]; ++src_oc) {
/* Source */
const float* const src_weight_oc_ptr = src_weight_ptr + src_oc * src_block_sz;
/* Destination */
const int64_t dst_oh = src_oc / 4;
const int64_t dst_c = src_oc % 4;
float* const dst_weight_c_ptr = dst_weight_ptr +
dst_c * dst_kernel_sz +
dst_oh * dst_kw_sz;
for (int64_t src_ih = 0; src_ih < src_filter[Layout::Filter::height]; ++src_ih) {
memcpy(
dst_weight_c_ptr + src_ih * src_kw_sz,
src_weight_oc_ptr + src_ih * src_kw_sz,
sizeof(float) * src_kw_sz);
}
}
return v_weight;
}
vTensor pack_weights_2d(
api::Context* const context,
api::Command::Buffer& command_buffer,
api::Resource::Pool& pool,
const Tensor& weight) {
/* Source */
const IntArrayRef src_filter = weight.sizes();
const float* const src_weight_ptr = weight.data_ptr<float>();
const int64_t src_kw_sz = src_filter[Layout::Filter::width];
const int64_t src_kh_sz = src_filter[Layout::Filter::height];
const int64_t src_kernel_sz = src_kw_sz * src_kh_sz;
const int64_t src_block_sz =
src_kernel_sz * src_filter[Layout::Filter::input];
const int64_t src_block_sz = src_kernel_sz * src_filter[Layout::Filter::input];
const int64_t num_stacks = div_up(src_filter[Layout::Filter::output], INT64_C(4));
const int64_t stack_depth = api::utils::align_up(src_filter[Layout::Filter::input], INT64_C(4));
/* Destination */
const int64_t dst_kw_sz = src_kw_sz * src_kh_sz;
const int64_t dst_kh_sz = num_stacks;
const int64_t dst_kw_sz = src_kw_sz * stack_depth;
const int64_t dst_kh_sz = src_kh_sz * num_stacks;
const int64_t dst_kernel_sz = dst_kw_sz * dst_kh_sz;
vTensor v_weight{
context,
&pool,
{
4,
dst_kh_sz,
dst_kw_sz,
},
weight.options(),
};
using Future = vTensor::Future<float, vTensor::Access::Write>;
Future v_weight_future = v_weight.host<float, vTensor::Access::Write>(command_buffer);
Future::Payload v_weight_payload = v_weight_future.wait();
float* const dst_weight_ptr = v_weight_payload.get();
memset(dst_weight_ptr, 0, v_weight.nbytes());
@ -80,26 +140,29 @@ vTensor pack_weights_dw(
float* const dst_weight_c_ptr = dst_weight_ptr + dst_c * dst_kernel_sz;
for (int64_t src_ih = 0; src_ih < src_filter[Layout::Filter::height]; ++src_ih) {
memcpy(
dst_weight_c_ptr + dst_oh * dst_kw_sz + src_ih * src_kw_sz,
src_weight_oc_ptr + src_ih * src_kw_sz,
sizeof(float) * src_kw_sz);
for (int64_t src_ic = 0; src_ic < src_filter[Layout::Filter::input]; ++src_ic) {
const int64_t dst_ic4 = src_ic / 4;
for (int64_t src_ih = 0; src_ih < src_kh_sz; ++src_ih) {
for (int64_t src_iw = 0; src_iw < src_kw_sz; ++src_iw) {
memcpy(
dst_weight_c_ptr + (dst_oh * src_kh_sz + src_ih) * dst_kw_sz +
dst_ic4 * src_kw_sz * 4 + src_iw * 4 + src_ic % 4,
src_weight_oc_ptr + src_ic * src_kernel_sz + src_ih * src_kw_sz + src_iw,
sizeof(float));
}
}
}
}
return v_weight;
}
vTensor pack_weights_old(
vTensor pack_weights_2d_old(
api::Context* const context,
api::Command::Buffer& command_buffer,
api::Resource::Pool& pool,
const Tensor& weight_arg,
const int64_t groups) {
if (weight_arg.is_vulkan()) {
return convert(weight_arg);
}
const Tensor weight = weight_arg.contiguous();
const Tensor& weight) {
const IntArrayRef src_filter = weight.sizes();
const float* const src_weight_ptr = weight.data_ptr<float>();
@ -111,7 +174,7 @@ vTensor pack_weights_old(
const uint32_t KW = src_filter[Layout::Filter::width];
vTensor v_weight{
api::context(),
context,
&pool,
{
1,
@ -123,13 +186,13 @@ vTensor pack_weights_old(
};
using Future = vTensor::Future<float, vTensor::Access::Write>;
Future v_weight_future = v_weight.host<float, vTensor::Access::Write>();
Future v_weight_future = v_weight.host<float, vTensor::Access::Write>(command_buffer);
Future::Payload v_weight_payload = v_weight_future.wait();
float* const dst_weight_ptr = v_weight_payload.get();
memset(dst_weight_ptr, 0, v_weight.nbytes());
const float* src = src_weight_ptr;
const float* const src = src_weight_ptr;
float* const dst = dst_weight_ptr;
{
@ -162,7 +225,7 @@ vTensor pack_weights_old(
dim0_ = dim0;
dim1_ = dim1;
dim2_ = dim2;
data_ = new float[dim0 * dim1 * dim2 * 4];
data_ = new float[dim0 * dim1 * dim2 * 4]; // TODO: memory leak
memset(data_, 0.f, dim0 * dim1 * dim2 * 4 * sizeof(float));
}
@ -211,7 +274,7 @@ vTensor pack_weights_old(
return v_weight;
}
vTensor pack_weights_2d(
vTensor pack_weights(
api::Resource::Pool& pool,
const Tensor& weight_arg,
const int64_t groups) {
@ -219,81 +282,32 @@ vTensor pack_weights_2d(
return convert(weight_arg);
}
api::Context* const context = api::context();
api::Command::Buffer& command_buffer = context->command().pool.stream();
const Tensor weight = weight_arg.contiguous();
const IntArrayRef src_filter = weight.sizes();
const float* const src_weight_ptr = weight.data_ptr<float>();
const int64_t src_kw_sz = src_filter[Layout::Filter::width];
const int64_t src_kh_sz = src_filter[Layout::Filter::height];
const int64_t num_stacks = div_up(src_filter[Layout::Filter::output], INT64_C(4));
const int64_t stack_depth = api::utils::align_up(src_filter[Layout::Filter::input], INT64_C(4));
vTensor v_weight{
api::context(),
&pool,
{
4,
src_kh_sz * num_stacks,
src_kw_sz * stack_depth,
},
weight.options(),
};
using Future = vTensor::Future<float, vTensor::Access::Write>;
Future v_weight_future = v_weight.host<float, vTensor::Access::Write>();
Future::Payload v_weight_payload = v_weight_future.wait();
/* Source */
const int64_t src_kernel_sz = src_kw_sz * src_kh_sz;
const int64_t src_block_sz =
src_kernel_sz * src_filter[Layout::Filter::input];
/* Destination */
const int64_t dst_kw_sz = src_kw_sz * stack_depth;
const int64_t dst_kh_sz = src_kh_sz * num_stacks;
const int64_t dst_kernel_sz = dst_kw_sz * dst_kh_sz;
float* const dst_weight_ptr = v_weight_payload.get();
memset(dst_weight_ptr, 0, v_weight.nbytes());
for (int64_t src_oc = 0; src_oc < src_filter[Layout::Filter::output]; ++src_oc) {
/* Source */
const float* const src_weight_oc_ptr = src_weight_ptr + src_oc * src_block_sz;
/* Destination */
const int64_t dst_oh = src_oc / 4;
const int64_t dst_c = src_oc % 4;
float* const dst_weight_c_ptr = dst_weight_ptr + dst_c * dst_kernel_sz;
for (int64_t src_ic = 0; src_ic < src_filter[Layout::Filter::input]; ++src_ic) {
const int64_t dst_ic4 = src_ic/4;
for (int64_t src_ih = 0; src_ih < src_kh_sz; ++src_ih) {
for (int64_t src_iw = 0; src_iw < src_kw_sz; ++src_iw) {
memcpy(
dst_weight_c_ptr + (dst_oh * src_kh_sz + src_ih) * dst_kw_sz +
dst_ic4 * src_kw_sz * 4 + src_iw * 4 + src_ic % 4,
src_weight_oc_ptr + src_ic * src_kernel_sz + src_ih * src_kw_sz + src_iw,
sizeof(float));
}
}
}
}
return v_weight;
}
vTensor pack_weights(
api::Resource::Pool& pool,
const Tensor& weight_arg,
const int64_t groups) {
if (is_depthwise(weight_arg.sizes(), groups)) {
return pack_weights_dw(pool, weight_arg, groups);
if (is_depthwise(weight.sizes(), groups)) {
return pack_weights_dw(
context,
command_buffer,
pool,
weight);
}
if (Experimentation::kUseConv2dOldApi) {
return pack_weights_old(pool, weight_arg, groups);
return pack_weights_2d_old(
context,
command_buffer,
pool,
weight);
}
return pack_weights_2d(pool, weight_arg, groups);
return pack_weights_2d(
context,
command_buffer,
pool,
weight);
}
vTensor pack_biases(
@ -304,8 +318,11 @@ vTensor pack_biases(
return convert(*bias);
}
api::Context* const context = api::context();
api::Command::Buffer& command_buffer = context->command().pool.stream();
vTensor v_bias{
api::context(),
context,
&pool,
{
// 1D
@ -316,7 +333,7 @@ vTensor pack_biases(
{
using Future = vTensor::Future<void, vTensor::Access::Write>;
Future v_bias_future = v_bias.host<void, vTensor::Access::Write>();
Future v_bias_future = v_bias.host<void, vTensor::Access::Write>(command_buffer);
Future::Payload v_bias_payload = v_bias_future.wait();
if (bias) {
@ -394,7 +411,8 @@ bool available(
(c10::DeviceType::Vulkan == bias->device().type())) &&
(kFloat == bias->scalar_type()) &&
(transposed ? false /* to be added in the future */
: (weight.size(Layout::Filter::output) == bias->size(Layout::Filter::output))))
: (weight.size(Layout::Filter::output) ==
bias->size(Layout::Filter::output))))
: true) &&
// Stride
(stride[Layout::Parameter::height] > 0) &&
@ -432,7 +450,7 @@ bool usable(const Tensor& input) {
true;
}
void conv2d_depthwise(
void conv2d_dw(
api::Context* const context,
api::Command::Buffer& command_buffer,
vTensor& v_output,
@ -446,27 +464,39 @@ void conv2d_depthwise(
const IntArrayRef dilation,
const float output_min,
const float output_max) {
if (v_output.has_image() && v_input.has_image() && v_weight.has_image()) {
const struct {
int32_t kernel_x, kernel_y;
int32_t stride_x, stride_y;
int32_t padding_x, padding_y;
int32_t dilate_x, dilate_y;
float clamp_x, clamp_y;
int32_t src_filter_w, src_filter_h;
if C10_LIKELY(v_output.has_image() && v_input.has_image() && v_weight.has_image()) {
const struct Block final {
ivec2 kernel;
ivec2 stride;
ivec2 padding;
ivec2 dilate;
vec2 clamp;
ivec2 src_filter;
} block {
safe_downcast<int32_t>(filter[Layout::Filter::width]),
safe_downcast<int32_t>(filter[Layout::Filter::height]),
safe_downcast<int32_t>(stride[Layout::Parameter::width]),
safe_downcast<int32_t>(stride[Layout::Parameter::height]),
safe_downcast<int32_t>(padding[Layout::Parameter::width]),
safe_downcast<int32_t>(padding[Layout::Parameter::height]),
safe_downcast<int32_t>(dilation[Layout::Parameter::width]),
safe_downcast<int32_t>(dilation[Layout::Parameter::height]),
output_min,
output_max,
safe_downcast<int32_t>(src_filter[Layout::Filter::width]),
safe_downcast<int32_t>(src_filter[Layout::Filter::height]),
{
safe_downcast<int32_t>(filter[Layout::Filter::width]),
safe_downcast<int32_t>(filter[Layout::Filter::height]),
},
{
safe_downcast<int32_t>(stride[Layout::Parameter::width]),
safe_downcast<int32_t>(stride[Layout::Parameter::height]),
},
{
safe_downcast<int32_t>(padding[Layout::Parameter::width]),
safe_downcast<int32_t>(padding[Layout::Parameter::height]),
},
{
safe_downcast<int32_t>(dilation[Layout::Parameter::width]),
safe_downcast<int32_t>(dilation[Layout::Parameter::height]),
},
{
output_min,
output_max,
},
{
safe_downcast<int32_t>(src_filter[Layout::Filter::width]),
safe_downcast<int32_t>(src_filter[Layout::Filter::height]),
},
};
context->dispatch(
@ -510,7 +540,7 @@ void conv2d_depthwise(
}
}
void conv2d_pointwise(
void conv2d_pw(
api::Context* const context,
api::Command::Buffer& command_buffer,
vTensor& v_output,
@ -522,22 +552,29 @@ void conv2d_pointwise(
const IntArrayRef padding,
const float output_min,
const float output_max) {
if (v_output.has_image() && v_input.has_image() && v_weight.has_image()) {
const struct {
int32_t kernel_ic, kernel_oc;
int32_t stride_x, stride_y;
int32_t padding_x, padding_y;
float clamp_x, clamp_y;
if C10_LIKELY(v_output.has_image() && v_input.has_image() && v_weight.has_image()) {
const struct Block final {
ivec2 kernel;
ivec2 stride;
ivec2 padding;
vec2 clamp;
} block {
safe_downcast<int32_t>(filter[Layout::Filter::input]),
safe_downcast<int32_t>(filter[Layout::Filter::output]),
safe_downcast<int32_t>(stride[Layout::Parameter::width]),
safe_downcast<int32_t>(stride[Layout::Parameter::height]),
safe_downcast<int32_t>(padding[Layout::Parameter::width]),
safe_downcast<int32_t>(padding[Layout::Parameter::height]),
output_min,
output_max,
{
safe_downcast<int32_t>(filter[Layout::Filter::input]),
safe_downcast<int32_t>(filter[Layout::Filter::output]),
},
{
safe_downcast<int32_t>(stride[Layout::Parameter::width]),
safe_downcast<int32_t>(stride[Layout::Parameter::height]),
},
{
safe_downcast<int32_t>(padding[Layout::Parameter::width]),
safe_downcast<int32_t>(padding[Layout::Parameter::height]),
},
{
output_min,
output_max,
},
};
context->dispatch(
@ -595,30 +632,43 @@ void conv2d(
const IntArrayRef dilation,
const float output_min,
const float output_max) {
if (v_output.has_image() && v_input.has_image() && v_weight.has_image()) {
const struct {
int32_t kernel_x, kernel_y, kernel_ic, kernel_oc;
int32_t stride_x, stride_y;
int32_t padding_x, padding_y;
int32_t dilate_x, dilate_y;
float clamp_x, clamp_y;
int32_t src_filter_w, src_filter_h, src_filter_w4;
if C10_LIKELY(v_output.has_image() && v_input.has_image() && v_weight.has_image()) {
const struct Block final {
ivec4 kernel;
ivec2 stride;
ivec2 padding;
ivec2 dilate;
vec2 clamp;
ivec4 src_filter;
} block {
safe_downcast<int32_t>(filter[Layout::Filter::width]),
safe_downcast<int32_t>(filter[Layout::Filter::height]),
safe_downcast<int32_t>(filter[Layout::Filter::input]),
safe_downcast<int32_t>(filter[Layout::Filter::output]),
safe_downcast<int32_t>(stride[Layout::Parameter::width]),
safe_downcast<int32_t>(stride[Layout::Parameter::height]),
safe_downcast<int32_t>(padding[Layout::Parameter::width]),
safe_downcast<int32_t>(padding[Layout::Parameter::height]),
safe_downcast<int32_t>(dilation[Layout::Parameter::width]),
safe_downcast<int32_t>(dilation[Layout::Parameter::height]),
output_min,
output_max,
safe_downcast<int32_t>(src_filter[Layout::Filter::width]),
safe_downcast<int32_t>(src_filter[Layout::Filter::height]),
safe_downcast<int32_t>(src_filter[Layout::Filter::width]*4),
{
safe_downcast<int32_t>(filter[Layout::Filter::width]),
safe_downcast<int32_t>(filter[Layout::Filter::height]),
safe_downcast<int32_t>(filter[Layout::Filter::input]),
safe_downcast<int32_t>(filter[Layout::Filter::output]),
},
{
safe_downcast<int32_t>(stride[Layout::Parameter::width]),
safe_downcast<int32_t>(stride[Layout::Parameter::height]),
},
{
safe_downcast<int32_t>(padding[Layout::Parameter::width]),
safe_downcast<int32_t>(padding[Layout::Parameter::height]),
},
{
safe_downcast<int32_t>(dilation[Layout::Parameter::width]),
safe_downcast<int32_t>(dilation[Layout::Parameter::height]),
},
{
output_min,
output_max,
},
{
safe_downcast<int32_t>(src_filter[Layout::Filter::width]),
safe_downcast<int32_t>(src_filter[Layout::Filter::height]),
safe_downcast<int32_t>(src_filter[Layout::Filter::width] * 4),
0,
},
};
context->dispatch(
@ -662,6 +712,98 @@ void conv2d(
}
}
void conv2d_old(
api::Context* const context,
api::Command::Buffer& command_buffer,
vTensor& v_output,
const vTensor& v_input,
const vTensor& v_weight,
const vTensor& v_bias,
const IntArrayRef filter,
const IntArrayRef stride,
const IntArrayRef padding,
const IntArrayRef dilation,
const float output_min,
const float output_max) {
using namespace api::utils;
if (v_output.has_image() && v_input.has_image() && v_weight.has_image()) {
const int32_t W = v_input.extents().data[0];
const int32_t H = v_input.extents().data[1];
const int32_t C_4 = v_input.extents().data[2];
const int32_t C = 4 * C_4;
const int32_t OW = v_output.extents().data[0];
const int32_t OH = v_output.extents().data[1];
const int32_t OC_4 = v_output.extents().data[2];
const int32_t OC = 4 * OC_4;
const struct Block final {
int32_t padding_x, padding_y;
int32_t kernel_x, kernel_y;
int32_t stride_x, stride_y;
int32_t dilate_x, dilate_y;
int32_t outputSize[4];
int32_t inputSize[4];
float outputMin;
float outputMax;
} block {
safe_downcast<int32_t>(padding[Layout::Parameter::width]),
safe_downcast<int32_t>(padding[Layout::Parameter::height]),
safe_downcast<int32_t>(filter[Layout::Filter::width]),
safe_downcast<int32_t>(filter[Layout::Filter::height]),
safe_downcast<int32_t>(stride[Layout::Parameter::width]),
safe_downcast<int32_t>(stride[Layout::Parameter::height]),
safe_downcast<int32_t>(dilation[Layout::Parameter::width]),
safe_downcast<int32_t>(dilation[Layout::Parameter::height]),
{ OW, OH, OC_4, OC },
{ W, H, C_4, C },
output_min,
output_max,
};
context->dispatch(
command_buffer,
{
VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
},
VK_KERNEL(conv2d_nogroup_clamp),
//VK_KERNEL(conv2d_nogroup_clamp_1x),
v_output.extents(),
// Write-only access bypasses synchronization but inserts appropriate
// barriers if necessary.
v_output.image(
command_buffer,
vTensor::Stage::Compute,
vTensor::Access::Write),
// Read-only access is implied on const tensors and triggers an async
// synchronization if necessary.
v_input.image(
command_buffer,
vTensor::Stage::Compute),
// Read-only access is implied on const tensors and triggers an async
// synchronization if necessary.
v_weight.image(
command_buffer,
vTensor::Stage::Compute),
// Read-only access is implied on const tensors and triggers an async
// synchronization if necessary.
v_bias.buffer(
command_buffer,
vTensor::Stage::Compute),
// Object lifetime is managed by the resource pool.
// It is OK not to keep track of the handle.
context->resource().pool.uniform(block).object);
}
else {
TORCH_CHECK(false, "Not implemented!");
}
}
Tensor convolution(
const Tensor& input,
const Tensor& weight,
@ -781,99 +923,6 @@ Conv2dOpContext Conv2dOpContext::create(
};
}
void conv2d_old(
api::Context* const context,
api::Command::Buffer& command_buffer,
vTensor& v_output,
const vTensor& v_input,
const vTensor& v_weight,
const vTensor& v_bias,
const IntArrayRef filter,
const IntArrayRef stride,
const IntArrayRef padding,
const IntArrayRef dilation,
const float output_min,
const float output_max) {
using namespace api::utils;
if (v_output.has_image() && v_input.has_image() && v_weight.has_image()) {
const int32_t W = v_input.extents().data[0];
const int32_t H = v_input.extents().data[1];
const int32_t C_4 = v_input.extents().data[2];
const int32_t C = 4 * C_4;
const int32_t OW = v_output.extents().data[0];
const int32_t OH = v_output.extents().data[1];
const int32_t OC_4 = v_output.extents().data[2];
const int32_t OC = 4 * OC_4;
const struct {
int32_t padding_x, padding_y;
int32_t kernel_x, kernel_y;
int32_t stride_x, stride_y;
int32_t dilate_x, dilate_y;
int32_t outputSize[4];
int32_t inputSize[4];
float outputMin;
float outputMax;
} block {
safe_downcast<int32_t>(padding[Layout::Parameter::width]),
safe_downcast<int32_t>(padding[Layout::Parameter::height]),
safe_downcast<int32_t>(filter[Layout::Filter::width]),
safe_downcast<int32_t>(filter[Layout::Filter::height]),
safe_downcast<int32_t>(stride[Layout::Parameter::width]),
safe_downcast<int32_t>(stride[Layout::Parameter::height]),
safe_downcast<int32_t>(dilation[Layout::Parameter::width]),
safe_downcast<int32_t>(dilation[Layout::Parameter::height]),
{ OW, OH, OC_4, OC },
{ W, H, C_4, C },
output_min,
output_max,
};
context->dispatch(
command_buffer,
{
VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
},
VK_KERNEL(conv2d_nogroup_clamp),
//VK_KERNEL(conv2d_nogroup_clamp_1x),
v_output.extents(),
// Write-only access bypasses synchronization but inserts appropriate
// barriers if necessary.
v_output.image(
command_buffer,
vTensor::Stage::Compute,
vTensor::Access::Write),
// Read-only access is implied on const tensors and triggers an async
// synchronization if necessary.
v_input.image(
command_buffer,
vTensor::Stage::Compute),
// Read-only access is implied on const tensors and triggers an async
// synchronization if necessary.
v_weight.image(
command_buffer,
vTensor::Stage::Compute),
// Read-only access is implied on const tensors and triggers an async
// synchronization if necessary.
v_bias.buffer(
command_buffer,
vTensor::Stage::Compute),
// Object lifetime is managed by the resource pool.
// It is OK not to keep track of the handle.
context->resource().pool.uniform(block).object);
}
else {
TORCH_CHECK(false, "Not implemented!");
}
}
Tensor Conv2dOpContext::run(const Tensor& input_arg) const {
api::Context* const context = api::context();
@ -896,11 +945,11 @@ Tensor Conv2dOpContext::run(const Tensor& input_arg) const {
input.options(),
};
api::Command::Buffer command_buffer = context->command().pool.allocate();
command_buffer.begin();
api::Command::Pool& command_pool = context->command().pool;
api::Command::Buffer& command_buffer = command_pool.stream();
{
if (is_depthwise(unpacked_.filter, unpacked_.groups)) {
conv2d_depthwise(
conv2d_dw(
context,
command_buffer,
v_output,
@ -932,7 +981,7 @@ Tensor Conv2dOpContext::run(const Tensor& input_arg) const {
packed_.output_max);
} else {
if (is_pointwise(unpacked_.filter)) {
conv2d_pointwise(
conv2d_pw(
context,
command_buffer,
v_output,
@ -964,8 +1013,7 @@ Tensor Conv2dOpContext::run(const Tensor& input_arg) const {
}
}
}
command_buffer.end();
command_buffer.submit(context->gpu().queue);
command_pool.submit(context->gpu().queue, command_buffer);
return convert(v_output);
}
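
The repacking code above sizes its destination with div_up (output channels grouped into stacks of four) and align_up (input depth padded to a multiple of four). Both helpers live in api::utils; the bodies below are the conventional implementations and are offered only as a hedged, standalone sketch.

#include <cstdint>

// Assumed to match api::utils::div_up / align_up; treat these bodies as a
// sketch, not the canonical PyTorch definitions.
constexpr int64_t div_up(const int64_t numerator, const int64_t denominator) {
  return (numerator + denominator - 1) / denominator;
}

constexpr int64_t align_up(const int64_t value, const int64_t alignment) {
  return div_up(value, alignment) * alignment;
}

// E.g. 10 output channels -> 3 stacks of 4; 10 input channels -> depth 12.
static_assert(div_up(10, 4) == 3, "ten output channels need three stacks of four");
static_assert(align_up(10, 4) == 12, "input depth is padded to a multiple of four");

int main() {
  return 0;
}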

View File

@ -6,87 +6,96 @@ namespace vulkan {
namespace ops {
Tensor& copy_(Tensor& self, const Tensor& src) {
// X -> Vulkan
if (at::kVulkan == self.device().type()) {
vTensor& v_self = convert(self);
api::Context* const context = api::context();
// CPU -> Vulkan
if (at::kCPU == src.device().type()) {
// Requesting write-only host access to the tensor never triggers a sync
// as the contents will be overwritten regardless. Having said that,
// appropriate barriers are inserted automatically if WAR or WAW hazards
// are detected. Examples of such scenarios are when any of
// these async operations are ongoing in the background on 'self':
// - On discrete systems:
// * buffer-to-staging transfers
// * staging-to-buffer transfers
// - On UMA, the buffer is an alias for staging and accessible both on host
// and device. Consequently:
// * buffer-to-image NHWC -> NC4HW packing
// * image-to-buffer NC4HW -> NHWC unpacking
api::Command::Pool& command_pool = context->command().pool;
api::Command::Buffer& command_buffer = command_pool.stream();
{
// X -> Vulkan
if (at::kVulkan == self.device().type()) {
vTensor& v_self = convert(self);
using Future = vTensor::Future<void, vTensor::Access::Write>;
Future v_self_future = v_self.host<void, vTensor::Access::Write>();
// Vulkan -> Vulkan
if (at::kVulkan == src.device().type()) {
command_buffer.copy(
// - Read-only access is implied on const tensors. Memory barriers
// are automatically inserted if a RAW hazard is detected.
// - Recording any potential pending sync operations into the same
// command buffer prevents an expensive queue submission.
convert(src).buffer(
command_buffer,
vTensor::Stage::Transfer),
// - Write-only access never triggers a sync as the contents will be
// overwritten regardless. Having said that, appropriate barriers
// are inserted automatically if WAR or WAW hazards are detected.
// - Recording pending sync operations into the same command buffer
// prevents an expensive queue submission.
v_self.buffer(
command_buffer,
vTensor::Stage::Transfer,
vTensor::Access::Write));
// This wait() will be a no-op if no hazards are detected, including the
// obvious, yet important, special case of 'self' being an empty tensor.
command_pool.submit(context->gpu().queue, command_buffer);
}
// CPU -> Vulkan
else {
const Tensor cpu_src = src.device().is_cpu() ? src : src.cpu();
Future::Payload v_self_payload = v_self_future.wait();
// Requesting write-only host access to the tensor never triggers a sync
// as the contents will be overwritten regardless. Having said that,
// appropriate barriers are inserted automatically if WAR or WAW hazards
// are detected. Such a scenario arises, for instance, if any of
// these async operations are ongoing in the background on 'self':
// - On discrete systems:
// * buffer-to-staging transfers
// * staging-to-buffer transfers
// - On UMA, the buffer is an alias for staging and is accessible on both
// host and device. Consequently:
// * buffer-to-image NHWC -> NC4HW packing
// * image-to-buffer NC4HW -> NHWC unpacking
memcpy(
v_self_payload.get(),
src.contiguous().data_ptr<float>(),
std::min(src.nbytes(), self.nbytes()));
using Future = vTensor::Future<void, vTensor::Access::Write>;
Future v_self_future = v_self.host<void, vTensor::Access::Write>(command_buffer);
// Ideally we would have been able to put as much distance as possible
// between requesting the data - a call to host() - and accessing the data
// - a call to wait() - but a local view of the computation graph
// in eager mode makes that optimization non-trivial.
// This wait() will be a no-op if no hazards are detected, including the
// obvious, yet important, special case of 'self' being an empty tensor.
Future::Payload v_self_payload = v_self_future.wait();
memcpy(
v_self_payload.get(),
cpu_src.contiguous().data_ptr<float>(),
std::min(src.nbytes(), self.nbytes()));
}
}
// Vulkan -> Vulkan
// Vulkan -> X
else if (at::kVulkan == src.device().type()) {
api::Command::Buffer command_buffer = api::context()->command().pool.allocate();
command_buffer.begin();
command_buffer.copy(
// - Read-only access is implied on const tensors. Memory barriers
// are automatically inserted if a RAW hazard is detected.
// - Recording any potential pending sync operations into the same
// command buffer prevents an expensive queue submission.
convert(src).buffer(
command_buffer,
vTensor::Stage::Transfer),
// - Write-only access never triggers a sync as the contents will be
// overwritten regardless. Having said that, appropriate barriers
// are inserted automatically if WAR or WAW hazards are detected.
// - Recording pending sync operations into the same command buffer
// prevents an expensive queue submission.
v_self.buffer(
command_buffer,
vTensor::Stage::Transfer,
vTensor::Access::Write));
command_buffer.end();
command_buffer.submit(api::context()->gpu().queue);
}
else {
TORCH_INTERNAL_ASSERT(false, "Unsupported!");
}
}
// Vulkan -> X
else if (at::kVulkan == src.device().type()) {
const vTensor& v_src = convert(src);
{
// Similar notes as above apply, with the additional consideration of
// potential syncs on read accesses. Namely,
// - on discrete systems, if the (staging, buffer, image) trio, or
// - on UMA, if the (buffer, image) duo
// have gone out of sync as a result of one processor writing to one
// resource which is then accessed as another resource type on either
// the same or another processor. The same considerations regarding hazard
// avoidance as above apply.
using Future = vTensor::Future<const void, vTensor::Access::Read>;
const Future v_src_future = v_src.host<const void>();
const vTensor& v_src = convert(src);
// Vulkan -> CPU
if (at::kCPU == self.device().type()) {
if (self.device().is_cpu()) {
// Similar notes as above apply, with the additional consideration of
// potential syncs on read accesses. Namely,
// - on discrete systems, if the (staging, buffer, image) trio, or
// - on UMA, if the (buffer, image) duo
// have gone out of sync as a result of one processor writing to one
// resource which is then accessed as another resource type on either
// the same or another processor. The same considerations regarding hazard
// avoidance as above apply.
using Future = vTensor::Future<const void, vTensor::Access::Read>;
const Future v_src_future = v_src.host<const void>(command_buffer);
// Ideally we would have been able to put as much distance as possible
// between requesting the data - a call to host() - and accessing the data
// - a call to wait() - but a local view of the computation graph
// in eager mode makes that optimization non-trivial.
// This wait() is a no-op if data is not out of sync. More often than
// not though, waits here are expected as the GPU catches up with
// compute submitted from CPU.
@ -99,51 +108,56 @@ Tensor& copy_(Tensor& self, const Tensor& src) {
std::min(src.nbytes(), self.nbytes()));
}
else {
TORCH_INTERNAL_ASSERT(false, "Unsupported!");
TORCH_CHECK(false, "Unsupported!");
}
//
// WARNING
//
// This is not great. We almost never want to flush the GPU pipeline as
// that has far reaching consequences, especially if PyTorch is not the only
// process accessing the GPU. If we have done our job properly, the above
// synchronization mechanisms should be enough to ensure correctness at a more
// modest cost, as there is no need to flush the entirety of jobs in flight
// if one is only interested in waiting on computation affecting a single
// tensor to finish.
//
// Having said that, we still do need to release all pool resources at one
// point per inference run or we will run out of memory otherwise. There is
// no perfect answer to this problem that checks all boxes, which leaves us
// with one of several design decisions:
//
// 1) Use graph mode to gain an understanding of the computation graph,
// itself allowing us to place pool purges intelligently. Best option
// for performance and memory consumption. Not without its downsides if
// flexibility is a top priority.
// 2) If on eager mode, and hence seeing operations one at a time, expose
// this release of resources to the user as a Python / C++ function. This
// makes for suboptimal user experience but is efficient in terms of
// performance.
// 3) If on eager mode, and interested in keeping this bookkeeping transparent
// to the user, release all resources somewhere ... like here. This is
// not ideal since it requires a pipeline flush to make sure these objects
// are not already in use by a workload in flight. Cannot do much better
// within the constraints of this approach. Good for user experience,
// suboptimal for performance.
// 4) If on eager mode, and interested in keeping this bookkeeping transparent
// to the user, and performance does not matter, make CPU and GPU run in
// lockstep. Obviously this is just bad. Mentioned for the sake of
// completeness.
context->flush();
}
else {
TORCH_INTERNAL_ASSERT(
false,
"Invalid code path taken! Either the source or the destination tensor "
"was expected to be Vulkan a tensor! Incorrect dispatch?");
}
//
// WARNING
//
// This is not great. We almost never want to flush the GPU pipeline as
// that has far reaching consequences, especially if PyTorch is not the only
// process accessing the GPU. If we have done our job properly, the above
// synchronization mechanisms should be enough to ensure correctness at a more
// modest cost, as there is no need to flush the entirety of jobs in flight
// if one is only interested in waiting on computation affecting a single
// tensor to finish.
//
// Having said that, we still do need to release all pool resources at one
// point per inference run or we will run out of memory otherwise. There is
// no perfect answer to this problem that checks all boxes, which leaves us
// with one of several design decisions:
//
// 1) Use graph mode to gain an understanding of the computation graph,
// itself allowing us to place pool purges intelligently. Best option
// for performance and memory consumption. Not without its downsides if
// flexibility is a top priority.
// 2) If on eager mode, and hence seeing operations one at a time, expose
// this release of resources to the user as a Python / C++ function. This
// makes for suboptimal user experience but is efficient in terms of
// performance.
// 3) If on eager mode, and interested in keeping this bookkeeping transparent
// to the user, release all resources somewhere ... like here. This is
// not ideal since it requires a pipeline flush to make sure these objects
// are not already in use by a workload in flight. Cannot do much better
// within the constraints of this approach. Good for user experience,
// suboptimal for performance.
// 4) If on eager mode, and interested in keeping this bookkeeping transparent
// to the user, and performance does not matter, make CPU and GPU run in
// lockstep. Obviously this is just bad. Mentioned for the sake of
// completeness.
api::context()->flush();
}
else {
TORCH_INTERNAL_ASSERT(false, "Unsupported!");
}
// No queue submission here. All queue submissions must have been handled
// above either explicitly or as a result of calling tensor.host().
return self;
}
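
For readability, the two host-access patterns used in copy_() above reduce to roughly the following. This is a condensed sketch assuming the vTensor::host(command_buffer) overloads added in this diff; the helper names write_from_cpu and read_to_cpu are hypothetical, and error handling and dtype checks are omitted:

// CPU -> Vulkan: write-only host access records WAR/WAW barriers into
// command_buffer if needed but never waits on the GPU.
void write_from_cpu(
    vTensor& v_dst,
    const Tensor& cpu_src,
    api::Command::Buffer& command_buffer) {
  using Future = vTensor::Future<void, vTensor::Access::Write>;
  Future future = v_dst.host<void, vTensor::Access::Write>(command_buffer);
  Future::Payload payload = future.wait();  // no-op unless a hazard was detected
  memcpy(
      payload.get(),
      cpu_src.contiguous().data_ptr<float>(),
      std::min(cpu_src.nbytes(), v_dst.nbytes()));
}

// Vulkan -> CPU: read access may trigger staging/buffer/image sync, and the
// wait() below blocks while the GPU catches up with previously submitted work.
void read_to_cpu(
    const vTensor& v_src,
    Tensor& cpu_dst,
    api::Command::Buffer& command_buffer) {
  using Future = vTensor::Future<const void, vTensor::Access::Read>;
  const Future future = v_src.host<const void>(command_buffer);
  Future::Payload payload = future.wait();
  memcpy(
      cpu_dst.data_ptr<float>(),
      payload.get(),
      std::min(v_src.nbytes(), cpu_dst.nbytes()));
}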

View File

@ -52,11 +52,11 @@ Tensor mean(
v_input.options(),
};
api::Command::Buffer command_buffer = context->command().pool.allocate();
command_buffer.begin();
api::Command::Pool& command_pool = context->command().pool;
api::Command::Buffer& command_buffer = command_pool.stream();
{
if (v_input.has_image()) {
const struct {
if C10_LIKELY(v_input.has_image()) {
const struct Block final {
uvec3 extents;
int32_t range;
ivec2 iextents;
@ -71,63 +71,35 @@ Tensor mean(
},
};
if (keepdim) {
context->dispatch(
command_buffer,
{
VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
},
VK_KERNEL(mean),
v_output.extents(),
// Write-only access bypasses synchronization but inserts appropriate
// barriers if necessary.
v_output.image(
command_buffer,
vTensor::Stage::Compute,
vTensor::Access::Write),
// Read-only access is implied on const tensors and triggers an async
// synchronization if necessary.
v_input.image(
command_buffer,
vTensor::Stage::Compute),
// Object lifetime is managed by the resource pool.
// It is OK not to keep track of the handle.
context->resource().pool.uniform(block).object);
}
else {
context->dispatch(
command_buffer,
{
VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
},
VK_KERNEL(mean2d),
v_output.extents(),
// Write-only access bypasses synchronization but inserts appropriate
// barriers if necessary.
v_output.image(
command_buffer,
vTensor::Stage::Compute,
vTensor::Access::Write),
// Read-only access is implied on const tensors and triggers an async
// synchronization if necessary.
v_input.image(
command_buffer,
vTensor::Stage::Compute),
// Object lifetime is managed by the resource pool.
// It is OK not to keep track of the handle.
context->resource().pool.uniform(block).object);
}
context->dispatch(
command_buffer,
{
VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
},
keepdim ? VK_KERNEL(mean) : VK_KERNEL(mean2d),
v_output.extents(),
// Write-only access bypasses synchronization but inserts appropriate
// barriers if necessary.
v_output.image(
command_buffer,
vTensor::Stage::Compute,
vTensor::Access::Write),
// Read-only access is implied on const tensors and triggers an async
// synchronization if necessary.
v_input.image(
command_buffer,
vTensor::Stage::Compute),
// Object lifetime is managed by the resource pool.
// It is OK not to keep track of the handle.
context->resource().pool.uniform(block).object);
}
else {
TORCH_CHECK(false, "Not implemented!");
}
}
command_buffer.end();
command_buffer.submit(context->gpu().queue);
command_pool.submit(context->gpu().queue, command_buffer);
return convert(v_output);
}

View File

@ -10,18 +10,21 @@ namespace {
using namespace api::utils;
vTensor pack_weights(
api::Resource::Pool& pool,
const Tensor& weight_arg) {
api::Resource::Pool& pool,
const Tensor& weight_arg) {
if (weight_arg.is_vulkan()) {
return convert(weight_arg);
}
api::Context* const context = api::context();
api::Command::Buffer& command_buffer = context->command().pool.stream();
const Tensor weight = weight_arg.contiguous();
const IntArrayRef w_sizes = weight.sizes();
const float* const src_weight_ptr = weight.data_ptr<float>();
vTensor v_weight{
api::context(),
context,
&pool,
w_sizes,
weight.options(),
@ -29,7 +32,7 @@ vTensor pack_weights(
{
using Future = vTensor::Future<void, vTensor::Access::Write>;
Future v_weight_future = v_weight.host<void, vTensor::Access::Write>();
Future v_weight_future = v_weight.host<void, vTensor::Access::Write>(command_buffer);
Future::Payload v_weight_payload = v_weight_future.wait();
memcpy(
@ -49,16 +52,21 @@ vTensor pack_biases(
return convert(*bias_arg);
}
api::Context* const context = api::context();
api::Command::Buffer& command_buffer = context->command().pool.stream();
vTensor v_bias{
api::context(),
context,
&pool,
{weight_arg.sizes()[Layout::Parameter::width]},
{
weight_arg.size(Layout::Parameter::width),
},
weight_arg.options(),
};
{
using Future = vTensor::Future<void, vTensor::Access::Write>;
Future v_bias_future = v_bias.host<void, vTensor::Access::Write>();
Future v_bias_future = v_bias.host<void, vTensor::Access::Write>(command_buffer);
Future::Payload v_bias_payload = v_bias_future.wait();
if (bias_arg) {
@ -66,7 +74,8 @@ vTensor pack_biases(
v_bias_payload.get(),
bias_arg->contiguous().data_ptr<float>(),
std::min(bias_arg->nbytes(), v_bias.nbytes()));
} else {
}
else {
memset(
v_bias_payload.get(),
// 2's complement integers and IEEE-754 floating point numbers both
@ -162,11 +171,11 @@ Tensor mm(
mat1.options(),
};
api::Command::Buffer command_buffer = context->command().pool.allocate();
command_buffer.begin();
api::Command::Pool& command_pool = context->command().pool;
api::Command::Buffer& command_buffer = command_pool.stream();
{
if (v_mat1.has_image() && v_mat2.has_image()) {
const struct {
if C10_LIKELY(v_mat1.has_image() && v_mat2.has_image()) {
const struct Block final {
uvec3 size;
int32_t K;
} block {
@ -203,12 +212,12 @@ Tensor mm(
// Object lifetime is managed by the resource pool.
// It is OK not to keep track of the handle.
context->resource().pool.uniform(block).object);
} else {
}
else {
TORCH_CHECK(false, "Not implemented!");
}
}
command_buffer.end();
command_buffer.submit(context->gpu().queue);
command_pool.submit(context->gpu().queue, command_buffer);
return convert(v_output);
}
@ -281,14 +290,15 @@ Tensor LinearOpContext::run(
input.options(),
};
api::Command::Buffer command_buffer = context->command().pool.allocate();
command_buffer.begin();
api::Command::Pool& command_pool = context->command().pool;
api::Command::Buffer& command_buffer = command_pool.stream();
{
if (v_output.has_image() &&
if C10_LIKELY(
v_output.has_image() &&
v_input.has_image() &&
packed_.v_weight.has_image() &&
packed_.v_bias.has_image()) {
const struct {
const struct Block final {
uvec3 size;
int32_t K;
vec2 multiplier;
@ -341,8 +351,7 @@ Tensor LinearOpContext::run(
TORCH_CHECK(false, "Not implemented!");
}
}
command_buffer.end();
command_buffer.submit(context->gpu().queue);
command_pool.submit(context->gpu().queue, command_buffer);
return convert(v_output);
}

View File

@ -23,11 +23,11 @@ Tensor mul_scalar(
v_self.options(),
};
api::Command::Buffer command_buffer = context->command().pool.allocate();
command_buffer.begin();
api::Command::Pool& command_pool = context->command().pool;
api::Command::Buffer& command_buffer = command_pool.stream();
{
if (v_output.has_image() && v_self.has_image()) {
const struct {
if C10_LIKELY(v_output.has_image() && v_self.has_image()) {
const struct Block final {
uvec3 extents;
float other;
} block {
@ -63,8 +63,7 @@ Tensor mul_scalar(
TORCH_CHECK(false, "Not implemented!");
}
}
command_buffer.end();
command_buffer.submit(context->gpu().queue);
command_pool.submit(context->gpu().queue, command_buffer);
return convert(v_output);
}
@ -80,11 +79,11 @@ Tensor& mul_scalar_(
vTensor& v_self = convert(self);
api::Command::Buffer command_buffer = context->command().pool.allocate();
command_buffer.begin();
api::Command::Pool& command_pool = context->command().pool;
api::Command::Buffer& command_buffer = command_pool.stream();
{
if (v_self.has_image()) {
const struct {
if C10_LIKELY(v_self.has_image()) {
const struct Block final {
uvec3 extents;
float other;
} block {
@ -114,8 +113,7 @@ Tensor& mul_scalar_(
TORCH_CHECK(false, "Not implemented!");
}
}
command_buffer.end();
command_buffer.submit(context->gpu().queue);
command_pool.submit(context->gpu().queue, command_buffer);
return self;
}

View File

@ -33,10 +33,10 @@ Tensor adaptive_avg_pool2d(
v_self.options(),
};
api::Command::Buffer command_buffer = context->command().pool.allocate();
command_buffer.begin();
api::Command::Pool& command_pool = context->command().pool;
api::Command::Buffer& command_buffer = command_pool.stream();
{
if (v_self.has_image()) {
if C10_LIKELY(v_self.has_image()) {
const uvec3 v_output_size = v_output.extents();
const uvec3 v_self_size = v_self.extents();
@ -45,7 +45,7 @@ Tensor adaptive_avg_pool2d(
static_cast<float>(v_self_size.data[1u]) / v_output_size.data[1u],
};
const struct {
const struct Block final {
uvec3 size;
uint32_t _;
vec2 stride;
@ -88,8 +88,7 @@ Tensor adaptive_avg_pool2d(
TORCH_CHECK(false, "Not implemented!");
}
}
command_buffer.end();
command_buffer.submit(context->gpu().queue);
command_pool.submit(context->gpu().queue, command_buffer);
return convert(v_output);
}
@ -171,13 +170,11 @@ Tensor avg_pool2d(
v_self.options(),
};
api::Command::Buffer command_buffer = context->command().pool.allocate();
command_buffer.begin();
api::Command::Pool& command_pool = context->command().pool;
api::Command::Buffer& command_buffer = command_pool.stream();
{
using namespace utils;
if (v_self.has_image()) {
const struct {
if C10_LIKELY(v_self.has_image()) {
const struct Block final {
uvec3 extents;
int32_t range;
ivec2 iextents;
@ -235,8 +232,7 @@ Tensor avg_pool2d(
TORCH_CHECK(false, "Not implemented!");
}
}
command_buffer.end();
command_buffer.submit(context->gpu().queue);
command_pool.submit(context->gpu().queue, command_buffer);
return convert(v_output);
}

View File

@ -21,8 +21,8 @@ Tensor view(
self.options(),
};
api::Command::Buffer command_buffer = context->command().pool.allocate();
command_buffer.begin();
api::Command::Pool& command_pool = context->command().pool;
api::Command::Buffer& command_buffer = command_pool.stream();
{
command_buffer.copy(
// Read-only access is implied on const tensors and triggers an async
@ -37,8 +37,7 @@ Tensor view(
vTensor::Stage::Transfer,
vTensor::Access::Write));
}
command_buffer.end();
command_buffer.submit(context->gpu().queue);
command_pool.submit(context->gpu().queue, command_buffer);
return convert(v_output);
}

View File

@ -419,31 +419,19 @@ vTensor::vTensor(
}) {
}
const vTensor* vTensor::host() const {
view_->staging(Stage::Host, Access::Read);
const vTensor* vTensor::host(
api::Command::Buffer& command_buffer) const {
view_->staging(command_buffer, Stage::Host, Access::Read);
return this;
}
vTensor* vTensor::host(const Access::Flags access) {
view_->staging(Stage::Host, access);
vTensor* vTensor::host(
api::Command::Buffer& command_buffer,
const Access::Flags access) {
view_->staging(command_buffer, Stage::Host, access);
return this;
}
vTensor::Buffer::Object vTensor::buffer(
const Stage::Flags stage) const & {
return view_->buffer(
stage,
Access::Read).object;
}
vTensor::Buffer::Object vTensor::buffer(
const Stage::Flags stage,
const Access::Flags access) & {
return view_->buffer(
stage,
access).object;
}
vTensor::Buffer::Object vTensor::buffer(
api::Command::Buffer& command_buffer,
const Stage::Flags stage) const & {
@ -463,21 +451,6 @@ vTensor::Buffer::Object vTensor::buffer(
access).object;
}
vTensor::Image::Object vTensor::image(
const Stage::Flags stage) const & {
return view_->image(
stage,
Access::Read).object;
}
vTensor::Image::Object vTensor::image(
const Stage::Flags stage,
const Access::Flags access) & {
return view_->image(
stage,
access).object;
}
vTensor::Image::Object vTensor::image(
api::Command::Buffer& command_buffer,
const Stage::Flags stage) const & {
@ -535,16 +508,8 @@ vTensor::View::View(
ops::verify(options);
}
// We typically do not know whether we need a command buffer to service a request
// until we have performed a bunch of checks in nested logic, and even then we
// may end up with the always-issued state transition optimized away under
// certain conditions, which makes a policy of always allocating a command buffer
// up front, only to end up using it some of the time, a wasteful approach. This class
// answers that need.
class vTensor::View::CMD final {
public:
explicit CMD(const View&);
CMD(const View&, api::Command::Buffer&);
CMD(const CMD&) = delete;
CMD& operator=(const CMD&) = delete;
@ -578,60 +543,18 @@ class vTensor::View::CMD final {
const Image::Object& image,
Buffer::Object& buffer);
void submit(Fence fence = {});
private:
api::Command::Buffer& command_buffer();
void submit(Fence fence);
private:
const View& view_;
enum class Type {
Internal,
External,
} type;
union _ final {
api::Command::Buffer internal;
api::Command::Buffer* external;
~_() {}
} command_buffer_;
api::Command::Buffer& command_buffer_;
};
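
Read together, the hunks above strip CMD down to a thin wrapper over an externally provided command buffer; the lazy internal-allocation path and its union bookkeeping are gone now that every caller supplies the pool's stream. Roughly, as a condensed reading of this diff rather than the verbatim header:

class vTensor::View::CMD final {
 public:
  CMD(const View&, api::Command::Buffer&);
  CMD(const CMD&) = delete;
  CMD& operator=(const CMD&) = delete;

  // The barrier(), copy_*(), and dispatch helpers (signatures unchanged,
  // elided here) now record straight into command_buffer_.
  void submit(Fence fence);

 private:
  const View& view_;
  api::Command::Buffer& command_buffer_;
};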
vTensor::View::CMD::CMD(
const View& view)
: view_(view),
type(Type::Internal),
command_buffer_{} {
}
vTensor::View::CMD::CMD(
const View& view,
api::Command::Buffer& external)
api::Command::Buffer& command_buffer)
: view_(view),
type(Type::External),
command_buffer_{
.external = &external,
} {
}
api::Command::Buffer& vTensor::View::CMD::command_buffer() {
switch (type) {
case Type::Internal:
if (!command_buffer_.internal) {
command_buffer_.internal = view_.context_->command().pool.allocate();
command_buffer_.internal.begin();
}
return command_buffer_.internal;
case Type::External:
return *(command_buffer_.external);
default:
TORCH_INTERNAL_ASSERT(false, "Unknown command buffer type!");
break;
}
command_buffer_(command_buffer) {
}
void vTensor::View::CMD::barrier(State::Transition transition) {
@ -761,7 +684,7 @@ void vTensor::View::CMD::barrier(State::Transition transition) {
barrier.stage.src = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT;
}
command_buffer().barrier(barrier);
command_buffer_.barrier(barrier);
}
}
@ -789,7 +712,7 @@ void vTensor::View::CMD::copy_buffer_to_staging(
{},
}));
command_buffer().copy(buffer, staging);
command_buffer_.copy(buffer, staging);
}
void vTensor::View::CMD::copy_staging_to_buffer(
@ -816,7 +739,7 @@ void vTensor::View::CMD::copy_staging_to_buffer(
{},
}));
command_buffer().copy(staging, buffer);
command_buffer_.copy(staging, buffer);
}
void vTensor::View::CMD::copy_buffer_to_image(
@ -847,7 +770,7 @@ void vTensor::View::CMD::copy_buffer_to_image(
const uvec3 extents = view_.extents();
const uint32_t plane = extents.data[0u] * extents.data[1u];
const struct {
const struct Block final {
uvec3 extents;
uint32_t block;
uvec4 offset;
@ -863,7 +786,7 @@ void vTensor::View::CMD::copy_buffer_to_image(
};
view_.context_->dispatch(
command_buffer(),
command_buffer_,
{
VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
@ -904,7 +827,7 @@ void vTensor::View::CMD::copy_image_to_buffer(
const uvec3 extents = view_.extents();
const uint32_t plane = extents.data[0u] * extents.data[1u];
const struct {
const struct Block final {
uvec3 extents;
uint32_t block;
uvec4 offset;
@ -920,7 +843,7 @@ void vTensor::View::CMD::copy_image_to_buffer(
};
view_.context_->dispatch(
command_buffer(),
command_buffer_,
{
VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
@ -934,10 +857,10 @@ void vTensor::View::CMD::copy_image_to_buffer(
}
void vTensor::View::CMD::submit(const api::Resource::Fence fence) {
if ((Type::Internal == type) && command_buffer_.internal) {
command_buffer_.internal.end();
command_buffer_.internal.submit(view_.context_->gpu().queue, fence);
}
view_.context_->command().pool.submit(
view_.context_->gpu().queue,
command_buffer_,
fence);
}
vTensor::Buffer& vTensor::View::buffer() const {
@ -953,38 +876,28 @@ vTensor::Buffer& vTensor::View::buffer() const {
}
vTensor::Buffer& vTensor::View::buffer(
api::Command::Buffer& command_buffer,
const Stage::Flags stage,
const Access::Flags access) const {
CMD command_buffer(*this);
Buffer& buffer = this->buffer(command_buffer, stage, access);
command_buffer.submit();
return buffer;
CMD cmd(*this, command_buffer);
return buffer(cmd, stage, access);
}
vTensor::Buffer& vTensor::View::buffer(
api::Command::Buffer& command_buffer_,
const Stage::Flags stage,
const Access::Flags access) const {
CMD command_buffer(*this, command_buffer_);
return buffer(command_buffer, stage, access);
}
vTensor::Buffer& vTensor::View::buffer(
CMD& command_buffer,
CMD& cmd,
const Stage::Flags stage,
const Access::Flags access) const {
if ((access & Access::Read) && state_.is_dirty(Component::Buffer)) {
if (state_.is_clean(Component::Staging)) {
command_buffer.copy_staging_to_buffer(
cmd.copy_staging_to_buffer(
state_,
staging(command_buffer, Stage::Transfer, Access::Read).object,
staging(cmd, Stage::Transfer, Access::Read).object,
buffer().object);
}
else if (state_.is_clean(Component::Image)) {
command_buffer.copy_image_to_buffer(
cmd.copy_image_to_buffer(
state_,
image(command_buffer, Stage::Compute, Access::Read).object,
image(cmd, Stage::Compute, Access::Read).object,
buffer().object);
}
else {
@ -994,7 +907,7 @@ vTensor::Buffer& vTensor::View::buffer(
}
}
command_buffer.barrier(
cmd.barrier(
state_.transition({
// Staging
{},
@ -1028,35 +941,25 @@ vTensor::Image& vTensor::View::image() const {
}
vTensor::Image& vTensor::View::image(
api::Command::Buffer& command_buffer,
const Stage::Flags stage,
const Access::Flags access) const {
CMD command_buffer(*this);
Image& image = this->image(command_buffer, stage, access);
command_buffer.submit();
return image;
CMD cmd(*this, command_buffer);
return image(cmd, stage, access);
}
vTensor::Image& vTensor::View::image(
api::Command::Buffer& command_buffer_,
const Stage::Flags stage,
const Access::Flags access) const {
CMD command_buffer(*this, command_buffer_);
return image(command_buffer, stage, access);
}
vTensor::Image& vTensor::View::image(
CMD& command_buffer,
CMD& cmd,
const Stage::Flags stage,
const Access::Flags access) const {
if ((access & Access::Read) && state_.is_dirty(Component::Image)) {
command_buffer.copy_buffer_to_image(
cmd.copy_buffer_to_image(
state_,
buffer(command_buffer, stage, Access::Read).object,
buffer(cmd, stage, Access::Read).object,
image().object);
}
command_buffer.barrier(
cmd.barrier(
state_.transition({
// Staging
{},
@ -1096,27 +999,28 @@ vTensor::Buffer& vTensor::View::staging() const {
}
vTensor::Buffer& vTensor::View::staging(
api::Command::Buffer& command_buffer,
const Stage::Flags stage,
const Access::Flags access) const {
CMD command_buffer(*this);
Buffer& staging = this->staging(command_buffer, stage, access);
command_buffer.submit(fence());
CMD cmd(*this, command_buffer);
Buffer& staging = this->staging(cmd, stage, access);
cmd.submit(fence(access));
return staging;
}
vTensor::Buffer& vTensor::View::staging(
CMD& command_buffer,
CMD& cmd,
const Stage::Flags stage,
const Access::Flags access) const {
if ((access & Access::Read) && state_.is_dirty(Component::Staging)) {
command_buffer.copy_buffer_to_staging(
cmd.copy_buffer_to_staging(
state_,
buffer(command_buffer, Stage::Transfer, Access::Read).object,
buffer(cmd, Stage::Transfer, Access::Read).object,
staging().object);
}
command_buffer.barrier(
cmd.barrier(
state_.transition({
// Staging
{
@ -1138,6 +1042,14 @@ vTensor::Buffer& vTensor::View::staging(
return staging();
}
vTensor::Fence& vTensor::View::fence(const Access::Flags access) const {
if (access & Access::Read) {
fence_ = allocate_fence(&context_->resource().pool);
}
return fence_;
}
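
The fence() -> fence(access) change means a new fence is only allocated for host reads; write-only host access submits without allocating one, so its wait() generally does not block the CPU on the GPU. A hedged, usage-level sketch (v_tensor and command_buffer are illustrative, with types as used elsewhere in this diff):

// Write-only host access: no new fence is allocated; wait() typically returns
// immediately, apart from any hazard barriers recorded earlier.
auto w_future = v_tensor.host<float, vTensor::Access::Write>(command_buffer);
auto w_payload = w_future.wait();

// Read access (const path): staging() submits with a fence, and wait() blocks
// until the GPU has actually produced the data.
const vTensor& cv_tensor = v_tensor;
auto r_future = cv_tensor.host<const float>(command_buffer);
auto r_payload = r_future.wait();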
vTensor::Memory& vTensor::View::wait() const {
if (fence_) {
fence_.wait();
@ -1146,10 +1058,6 @@ vTensor::Memory& vTensor::View::wait() const {
return staging().memory;
}
vTensor::Fence& vTensor::View::fence() const {
return (fence_ = allocate_fence(pool_));
}
void vTensor::View::verify() const {
TORCH_INTERNAL_ASSERT(!image_ || state_.is_available(Component::Image));
TORCH_INTERNAL_ASSERT(!staging_ || state_.is_discrete());

View File

@ -157,10 +157,10 @@ class vTensor final {
*/
template<typename Type>
Future<Type, Access::Read> host() const &;
Future<Type, Access::Read> host(api::Command::Buffer&) const &;
template<typename Type, Access::Flags kAccess>
Future<Type, kAccess> host() &;
Future<Type, kAccess> host(api::Command::Buffer&) &;
/*
Device access - these functions will be expensive if they trigger a buffer
@ -178,14 +178,10 @@ class vTensor final {
predictability of usage and efficiency.
*/
Buffer::Object buffer(Stage::Flags) const &;
Buffer::Object buffer(Stage::Flags, Access::Flags) &;
Buffer::Object buffer(api::Command::Buffer&, Stage::Flags) const &;
Buffer::Object buffer(api::Command::Buffer&, Stage::Flags, Access::Flags) &;
bool has_image() const;
Image::Object image(Stage::Flags) const &;
Image::Object image(Stage::Flags, Access::Flags) &;
Image::Object image(api::Command::Buffer&, Stage::Flags) const &;
Image::Object image(api::Command::Buffer&, Stage::Flags, Access::Flags) &;
@ -210,26 +206,22 @@ class vTensor final {
Host
*/
const vTensor* host() const;
vTensor* host(Access::Flags access);
const vTensor* host(api::Command::Buffer&) const;
vTensor* host(api::Command::Buffer&, Access::Flags);
template<typename Type>
Future<Type, Access::Read> host() const && = delete;
Future<Type, Access::Read> host(api::Command::Buffer&) const && = delete;
template<typename Type, Access::Flags kAccess>
Future<Type, kAccess> host() && = delete;
Future<Type, kAccess> host(api::Command::Buffer&) && = delete;
/*
Device
*/
Buffer::Object buffer(Stage::Flags) const && = delete;
Buffer::Object buffer(Stage::Flags, Access::Flags) && = delete;
Buffer::Object buffer(api::Command::Buffer&, Stage::Flags) const && = delete;
Buffer::Object buffer(api::Command::Buffer&, Stage::Flags, Access::Flags) && = delete;
Image::Object image(Stage::Flags) const && = delete;
Image::Object image(Stage::Flags, Access::Flags) && = delete;
Image::Object image(api::Command::Buffer&, Stage::Flags) const && = delete;
Image::Object image(api::Command::Buffer&, Stage::Flags, Access::Flags) && = delete;
@ -249,21 +241,22 @@ class vTensor final {
~View() = default;
/*
Device
Buffer
*/
Buffer& buffer(Stage::Flags, Access::Flags) const;
Buffer& buffer(api::Command::Buffer&, Stage::Flags, Access::Flags) const;
/*
Image
*/
bool has_image() const;
Image& image(Stage::Flags, Access::Flags) const;
Image& image(api::Command::Buffer&, Stage::Flags, Access::Flags) const;
/*
Host
*/
Buffer& staging(Stage::Flags, Access::Flags) const;
Buffer& staging(api::Command::Buffer&, Stage::Flags, Access::Flags) const;
vTensor::Memory& wait() const;
@ -343,7 +336,7 @@ class vTensor final {
Image& image(CMD&, Stage::Flags, Access::Flags) const;
Buffer& staging() const;
Buffer& staging(CMD&, Stage::Flags, Access::Flags) const;
Fence& fence() const;
Fence& fence(Access::Flags) const;
// Validation
void verify() const;
@ -485,13 +478,15 @@ vTensor::Future<Type, kAccess>::wait() const & {
}
template<typename Type>
inline vTensor::Future<Type, vTensor::Access::Read> vTensor::host() const & {
return Future<Type, vTensor::Access::Read>(host());
inline vTensor::Future<Type, vTensor::Access::Read>
vTensor::host(api::Command::Buffer& command_buffer) const & {
return Future<Type, vTensor::Access::Read>(host(command_buffer));
}
template<typename Type, vTensor::Access::Flags kAccess>
inline vTensor::Future<Type, kAccess> vTensor::host() & {
return Future<Type, kAccess>(host(kAccess));
inline vTensor::Future<Type, kAccess>
vTensor::host(api::Command::Buffer& command_buffer) & {
return Future<Type, kAccess>(host(command_buffer, kAccess));
}
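
The header changes above remove every accessor overload that did not take a command buffer; call sites migrate roughly as follows (an illustrative sketch assuming the signatures shown in this diff):

// Before: the tensor could allocate and submit its own command buffer internally.
//   auto future = v_tensor.host<float, vTensor::Access::Write>();
//
// After: the caller supplies the command buffer that any sync work is recorded
// into, typically the command pool's shared stream.
api::Command::Buffer& command_buffer = api::context()->command().pool.stream();
auto future = v_tensor.host<float, vTensor::Access::Write>(command_buffer);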
inline bool vTensor::has_image() const {

View File

@ -36,11 +36,11 @@ Tensor upsample_nearest2d(
input.options(),
};
api::Command::Buffer command_buffer = context->command().pool.allocate();
command_buffer.begin();
api::Command::Pool& command_pool = context->command().pool;
api::Command::Buffer& command_buffer = command_pool.stream();
{
if (v_input.has_image()) {
const struct {
if C10_LIKELY(v_input.has_image()) {
const struct Block final {
uvec3 extents;
uint32_t _;
ivec2 iextents;
@ -92,8 +92,7 @@ Tensor upsample_nearest2d(
TORCH_CHECK(false, "Not implemented!");
}
}
command_buffer.end();
command_buffer.submit(context->gpu().queue);
command_pool.submit(context->gpu().queue, command_buffer);
return convert(v_output);
}

View File

@ -10,7 +10,7 @@ namespace vulkan {
namespace ops {
namespace utils {
int64_t normalize(
inline int64_t normalize(
const int64_t dimension,
const int64_t n) {
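// Maps any (possibly negative) dimension index into [0, n); for example,
// normalize(-1, 4) == 3 and normalize(5, 4) == 1 (assumes n > 0).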
return (dimension % n + n) % n;