diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt
index fbf664b5bc7..e97b3d368d1 100644
--- a/aten/src/ATen/CMakeLists.txt
+++ b/aten/src/ATen/CMakeLists.txt
@@ -420,7 +420,7 @@ set(ATen_CORE_TEST_SRCS ${ATen_CORE_TEST_SRCS} PARENT_SCOPE)
 set(ATen_HIP_TEST_SRCS ${ATen_HIP_TEST_SRCS} PARENT_SCOPE)
 set(ATen_VULKAN_TEST_SRCS ${ATen_VULKAN_TEST_SRCS} PARENT_SCOPE)
 set(ATen_MOBILE_BENCHMARK_SRCS ${ATen_MOBILE_BENCHMARK_SRCS} PARENT_SCOPE)
-set(ATen_MOBILE_TEST_SRCS ${ATen_VEC256_TEST_SRCS} ${ATen_VULKAN_TEST_SRCS} PARENT_SCOPE)
+set(ATen_MOBILE_TEST_SRCS ${ATen_MOBILE_TEST_SRCS} ${ATen_VULKAN_TEST_SRCS} PARENT_SCOPE)
 set(ATen_QUANTIZED_TEST_SRCS ${ATen_QUANTIZED_TEST_SRCS} PARENT_SCOPE)
 set(ATen_CPU_INCLUDE ${ATen_CPU_INCLUDE} PARENT_SCOPE)
 set(ATen_THIRD_PARTY_INCLUDE ${ATen_THIRD_PARTY_INCLUDE} PARENT_SCOPE)
diff --git a/aten/src/ATen/test/CMakeLists.txt b/aten/src/ATen/test/CMakeLists.txt
index 4f089aad8df..0781eaad1ef 100644
--- a/aten/src/ATen/test/CMakeLists.txt
+++ b/aten/src/ATen/test/CMakeLists.txt
@@ -77,12 +77,13 @@ list(APPEND ATen_HIP_TEST_SRCS
 list(APPEND ATen_VULKAN_TEST_SRCS
   ${CMAKE_CURRENT_SOURCE_DIR}/vulkan_test.cpp)
 
-list(APPEND ATen_VEC256_TEST_SRCS
-  ${CMAKE_CURRENT_SOURCE_DIR}/vec256_test.cpp)
+list(APPEND ATen_MOBILE_TEST_SRCS
+  ${CMAKE_CURRENT_SOURCE_DIR}/vec256_test.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/cpu_caching_allocator_test.cpp)
 
 # ---[ Send the lists to the parent scope.
 set(ATen_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS} PARENT_SCOPE)
 set(ATen_CUDA_TEST_SRCS ${ATen_CUDA_TEST_SRCS} PARENT_SCOPE)
 set(ATen_HIP_TEST_SRCS ${ATen_HIP_TEST_SRCS} PARENT_SCOPE)
 set(ATen_VULKAN_TEST_SRCS ${ATen_VULKAN_TEST_SRCS} PARENT_SCOPE)
-set(ATen_VEC256_TEST_SRCS ${ATen_VEC256_TEST_SRCS} PARENT_SCOPE)
+set(ATen_MOBILE_TEST_SRCS ${ATen_MOBILE_TEST_SRCS} PARENT_SCOPE)
diff --git a/aten/src/ATen/test/cpu_caching_allocator_test.cpp b/aten/src/ATen/test/cpu_caching_allocator_test.cpp
new file mode 100644
index 00000000000..28a9b047652
--- /dev/null
+++ b/aten/src/ATen/test/cpu_caching_allocator_test.cpp
@@ -0,0 +1,50 @@
+#include <gtest/gtest.h>
+
+#include <ATen/ATen.h>
+#include <ATen/Utils.h>
+
+#include <c10/core/CPUCachingAllocator.h>
+
+TEST(CPUCachingAllocatorTest, check_alloc_free) {
+  c10::CPUCachingAllocator caching_allocator;
+  c10::WithCPUCachingAllocatorGuard caching_allocator_guard(
+      &caching_allocator);
+  at::Tensor a = at::rand({23, 23});
+  float* data_ptr = a.data_ptr<float>();
+  a.reset();
+  a = at::rand({23, 23});
+  ASSERT_TRUE(data_ptr == a.data_ptr<float>());
+}
+
+// This should just free the pointer correctly.
+TEST(CPUCachingAllocatorTest, check_alloc_outside_free_inside) {
+  c10::CPUCachingAllocator caching_allocator;
+  at::Tensor a = at::rand({23, 23});
+  {
+    c10::WithCPUCachingAllocatorGuard caching_allocator_guard(
+        &caching_allocator);
+    float* data_ptr = a.data_ptr<float>();
+    a.reset();
+    a = at::rand({23, 23});
+  }
+}
+
+TEST(CPUCachingAllocatorTest, check_alloc_inside_free_outside) {
+  c10::CPUCachingAllocator caching_allocator;
+  at::Tensor a;
+  {
+    c10::WithCPUCachingAllocatorGuard caching_allocator_guard(
+        &caching_allocator);
+    a = at::rand({23, 23});
+  }
+  a.reset();
+}
+
+int main(int argc, char* argv[]) {
+  // At the moment the caching allocator is only exposed to the mobile CPU allocator.
+#ifdef C10_MOBILE
+  ::testing::InitGoogleTest(&argc, argv);
+  at::manual_seed(42);
+  return RUN_ALL_TESTS();
+#endif /* C10_MOBILE */
+}
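Aside (not part of the patch): the tests above install the guard for the whole
test body, but the cache is consulted only by the thread that installed it,
since the active allocator lives in a thread_local pointer (see
CPUCachingAllocator.cpp below). A minimal sketch of that behavior, assuming a
C10_MOBILE build where DefaultMobileCPUAllocator is the active CPU allocator:

  #include <thread>
  #include <ATen/ATen.h>
  #include <c10/core/CPUCachingAllocator.h>

  void caching_is_per_thread() {
    c10::CPUCachingAllocator caching_allocator;
    c10::WithCPUCachingAllocatorGuard guard(&caching_allocator);
    at::Tensor a = at::rand({23, 23}); // goes through the caching allocator
    std::thread worker([] {
      // No guard was installed on this thread, so this allocation takes the
      // plain c10::alloc_cpu()/free_cpu() path.
      at::Tensor b = at::rand({23, 23});
    });
    worker.join();
  }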
diff --git a/binaries/speed_benchmark_torch.cc b/binaries/speed_benchmark_torch.cc
index f6464785e40..db78467cfb4 100644
--- a/binaries/speed_benchmark_torch.cc
+++ b/binaries/speed_benchmark_torch.cc
@@ -24,6 +24,8 @@
 #include "torch/csrc/jit/serialization/import.h"
 #include "torch/script.h"
 
+#include "c10/core/CPUCachingAllocator.h"
+
 #include <chrono>
 using namespace std::chrono;
 
@@ -45,6 +47,10 @@ C10_DEFINE_bool(
     no_inputs,
     false,
     "Whether the model has any input. Will ignore other input arugments if true");
+C10_DEFINE_bool(
+    use_caching_allocator,
+    false,
+    "Whether to cache allocations between inference iterations");
 C10_DEFINE_int(
     use_bundled_input,
     -1,
@@ -198,6 +204,11 @@ int main(int argc, char** argv) {
     std::cout << module.forward(inputs) << std::endl;
   }
 
+  c10::CPUCachingAllocator caching_allocator;
+  c10::optional<c10::WithCPUCachingAllocatorGuard> caching_allocator_guard;
+  if (FLAGS_use_caching_allocator) {
+    caching_allocator_guard.emplace(&caching_allocator);
+  }
   std::cout << "Starting benchmark." << std::endl;
   std::cout << "Running warmup runs." << std::endl;
   CAFFE_ENFORCE(
diff --git a/c10/core/CPUAllocator.cpp b/c10/core/CPUAllocator.cpp
index 79342404624..e830aa4832d 100644
--- a/c10/core/CPUAllocator.cpp
+++ b/c10/core/CPUAllocator.cpp
@@ -1,4 +1,5 @@
 #include <c10/core/CPUAllocator.h>
+#include <c10/core/CPUCachingAllocator.h>
 #include <c10/core/DeviceType.h>
 
 // TODO: rename flags to C10
@@ -154,7 +155,15 @@ class DefaultMobileCPUAllocator final : public at::Allocator {
     }
     // TODO: enable with better TLS support on mobile
     // profiledCPUMemoryReporter().Delete(pointer);
-    c10::free_cpu(pointer);
+    auto allocator_ptr = GetThreadLocalCachingAllocator();
+    if (allocator_ptr != nullptr) {
+      allocator_ptr->free(pointer);
+    } else {
+      c10::free_cpu(pointer);
+      // This adds extra cost to freeing memory in the default case, when the
+      // caching allocator is not enabled.
+      CPUCachingAllocator::record_free(pointer);
+    }
   }
 
   virtual DataPtr allocate(const size_t nbytes) const override {
@@ -168,7 +177,13 @@ class DefaultMobileCPUAllocator final : public at::Allocator {
     }
 
     auto alloc_size = PreGuardBytes + nbytes + PostGuardBytes;
-    void* const data = c10::alloc_cpu(alloc_size);
+    void* data;
+    auto allocator_ptr = GetThreadLocalCachingAllocator();
+    if (allocator_ptr != nullptr) {
+      data = allocator_ptr->allocate(alloc_size);
+    } else {
+      data = c10::alloc_cpu(alloc_size);
+    }
     // profiledCPUMemoryReporter().New(data, alloc_size);
     return {
         reinterpret_cast<uint8_t*>(data) + PreGuardBytes,
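Aside (not part of the patch): the CPUCachingAllocator::record_free() call in
the non-caching branch of deallocate() handles memory that was allocated while
a guard was active but freed after the guard went away. Freeing through
c10::free_cpu() alone would leave a stale entry in the shared allocation_map_,
and if the OS later reused that address, a subsequent free under a guard could
cache the block with the wrong size. A minimal scenario, assuming a C10_MOBILE
build (this is essentially the check_alloc_inside_free_outside test above):

  c10::CPUCachingAllocator caching_allocator;
  at::Tensor t;
  {
    c10::WithCPUCachingAllocatorGuard guard(&caching_allocator);
    t = at::rand({23, 23});  // pointer is recorded in allocation_map_
  }
  // The guard is gone, so this free goes through c10::free_cpu();
  // record_free() then drops the now-dangling allocation_map_ entry.
  t.reset();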
diff --git a/c10/core/CPUCachingAllocator.cpp b/c10/core/CPUCachingAllocator.cpp
new file mode 100644
index 00000000000..2ef63cdb25e
--- /dev/null
+++ b/c10/core/CPUCachingAllocator.cpp
@@ -0,0 +1,105 @@
+#include <c10/core/CPUCachingAllocator.h>
+
+namespace c10 {
+
+namespace {
+thread_local CPUCachingAllocator* caching_allocator_ptr{nullptr};
+} // namespace
+
+std::mutex CPUCachingAllocator::mutex_;
+ska::flat_hash_map<void*, size_t> CPUCachingAllocator::allocation_map_;
+
+inline void* CPUCachingAllocator::allocate_and_cache(const size_t bytes) {
+  void* ptr;
+  try {
+    ptr = c10::alloc_cpu(bytes);
+  } catch (c10::Error& e) {
+    // If allocation fails, try freeing cached available blocks.
+    // For now free all available cached blocks.
+    free_cached();
+    // Further thought: if we ever get here because we ran out of memory,
+    // it may be best to disable caching, since this is likely to happen
+    // again.
+    // Try again.
+    ptr = c10::alloc_cpu(bytes);
+  }
+  allocation_map_[ptr] = bytes;
+  return ptr;
+}
+
+void* CPUCachingAllocator::allocate(const size_t bytes) {
+  std::lock_guard<std::mutex> guard(mutex_);
+  const auto& it = available_map_.find(bytes);
+  if (it == available_map_.end() || it->second.empty()) {
+    return allocate_and_cache(bytes);
+  }
+  return it->second.pop_back_val();
+}
+
+void CPUCachingAllocator::free(void* ptr) {
+  // NB: since we are not really freeing the memory, use cases such as
+  // quantization code freeing original weights on mobile will not quite
+  // work, as we will likely hold on to that memory.
+  // NB: We could also enforce a maximum amount of cached memory for better
+  // memory management, such that free actually frees the memory if we are
+  // nearing or above the watermark.
+  std::lock_guard<std::mutex> guard(mutex_);
+  // If this allocation was done before the caching allocator was enabled,
+  // then free it normally.
+  const auto& it = allocation_map_.find(ptr);
+  if (it == allocation_map_.end()) {
+    c10::free_cpu(ptr);
+    return;
+  }
+  const size_t alloc_size = it->second;
+  available_map_[alloc_size].push_back(ptr);
+}
+
+void CPUCachingAllocator::record_free(void* ptr) {
+  // This function captures the case when the allocated memory
+  // is being freed outside the scope of this allocator.
+  // At the moment the only way to capture this is to have the allocator
+  // that uses this CachingAllocator as the backing allocator
+  // call this function explicitly upon freeing memory while
+  // outside the scope of the caching allocator.
+  // If the memory is freed in some other way, then we will likely
+  // have undefined behavior or a page fault. But this can be
+  // the case without the caching allocator as well.
+  std::lock_guard<std::mutex> guard(mutex_);
+  const auto& it = allocation_map_.find(ptr);
+  if (it != allocation_map_.end()) {
+    allocation_map_.erase(it);
+  }
+}
+
+void CPUCachingAllocator::free_cached() {
+  for (const auto& it : available_map_) {
+    for (const auto ptr : it.second) {
+      c10::free_cpu(ptr);
+      // When cached memory is returned to the OS, it must also be removed
+      // from allocation_map_.
+      allocation_map_.erase(ptr);
+    }
+  }
+  available_map_.clear();
+}
+
+CPUCachingAllocator::~CPUCachingAllocator() {
+  free_cached();
+}
+
+CPUCachingAllocator* GetThreadLocalCachingAllocator() {
+  return caching_allocator_ptr;
+}
+
+WithCPUCachingAllocatorGuard::WithCPUCachingAllocatorGuard(
+    CPUCachingAllocator* allocator) {
+  prev_caching_allocator_ptr_ = GetThreadLocalCachingAllocator();
+  // Install the given allocator as this thread's caching allocator.
+  caching_allocator_ptr = allocator;
+}
+
+WithCPUCachingAllocatorGuard::~WithCPUCachingAllocatorGuard() {
+  caching_allocator_ptr = prev_caching_allocator_ptr_;
+}
+
+} // namespace c10
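Aside (not part of the patch): as the comment in free() notes, "freed" blocks
are only parked in available_map_; nothing is returned to the OS until
free_cached() runs, either because a later allocation failed or because the
CPUCachingAllocator itself is destroyed. A minimal sketch of that lifetime,
assuming a C10_MOBILE build:

  {
    c10::CPUCachingAllocator caching_allocator;
    {
      c10::WithCPUCachingAllocatorGuard guard(&caching_allocator);
      at::Tensor a = at::rand({1024, 1024}); // ~4 MB allocated and recorded
      a.reset();                             // block parked in available_map_, not freed
      at::Tensor b = at::rand({1024, 1024}); // same-size request reuses the cached block
    }
  } // ~CPUCachingAllocator() calls free_cached(), returning the block via c10::free_cpu()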
diff --git a/c10/core/CPUCachingAllocator.h b/c10/core/CPUCachingAllocator.h
new file mode 100644
index 00000000000..c12a9bf4966
--- /dev/null
+++ b/c10/core/CPUCachingAllocator.h
@@ -0,0 +1,85 @@
+#pragma once
+
+#include <algorithm>
+#include <deque>
+#include <memory>
+#include <mutex>
+
+#include <c10/core/CPUAllocator.h>
+#include <c10/util/Exception.h>
+#include <c10/util/SmallVector.h>
+#include <c10/util/flat_hash_map.h>
+
+namespace c10 {
+
+class C10_API CPUCachingAllocator {
+  /*
+   * What it does:
+   * Caches all the allocations carried out by this allocator.
+   * Cache key is the size of the allocation.
+   * If the requested size is found in the cache, returns the cached pointer.
+   * What it does not do:
+   * No speculative allocation for any future allocations.
+   */
+ private:
+  // Invariants.
+  // 1. If memory is ever allocated via this allocator, then
+  //    the pointer will exist in allocation_map_, unless the allocator
+  //    returned the memory to the OS via free_cached.
+  //    1.1. Therefore even when the said memory is "freed" via this
+  //         allocator (and thus cached), it will continue to stay
+  //         in allocation_map_. Furthermore it will also exist in
+  //         available_map_. Thus an allocated memory pointer can be in both
+  //         allocation_map_ and available_map_ simultaneously.
+  // 2. A memory pointer may be removed from allocation_map_ when it
+  //    is freed outside of the scope of this allocator, but was allocated
+  //    by this allocator.
+  // 3. available_map_ only contains memory which was allocated
+  //    by this allocator and subsequently freed by this allocator.
+  // As a result of the above invariants, an allocated memory pointer cannot
+  // be in available_map_ unless it is in allocation_map_ as well.
+  ska::flat_hash_map<size_t, c10::SmallVector<void*, 16>> available_map_;
+  static ska::flat_hash_map<void*, size_t> allocation_map_;
+  // Since allocation_map_, which is a global instance, is mutated/read via
+  // all public APIs, we need a global mutex.
+  static std::mutex mutex_;
+  inline void* allocate_and_cache(const size_t bytes);
+  void free_cached();
+
+ public:
+  static void record_free(void* ptr);
+  // Checks the cache to see if an allocation of size bytes can be found.
+  // If so, returns the cached memory; otherwise allocates new memory,
+  // records it for caching, and returns it.
+  void* allocate(const size_t bytes);
+  // Checks if the memory being freed was marked for allocation by
+  // an earlier call to allocate. If so, caches the allocation.
+  // Otherwise frees it.
+  void free(void* ptr);
+  // Mainly for testing.
+  ~CPUCachingAllocator();
+};
+
+CPUCachingAllocator* GetDefaultCPUCachingAllocator();
+
+bool ThreadLocalCachingAllocatorEnabled();
+CPUCachingAllocator* GetThreadLocalCachingAllocator();
+
+/*
+ * Usage pattern:
+ * std::unique_ptr<c10::CPUCachingAllocator> caching_allocator =
+ *   std::make_unique<c10::CPUCachingAllocator>();
+ * {
+ *   WithCPUCachingAllocatorGuard guard(caching_allocator.get());
+ *   ...
+ * }
+ */
+class C10_API WithCPUCachingAllocatorGuard {
+ public:
+  WithCPUCachingAllocatorGuard(CPUCachingAllocator* allocator);
+  ~WithCPUCachingAllocatorGuard();
+
+ private:
+  CPUCachingAllocator* prev_caching_allocator_ptr_{nullptr};
+};
+
+} // namespace c10
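Aside (not part of the patch): because the guard saves the previous thread-local
pointer in its constructor and restores it in its destructor, guards can nest,
and leaving the outermost scope returns the thread to the plain
c10::alloc_cpu()/free_cpu() path. A small sketch, assuming a C10_MOBILE build:

  c10::CPUCachingAllocator outer_cache;
  c10::CPUCachingAllocator inner_cache;
  {
    c10::WithCPUCachingAllocatorGuard outer(&outer_cache);
    // Allocations on this thread are now routed to outer_cache.
    {
      c10::WithCPUCachingAllocatorGuard inner(&inner_cache);
      // Allocations here are routed to inner_cache.
    }
    // inner's destructor restored outer_cache as the active allocator.
  }
  // outer's destructor restored the default (non-caching) behavior.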
diff --git a/torch/csrc/jit/mobile/observer.h b/torch/csrc/jit/mobile/observer.h
index 8d06edf0e82..dc80e30db1c 100644
--- a/torch/csrc/jit/mobile/observer.h
+++ b/torch/csrc/jit/mobile/observer.h
@@ -36,6 +36,30 @@ class MobileDebugInfo : public c10::DebugInfoBase {
 private:
  std::string model_name_;
  std::string method_name_;
+ // TODO: Kimish
+ // If we launch a thread, such as for at::launch or interpreter continuation,
+ // and the caching allocator is enabled in the base thread, then in order to
+ // propagate this information (that the caching allocator is enabled) across
+ // thread boundaries we can use the mechanism provided by ThreadLocalDebugInfo.
+ // Once the thread local MobileDebugInfo is accessible in the launched
+ // thread, it can be accessed in that thread, and that thread can set
+ // its own thread local CachingAllocatorInfo.
+ // However, we cannot expect every launched thread to extract and set
+ // its own thread local copy of CachingAllocatorInfo.
+ // But this can be done in the lite interpreter, where the run method can do
+ // info =
+ //   c10::ThreadLocalDebugInfo::get(c10::DebugInfoKind::MOBILE_RUNTIME_INFO)
+ //     .get_caching_allocator_info();
+ // GetThreadLocalCachingAllocatorInfo() = info;
+ // Another option is to have MobileDebugInfo itself be the place where the
+ // thread local copy of CachingAllocatorInfo is stored. Then
+ // DefaultMobileCPUAllocator inspects it to decide whether to use the
+ // CachingAllocator. However, the current lite interpreter does not support
+ // FORK, so the run method of the lite interpreter is not really going to
+ // launch another instance of the lite interpreter in a different thread.
+ // So for now we are not bothering with passing CachingAllocatorInfo across
+ // thread boundaries.
+ c10::CachingAllocatorInfo caching_allocator_info;
  size_t op_idx_ = 0;
 };
 
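Aside (not part of the patch): a sketch of the propagation flow the TODO above
describes. get_caching_allocator_info() and GetThreadLocalCachingAllocatorInfo()
do not exist in this patch, and the ThreadLocalDebugInfo call is written as in
the comment, so treat the whole block as pseudo-code for the proposed mechanism
rather than working API usage:

  // Inside the lite interpreter's run method, on a freshly launched thread:
  void propagate_caching_allocator_info() {
    // Fetch the MobileDebugInfo attached by the parent thread (hypothetical cast).
    auto* info = static_cast<MobileDebugInfo*>(
        c10::ThreadLocalDebugInfo::get(c10::DebugInfoKind::MOBILE_RUNTIME_INFO));
    if (info != nullptr) {
      // Copy the parent's CachingAllocatorInfo into this thread's own slot so
      // DefaultMobileCPUAllocator can consult it (hypothetical accessors).
      GetThreadLocalCachingAllocatorInfo() = info->get_caching_allocator_info();
    }
  }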