mirror of https://github.com/zebrajr/tensorflow.git
synced 2025-12-06 12:20:11 +01:00

commit 1ddcd859d3 (parent c9d8d37611)

Move absl_thread_pool to XLA as YnnThreadpool

PiperOrigin-RevId: 820544939
third_party/xla/MODULE.bazel (vendored, 2 lines changed)
@@ -33,6 +33,7 @@ bazel_dep(name = "grpc-java", version = "1.75.0", repo_name = "DO_NOT_USE_grpc_j
# TODO: publish an official version of rules_ml_toolchain to BCR
bazel_dep(name = "rules_ml_toolchain")

# To calculate integrity:
# wget -O temp_module_archive.tar.gz <archive URL>
# HASH=$(openssl dgst -sha256 -binary temp_module_archive.tar.gz | openssl base64 -A)
@@ -101,6 +102,7 @@ use_repo(
    "pthreadpool",
    "rocm_device_libs",
    "shardy",
    "slinky",
    "stablehlo",
    "triton",
)
@@ -30,6 +30,7 @@ load("//third_party/rmm:workspace.bzl", rmm = "repo")
load("//third_party/robin_map:workspace.bzl", robin_map = "repo")
load("//third_party/rocm_device_libs:workspace.bzl", rocm_device_libs = "repo")
load("//third_party/shardy:workspace.bzl", shardy = "repo")
load("//third_party/slinky:workspace.bzl", slinky = "repo")
load("//third_party/spdlog:workspace.bzl", spdlog = "repo")
load("//third_party/stablehlo:workspace.bzl", stablehlo = "repo")
load("//third_party/tensorrt:workspace.bzl", tensorrt = "repo")
@@ -68,6 +69,7 @@ def _third_party_ext_impl(mctx):  # @unused
    robin_map()
    rocm_device_libs()
    shardy()
    slinky()
    spdlog()
    stablehlo()
    tensorrt()
third_party/xla/xla/backends/cpu/BUILD (vendored, 26 lines changed)
@@ -205,6 +205,32 @@ xla_cc_test(
    ],
)

xla_cc_test(
    name = "ynn_threadpool_test",
    srcs = ["ynn_threadpool_test.cc"],
    deps = [
        ":ynn_threadpool",
        "//xla/tsl/platform:env",
        "@com_google_googletest//:gtest_main",
    ],
)

cc_library(
    name = "ynn_threadpool",
    srcs = ["ynn_threadpool.cc"],
    hdrs = ["ynn_threadpool.h"],
    deps = [
        "@com_google_absl//absl/algorithm:container",
        "@com_google_absl//absl/base:core_headers",
        "@com_google_absl//absl/container:fixed_array",
        "@com_google_absl//absl/log",
        "@com_google_absl//absl/log:check",
        "@com_google_absl//absl/synchronization",
        "@eigen_archive//:eigen3",
        "@slinky//slinky/base:thread_pool",
    ],
)

cc_library(
    name = "constant_allocation",
    srcs = ["constant_allocation.cc"],
third_party/xla/xla/backends/cpu/ynn_threadpool.cc (vendored, new file, 558 lines)

@@ -0,0 +1,558 @@
/* Copyright 2025 The OpenXLA Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "xla/backends/cpu/ynn_threadpool.h"

#include <algorithm>
#include <atomic>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <deque>
#include <optional>
#include <utility>
#include <vector>

#include "absl/algorithm/container.h"
#include "absl/base/optimization.h"
#include "absl/base/thread_annotations.h"
#include "absl/container/fixed_array.h"
#include "absl/log/check.h"
#include "absl/log/log.h"
#include "absl/synchronization/mutex.h"
#include "slinky/base/function_ref.h"
#include "slinky/base/ref_count.h"
#include "slinky/base/thread_pool.h"

#define EIGEN_USE_THREADS
#include "Eigen/ThreadPool"
#include "unsupported/Eigen/CXX11/Tensor"

namespace xla::cpu {

//===----------------------------------------------------------------------===//
// work_queue
//===----------------------------------------------------------------------===//

namespace {

// Forward declare.
class worker;
// A work queue that partitions `num_work_items` work items into
// `num_partitions` partitions processed by parallel workers.
class work_queue {
 public:
  work_queue(size_t num_work_items, size_t num_partitions);

  // Returns the next work item in the given partition. Returns std::nullopt
  // if the partition is complete.
  std::optional<size_t> pop_work_item(size_t partition_index);

  // Returns the partition [begin, end) work item range.
  std::pair<size_t, size_t> partition_range(size_t partition_index) const;

  size_t num_partitions() const { return partitions_.size(); }

  // If the work queue is empty, it means that all work items are being
  // processed by the workers, and the task will be done once all workers
  // complete.
  bool is_empty() const { return empty_.load(std::memory_order_relaxed); }

 private:
  friend class worker;

  // Work items partition tracking the next work item to process.
  struct partition {
    void initialize(size_t begin, size_t end);

    // Tracks index of the next work item in the assigned partition.
    ABSL_CACHELINE_ALIGNED std::atomic<size_t> index;
    size_t begin;
    size_t end;
  };

  void set_empty() { empty_.store(true, std::memory_order_relaxed); }

  absl::FixedArray<partition, 32> partitions_;
  ABSL_CACHELINE_ALIGNED std::atomic<bool> empty_;
};

}  // namespace

void work_queue::partition::initialize(size_t begin, size_t end) {
  index.store(begin, std::memory_order_relaxed);
  this->begin = begin;
  this->end = end;
}

work_queue::work_queue(size_t num_work_items, size_t num_partitions)
    : partitions_(num_partitions), empty_(num_work_items == 0) {
  size_t partition_size = num_work_items / num_partitions;
  size_t rem_work = num_work_items % num_partitions;
  for (size_t i = 0, begin = 0, end = 0; i < num_partitions; ++i, begin = end) {
    end = begin + partition_size + ((i < rem_work) ? 1 : 0);
    partitions_[i].initialize(begin, end);
  }
}

std::optional<size_t> work_queue::pop_work_item(size_t partition_index) {
  DCHECK(partition_index < partitions_.size()) << "Invalid partition index";
  partition& partition = partitions_.data()[partition_index];

  // Check if partition is already empty.
  if (size_t index = partition.index.load(std::memory_order_relaxed);
      ABSL_PREDICT_FALSE(index >= partition.end)) {
    return std::nullopt;
  }

  // Try to acquire the next work item in the partition.
  size_t index = partition.index.fetch_add(1, std::memory_order_relaxed);
  return ABSL_PREDICT_FALSE(index >= partition.end) ? std::nullopt
                                                    : std::make_optional(index);
}

std::pair<size_t, size_t> work_queue::partition_range(
    size_t partition_index) const {
  DCHECK(partition_index < partitions_.size()) << "Invalid partition index";
  return {partitions_[partition_index].begin, partitions_[partition_index].end};
}
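
// Example: work_queue(10, 4) computes partition_size = 10 / 4 = 2 and
// rem_work = 10 % 4 = 2, so the first two partitions get one extra work item
// each and the resulting ranges are [0,3), [3,6), [6,8), [8,10).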
//===----------------------------------------------------------------------===//
// worker
//===----------------------------------------------------------------------===//

namespace {

// Worker processes work items from the work queue starting from the assigned
// work partition. Once the assigned partition is complete it tries to pop
// a work item from the next partition. Once the work queue is empty (the
// worker wraps around to the initial partition) it returns an empty work item.
class worker {
 public:
  worker(size_t partition_index, work_queue* queue);

  std::optional<size_t> pop_work_item();

 private:
  size_t initial_partition_index_;
  size_t partition_index_;
  work_queue* queue_;
};

}  // namespace

worker::worker(size_t partition_index, work_queue* queue)
    : initial_partition_index_(partition_index),
      partition_index_(partition_index),
      queue_(queue) {}

std::optional<size_t> worker::pop_work_item() {
  std::optional<size_t> work = queue_->pop_work_item(partition_index_);
  if (ABSL_PREDICT_TRUE(work)) {
    return work;
  }

  while (!work.has_value() && !queue_->is_empty()) {
    // Wrap around to the first partition.
    if (ABSL_PREDICT_FALSE(++partition_index_ >= queue_->num_partitions())) {
      partition_index_ = 0;
    }

    // We checked all partitions and got back to the partition we started from.
    if (ABSL_PREDICT_FALSE(partition_index_ == initial_partition_index_)) {
      queue_->set_empty();
      break;
    }

    work = queue_->pop_work_item(partition_index_);
  }

  return work;
}
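
// Example: a worker assigned partition 2 of 4 first drains partition 2, then
// steals from partitions 3, 0, and 1 in that order; if it wraps all the way
// back to partition 2, every partition has been visited, so the queue is
// marked empty and std::nullopt is returned.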
//===----------------------------------------------------------------------===//
// task_impl
//===----------------------------------------------------------------------===//

namespace {

// Running a task can result in three states:
//
// kPending:  The task is still being processed by the worker threads.
// kComplete: The caller thread is the one who completed the task.
// kDone:     The task is done and all work items have been processed, however
//            the caller thread didn't process any work items.
//
// We need this state to signal the waiter thread just once, from a thread that
// completed the task.
enum class task_state { kPending, kComplete, kDone };

class task_impl final : public YnnThreadpool::task {
 public:
  task_impl(YnnThreadpool::task_body body, size_t num_work_items,
            size_t num_partitions);

  // Runs this task by processing work items in the current thread.
  task_state run();

  int64_t num_workers() const;
  bool is_empty_work_queue() const;
  bool done() const final;

 private:
  YnnThreadpool::task_body body_;
  work_queue work_queue_;

  ABSL_CACHELINE_ALIGNED std::atomic<size_t> worker_index_;
  ABSL_CACHELINE_ALIGNED std::atomic<size_t> pending_work_items_;
};

}  // namespace

task_impl::task_impl(YnnThreadpool::task_body body, size_t num_work_items,
                     size_t num_partitions)
    : body_(std::move(body)),
      work_queue_(num_work_items, num_partitions),
      worker_index_(0),
      pending_work_items_(num_work_items) {}

task_state task_impl::run() {
  // If we have more workers joining the task than the number of partitions,
  // then we have to wrap around to the first partition.
  size_t worker_index = worker_index_.fetch_add(1, std::memory_order_relaxed);
  if (ABSL_PREDICT_FALSE(worker_index >= work_queue_.num_partitions())) {
    worker_index %= work_queue_.num_partitions();
  }

  // Each worker processes the body using its own copy of the task body.
  worker w(worker_index, &work_queue_);
  size_t num_processed_work_items = 0;

  if (std::optional<size_t> item = w.pop_work_item()) {
    YnnThreadpool::task_body body = body_;

    do {
      body(*item);
      ++num_processed_work_items;
    } while ((item = w.pop_work_item()).has_value());
  }

  // The number of pending work items should never go below zero.
  size_t previous_work_items = pending_work_items_.fetch_sub(
      num_processed_work_items, std::memory_order_acq_rel);
  DCHECK_GE(previous_work_items, num_processed_work_items);

  // Task is done if we have no more work items to process. Task is complete if
  // we are the one who processed the last work item.
  bool is_done = previous_work_items == num_processed_work_items;
  bool is_complete = is_done && num_processed_work_items > 0;

  return is_complete ? task_state::kComplete
         : is_done   ? task_state::kDone
                     : task_state::kPending;
}
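
// Example of the three outcomes: the worker whose fetch_sub() observes that
// it processed the last remaining work items returns kComplete and signals
// any waiters exactly once. A thread that joins after the queue is already
// drained processes 0 items, observes 0 pending, and returns kDone. A worker
// that finishes while other workers still hold unprocessed items returns
// kPending.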

int64_t task_impl::num_workers() const {
  return worker_index_.load(std::memory_order_relaxed);
}

bool task_impl::is_empty_work_queue() const { return work_queue_.is_empty(); }

bool task_impl::done() const {
  return pending_work_items_.load(std::memory_order_acquire) == 0;
}
//===----------------------------------------------------------------------===//
// YnnThreadpool::impl
//===----------------------------------------------------------------------===//

// We keep a stack of tasks that are currently being processed by the current
// thread, to avoid recursive calls.
static thread_local std::vector<const task_impl*> task_stack;  // NOLINT

class YnnThreadpool::impl : public slinky::ref_counted<impl> {
 public:
  explicit impl(Eigen::ThreadPoolInterface* threadpool);

  // Work on the single task and return the state of the task.
  task_state work_on_task(task_impl* task);

  // Work on all tasks in the queue. Returns when it runs out of tasks to
  // process.
  void work_on_tasks(const absl::Condition& condition);

  // Enqueues a new task into the queue and returns a reference to it.
  slinky::ref_count<task_impl> enqueue(YnnThreadpool::task_body body,
                                       size_t num_work_items,
                                       size_t num_partitions);

  void await(const absl::Condition& condition);

  void atomic_call(slinky::function_ref<void()> t);

  // Returns true if we can schedule more workers into the underlying scheduler.
  bool can_schedule_workers() const;

  // Schedules the given number of workers for the given task. Worker scheduling
  // uses recursive work splitting and early exit if the task does not need any
  // more workers, or if we reached the maximum number of scheduled workers.
  void schedule_workers(int64_t num_workers, slinky::ref_count<task_impl> task);

  size_t thread_count() const { return thread_count_; }

 private:
  friend class slinky::ref_counted<impl>;
  static void destroy(impl* ptr) { delete ptr; }

  // The state of work scheduling for a given task.
  struct schedule_state : public slinky::ref_counted<schedule_state> {
    schedule_state(int64_t remaining_workers, slinky::ref_count<task_impl> task,
                   slinky::ref_count<impl> impl)
        : remaining_workers(remaining_workers),
          task(std::move(task)),
          impl(std::move(impl)) {}

    static void destroy(schedule_state* ptr) { delete ptr; }

    std::atomic<int64_t> remaining_workers;
    slinky::ref_count<task_impl> task;
    slinky::ref_count<impl> impl;
  };

  // Worker scheduling function for the underlying scheduler.
  template <bool release_impl_ref>
  static void schedule_workers(schedule_state* context);

  // Dequeues a pending task from the queue.
  slinky::ref_count<task_impl> dequeue();

  // Signals all waiter threads waiting on the waiter mutex.
  void signal_waiters();

  Eigen::ThreadPoolInterface* threadpool_;
  size_t thread_count_;

  std::deque<slinky::ref_count<task_impl>> tasks_ ABSL_GUARDED_BY(tasks_mutex_);

  // A mutex for guarding mutable state accessed concurrently.
  ABSL_CACHELINE_ALIGNED absl::Mutex tasks_mutex_;

  // A mutex for signalling threads waiting on the tasks or conditions.
  ABSL_CACHELINE_ALIGNED absl::Mutex waiter_mutex_;
};

YnnThreadpool::impl::impl(Eigen::ThreadPoolInterface* threadpool)
    : threadpool_(threadpool),
      thread_count_(threadpool_ ? threadpool_->NumThreads() : 0) {}

slinky::ref_count<task_impl> YnnThreadpool::impl::enqueue(
    YnnThreadpool::task_body body, size_t num_work_items,
    size_t num_partitions) {
  slinky::ref_count<task_impl> task(
      new task_impl(std::move(body), num_work_items, num_partitions));

  absl::MutexLock lock(tasks_mutex_);
  return tasks_.emplace_back(std::move(task));
}

slinky::ref_count<task_impl> YnnThreadpool::impl::dequeue() {
  absl::MutexLock lock(tasks_mutex_);

  for (auto i = tasks_.begin(); i != tasks_.end();) {
    slinky::ref_count<task_impl>& task = *i;

    // Task doesn't have any more work items to process.
    if (ABSL_PREDICT_FALSE(task->is_empty_work_queue())) {
      i = tasks_.erase(i);
      continue;
    }

    // Don't run the same task multiple times on the same thread.
    if (ABSL_PREDICT_FALSE(absl::c_contains(task_stack, &*task))) {
      ++i;
      continue;
    }

    return task;
  }

  return nullptr;
}

task_state YnnThreadpool::impl::work_on_task(task_impl* task) {
  DCHECK(absl::c_find(task_stack, task) == task_stack.end());

  task_stack.push_back(task);
  task_state state = task->run();
  task_stack.pop_back();

  // If we are the one who completed the task, we signal the waiters to wake up
  // any threads that are waiting for the task completion. If the task was
  // completed by another worker, we do nothing to avoid the cost of waking up
  // the same thread multiple times.
  if (ABSL_PREDICT_FALSE(state == task_state::kComplete)) {
    signal_waiters();
  }

  return state;
}

void YnnThreadpool::impl::work_on_tasks(const absl::Condition& condition) {
  while (slinky::ref_count<task_impl> task = dequeue()) {
    work_on_task(&*task);

    if (ABSL_PREDICT_TRUE(condition.Eval())) {
      return;
    }
  }
}

void YnnThreadpool::impl::await(const absl::Condition& condition) {
  if (ABSL_PREDICT_FALSE(!condition.Eval())) {
    absl::MutexLock lock(waiter_mutex_);
    waiter_mutex_.Await(condition);
  }
}

void YnnThreadpool::impl::signal_waiters() {
  // Acquiring and releasing `waiter_mutex_` is enough to wake the waiters:
  // absl::Mutex re-evaluates `Await()` conditions when the mutex is released.
  absl::MutexLock lock(waiter_mutex_);
}

void YnnThreadpool::impl::atomic_call(slinky::function_ref<void()> t) {
  absl::MutexLock lock(waiter_mutex_);
  t();
}

bool YnnThreadpool::impl::can_schedule_workers() const {
  // One reference is owned by the parent YnnThreadpool, every other
  // reference is owned by a worker scheduled into the underlying scheduler.
  return ref_count() < 1 + thread_count();
}

void YnnThreadpool::impl::schedule_workers(int64_t num_workers,
                                           slinky::ref_count<task_impl> task) {
  if (ABSL_PREDICT_TRUE(num_workers > 0 && can_schedule_workers())) {
    slinky::ref_count<schedule_state> state(
        new schedule_state(num_workers - 1, std::move(task), {this}));
    threadpool_->Schedule([state = state.take()]() {
      schedule_workers</*release_impl_ref=*/false>(state);
    });
  }
}

template <bool release_impl_ref>
void YnnThreadpool::impl::schedule_workers(schedule_state* context) {
  auto state = slinky::ref_count<schedule_state>::assume(context);

  // We recursively keep scheduling workers into the underlying scheduler.
  // This is more efficient than scheduling them sequentially from a single
  // thread, because workers can start processing the task sooner and we
  // distribute thread wake-ups evenly across underlying threads.
  static constexpr int32_t kNumRecursiveWorkers = 2;

  for (size_t i = 0; i < kNumRecursiveWorkers; ++i) {
    bool schedule_worker =
        state->impl->can_schedule_workers() &&
        !state->task->is_empty_work_queue() &&
        state->remaining_workers.fetch_sub(1, std::memory_order_relaxed) > 0;

    if (ABSL_PREDICT_TRUE(!schedule_worker)) {
      break;
    }

    // Add +1 reference to account for the scheduled worker, as we use the
    // `impl` reference count to track the number of active workers.
    state->impl->add_ref();
    state->impl->threadpool_->Schedule(
        [state = slinky::ref_count<schedule_state>(state).take()]() {
          YnnThreadpool::impl::schedule_workers</*release_impl_ref=*/true>(
              state);
        });
  }

  // Keep processing tasks from the queue until we are out of tasks.
  static constexpr bool kFalse = false;
  state->impl->work_on_tasks(absl::Condition(&kFalse));

  // One `impl` reference is implicitly owned by the `state`; every additional
  // reference is added and released explicitly by the worker task.
  if constexpr (release_impl_ref) {
    state->impl->release();
  }
}
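
// Example of the resulting fan-out: schedule_workers(7, task) enqueues a
// single root worker with remaining_workers = 6; the root schedules up to
// kNumRecursiveWorkers = 2 more workers before it starts processing tasks,
// and each scheduled worker does the same, so the 7 thread wake-ups are
// issued as a small tree rather than sequentially from the caller.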

//===----------------------------------------------------------------------===//
// YnnThreadpool
//===----------------------------------------------------------------------===//

YnnThreadpool::YnnThreadpool(Eigen::ThreadPoolDevice* device)
    : impl_(new impl(device ? device->getPool() : nullptr)) {}

YnnThreadpool::YnnThreadpool(Eigen::ThreadPoolInterface* threadpool)
    : impl_(new impl(threadpool)) {}

YnnThreadpool::~YnnThreadpool() = default;

slinky::ref_count<YnnThreadpool::task> YnnThreadpool::enqueue(
    size_t n, task_body t, int32_t max_workers) {
  CHECK_GE(max_workers, n);

  // Don't create more partitions than the number of threads. Also make sure
  // that we have at least one partition (if we don't have a scheduler).
  size_t num_partitions = std::min<size_t>(n, thread_count());
  num_partitions = std::max<size_t>(1, num_partitions);

  auto task = impl_->enqueue(std::move(t), n, num_partitions);

  // If we don't have any worker threads, we return a task to the caller, and
  // assume that the caller will wait on it.
  if (ABSL_PREDICT_FALSE(impl_->thread_count() == 0)) {
    return task;
  }

  // We assume that the caller will immediately start working on the task, so
  // we need to schedule workers only for the remaining number of partitions.
  impl_->schedule_workers(/*num_workers=*/num_partitions - 1, task);

  return task;
}

void YnnThreadpool::wait_for(task* t) {
  task_impl* task = static_cast<task_impl*>(t);
  task_state state = impl_->work_on_task(task);

  // If the task is complete or done, we are immediately done with waiting.
  if (ABSL_PREDICT_TRUE(state == task_state::kComplete ||
                        state == task_state::kDone)) {
    return;
  }

  // Switch to the work stealing mode and work on other tasks in the queue
  // until the given task is done.
  impl_->work_on_tasks(absl::Condition(task, &task_impl::done));
  impl_->await(absl::Condition(task, &task_impl::done));
}

void YnnThreadpool::wait_for(predicate_ref condition) {
  impl_->work_on_tasks(absl::Condition(&condition));
  impl_->await(absl::Condition(&condition));
}

void YnnThreadpool::atomic_call(slinky::function_ref<void()> t) {
  impl_->atomic_call(t);
}

int YnnThreadpool::thread_count() const { return impl_->thread_count(); }

}  // namespace xla::cpu
third_party/xla/xla/backends/cpu/ynn_threadpool.h (vendored, new file, 61 lines)

@@ -0,0 +1,61 @@
/* Copyright 2025 The OpenXLA Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef XLA_BACKENDS_CPU_YNN_THREADPOOL_H_
#define XLA_BACKENDS_CPU_YNN_THREADPOOL_H_

#include <cstddef>
#include <cstdint>

#include "slinky/base/function_ref.h"
#include "slinky/base/ref_count.h"
#include "slinky/base/thread_pool.h"

namespace Eigen {
struct ThreadPoolDevice;
class ThreadPoolInterface;
}  // namespace Eigen

namespace xla::cpu {

// An implementation of slinky::thread_pool that uses absl::Mutex for
// synchronization and dispatches work to an Eigen::ThreadPoolInterface.
class YnnThreadpool final : public slinky::thread_pool {
 public:
  explicit YnnThreadpool(Eigen::ThreadPoolDevice* device);
  explicit YnnThreadpool(Eigen::ThreadPoolInterface* threadpool);
  ~YnnThreadpool() final;

  YnnThreadpool(YnnThreadpool&&) = delete;
  YnnThreadpool& operator=(YnnThreadpool&&) = delete;

  slinky::ref_count<task> enqueue(size_t n, task_body t,
                                  int32_t max_workers) final;

  void wait_for(task* t) final;
  void wait_for(predicate_ref condition) final;

  void atomic_call(slinky::function_ref<void()> t) final;

  int thread_count() const final;

 private:
  class impl;
  slinky::ref_count<impl> impl_;
};

}  // namespace xla::cpu

#endif  // XLA_BACKENDS_CPU_YNN_THREADPOOL_H_
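
A minimal usage sketch of this interface (illustrative only, not part of the commit; the function name and 4-thread setup are assumptions): enqueue() partitions the work items and schedules background workers, while wait_for() makes the calling thread join in as a worker before blocking.

    #include <atomic>
    #include <cstddef>

    #include "xla/backends/cpu/ynn_threadpool.h"
    #include "xla/tsl/platform/env.h"
    #include "xla/tsl/platform/threadpool.h"

    // Hypothetical example, not part of this commit.
    void ExampleUsage() {
      // Back the YnnThreadpool with a 4-thread tsl thread pool, as the tests do.
      tsl::thread::ThreadPool threads(tsl::Env::Default(), "example", 4);
      xla::cpu::YnnThreadpool pool(threads.AsEigenThreadPool());

      std::atomic<int> sum{0};

      // Enqueue 1024 work items; workers are scheduled in the background and
      // the caller is expected to start working on the task immediately.
      auto task = pool.enqueue(
          1024, [&](size_t) { sum.fetch_add(1); }, /*max_workers=*/1024);

      // The calling thread processes work items itself (and steals from other
      // pending tasks) until this task is done; afterwards sum == 1024.
      pool.wait_for(&*task);
    }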
third_party/xla/xla/backends/cpu/ynn_threadpool_test.cc (vendored, new file, 99 lines)

@@ -0,0 +1,99 @@
/* Copyright 2025 The OpenXLA Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "xla/backends/cpu/ynn_threadpool.h"

#include <array>
#include <atomic>
#include <cstddef>
#include <cstdint>
#include <vector>

#include <gtest/gtest.h>
#include "xla/tsl/platform/env.h"
#include "xla/tsl/platform/threadpool.h"

namespace Eigen {
class ThreadPoolInterface;
}  // namespace Eigen

namespace xla::cpu {

TEST(YnnThreadpool, inline_scheduling) {
  YnnThreadpool thread_pool(static_cast<Eigen::ThreadPoolInterface*>(nullptr));

  static constexpr size_t size = 10000;

  std::vector<int32_t> data(size, 0);
  auto inc = [&](size_t i) { data[i]++; };

  thread_pool.parallel_for(size, inc);

  std::vector<int32_t> expected(size, 1);
  EXPECT_EQ(data, expected);
}

TEST(YnnThreadpool, single_loop) {
  tsl::thread::ThreadPool test_thread_pool(tsl::Env::Default(), "test", 4);
  YnnThreadpool thread_pool(test_thread_pool.AsEigenThreadPool());

  static constexpr size_t size = 10000;

  std::vector<int32_t> data(size, 0);
  auto inc = [&](size_t i) { data[i]++; };

  thread_pool.parallel_for(size, inc);

  std::vector<int32_t> expected(size, 1);
  EXPECT_EQ(data, expected);
}

TEST(YnnThreadpool, loop_chain) {
  tsl::thread::ThreadPool test_thread_pool(tsl::Env::Default(), "test", 4);
  YnnThreadpool thread_pool(test_thread_pool.AsEigenThreadPool());

  static constexpr size_t size = 10000;

  std::vector<int32_t> data(size, 0);
  auto inc = [&](size_t i) { data[i]++; };

  thread_pool.parallel_for(size, inc);
  thread_pool.parallel_for(size, inc);
  thread_pool.parallel_for(size, inc);
  thread_pool.parallel_for(size, inc);
  thread_pool.parallel_for(size, inc);

  std::vector<int32_t> expected(size, 5);
  EXPECT_EQ(data, expected);
}

TEST(YnnThreadpool, nested_loops) {
  tsl::thread::ThreadPool test_thread_pool(tsl::Env::Default(), "test", 4);
  YnnThreadpool thread_pool(test_thread_pool.AsEigenThreadPool());

  static constexpr size_t size = 100;

  std::array<std::atomic<int32_t>, size> data = {{0}};
  auto inc = [&](size_t i) { data[i]++; };

  thread_pool.parallel_for(
      size, [&](size_t i) { thread_pool.parallel_for(size, inc); });

  for (size_t i = 0; i < size; ++i) {
    EXPECT_EQ(data[i], size);
  }
}

}  // namespace xla::cpu
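
The tests drive everything through parallel_for, which is not declared in ynn_threadpool.h; it presumably comes from the slinky::thread_pool base class. In terms of the overridden methods it amounts to roughly the following (a sketch under that assumption, not the actual slinky implementation):

    // Rough sketch only: run `body` over work items [0, n) and block until all
    // of them have been processed. The real slinky::thread_pool::parallel_for
    // may differ (e.g. in chunking and max_workers policy).
    void ParallelForSketch(xla::cpu::YnnThreadpool& pool, size_t n,
                           slinky::thread_pool::task_body body) {
      auto task = pool.enqueue(n, std::move(body),
                               /*max_workers=*/static_cast<int32_t>(n));
      pool.wait_for(&*task);  // The caller joins as a worker until done.
    }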