pytorch/c10/core/DispatchKey.h

#pragma once

#include <iostream>
#include <string>
#include <c10/macros/Macros.h>

namespace c10 {

// Semantically, a dispatch key identifies a possible "level" in our
// dispatch, for which a handler may be registered.  Traditional
// backends like CPU and CUDA get dispatch keys; however, so do
// "wrapping" layers like Variable (for autograd handling).
//
// In implementation terms, the dispatch key identifies a specific "bit" in a
// DispatchKeySet.  Higher bit indexes get handled by dispatching first (because
// we "count leading zeros" when we extract the highest priority dispatch
// key.)
enum class DispatchKey : uint8_t {

  // ~~~~~~~~~~~~~~~~~~~~~~~~~~ UNDEFINED ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ //
  // This is not a "real" tensor id, but it exists to give us a "nullopt"
  // element we can return for cases when a DispatchKeySet contains no elements.
  // You can think a more semantically accurate definition of DispatchKey is:
  //
  //    using DispatchKey = optional<RealDispatchKey>
  //
  // and Undefined == nullopt.  We didn't actually represent
  // it this way because optional<RealDispatchKey> would take two
  // words, when DispatchKey fits in eight bits.

  Undefined = 0,

  // Define an alias for Undefined to represent CatchAll (long term
  // this will get eliminated, but for now it's convenient)
  CatchAll = Undefined,

  // ~~~~~~~~~~~~~~~~~~~~~~~~~~ BACKENDS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ //
  // A "backend" is colloquially used to refer to handlers for dispatch
  // which actually implement the numerics of an operation in question.
  //
  // Due to the nature of the enum, these backends are specified in
  // an ordered way, but for most backends this order is not semantically
  // meaningful (e.g., it's valid to reorder these backends without changing
  // semantics).  The only situation when backend ordering is meaningful
  // is when the backend participates in multiple dispatch with another
  // backend; e.g., CPU and SparseCPU (sparse must have
  // higher priority).

  // Here are backends which you think of as traditionally specifying
  // how to implement operations on some device.
  CPU, // registered at build/aten/src/ATen/CPUType.cpp
  CUDA, // registered at build/aten/src/ATen/CUDAType.cpp
  HIP, // NB: I think this is not actually used, due to Note [Masquerading as
       // CUDA]
  FPGA, // Xilinx support lives out of tree at https://gitlab.com/pytorch-complex/vitis_kernels
  MSNPU, // unused externally, but tested at
         // test/cpp_extensions/msnpu_extension.cpp
  XLA, // lives out of tree at https://github.com/pytorch/xla
  Vulkan,

  // These are Caffe2 device types which we grandfathered into
  // DispatchKey.
  // TODO: Caffe2-only DispatchKeys actually should be removed from this enum
  // and just simply be undispatchable.
  MKLDNN, // (MKLDNN is treated as another "device" in Caffe2)
  OpenGL,
  OpenCL,
  IDEEP,

  // Here are backends which specify more specialized operators
  // based on the dtype of the tensor.
  QuantizedCPU, // registered at build/aten/src/ATen/QuantizedCPUType.cpp
  QuantizedCUDA, // registered at build/aten/src/ATen/QuantizedCUDAType.cpp
  ComplexCPU, // lives out of tree at
              // https://gitlab.com/pytorch-complex/pytorch-cpu-strided-complex
  ComplexCUDA, // and
               // https://gitlab.com/pytorch-complex/pytorch-cuda-strided-complex
  // tested at test/cpp_extensions/complex_registration_extension.cpp
  // TODO: Remove Complex dispatch keys when Complex is moved in tree

  // This backend is to support custom RNGs; it lets you go
  // to a different kernel if you pass in a generator that is not a
  // traditional CPUGeneratorImpl/CUDAGeneratorImpl.  To make use of this
  // key:
  //  1) set it as a second parameter of at::Generator constructor call in
  //     the user-defined PRNG class.
  //  2) use it as a dispatch key while registering custom kernels
  //     (templatized kernels specialized for user-defined PRNG class)
  // intended for out of tree use; tested by aten/src/ATen/test/rng_test.cpp
  CustomRNGKeyId,

  // Here are backends which specify more specialized operators
  // based on the layout of the tensor.  Note that the sparse backends
  // are one case where ordering matters: sparse multi-dispatches with
  // the corresponding dense tensors, and must be handled before them.
  MkldnnCPU, // registered at build/aten/src/ATen/MkldnnCPUType.cpp
  // NB: not to be confused with MKLDNN, which is Caffe2 only
  SparseCPU, // registered at build/aten/src/ATen/SparseCPUType.cpp
  SparseCUDA, // registered at build/aten/src/ATen/SparseCUDAType.cpp
  SparseHIP, // TODO: I think this is not actually used, due to Note
             // [Masquerading as CUDA]

  // Here are reserved backends for user-defined backends, see Note [Private use
  // DispatchKey]
  // To see some example about how to use this, check out MSNPU
  PrivateUse1,
  PrivateUse2,
  PrivateUse3,

  // The meta function characterizes how an operation affects the metadata of a
  // tensor (shape, dtype) without doing any of the actual computation.  A
  // meta tensor can be used to dry run operators without actually doing
  // any computation, e.g., add on two meta tensors would give you another
  // meta tensor with the output shape and dtype, but wouldn't actually
  // add anything.  A meta implementation typically would look something like:
  //
  //  Tensor meta::add(const Tensor& self, const Tensor& other) {
  //    TORCH_CHECK(self.size().equals(other.size()));
  //    return at::empty_like(self, self.size());
  //  }
  //
  // The meta function would get invoked if you ran an operator passing
  // in meta tensors.  The call stack in such a case would look something like
  // this:
  //
  //  at::add(x: Meta, y: Meta) {
  //    return [dispatch] meta::add(x: Meta, y: Meta) {
  //      output_shape = ...
  //      [dispatch] meta::empty(output_shape) {
  //        return ... meta tensor with output_shape but no data allocated ...
  //      }
  //    }
  //  }
  //
  // Meta functions have an important secondary function, which is they can
  // be used as tensor "allocators".  A typical backend implementation should
  // be implemented in this way:
  //
  //  Tensor cpu::add(const Tensor& self, const Tensor& other) {
  //    Tensor result = meta::add(self, other);
  //    // ... do the actual computation into result ...
  //    return result;
  //  }
  //
  // In this case, the internal at::empty_like invocation would dispatch to the
  // CPU factory function, not the meta factory function.  The call stack in
  // this case looks like:
  //
  //  at::add(x: CPU, y: CPU) {
  //    return [dispatch] cpu::add(x: CPU, y: CPU) {
  //      output = [direct] meta::add(x: CPU, y: CPU) {
  //        output_shape = ...
  //        [dispatch] cpu::empty(output_shape)
  //      }
  //      ... compute on output ...
  //      return output;
  //    }
  //  }
  //
  Meta,

  // In some situations, it is not immediately obvious what the correct
  // backend for function is, because the function in question doesn't
  // have any "tensor" arguments.  In this case, a BackendSelect function
  // can be registered to implement the custom determination of the
  // correct backend.
  BackendSelect,

  // The named dispatch key is set for any tensors with named dimensions.
  // Although we have a dispatch key for named tensors, for historical reasons,
  // this dispatch key doesn't do any of the substantive functionality for named
  // tensor (though, hypothetically, it could!)  At the moment, it's just
  // responsible for letting us give good error messages when operations
  // don't support named tensors.
  //
  // NB: If you ever consider moving named tensor functionality into
  // this dispatch key, note that it might be necessary add another dispatch
  // key that triggers before composite operators, in case a composite operator
  // has named dimension propagation that doesn't match that of its
  // constituent parts.
  Named,

  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~ AUTOGRAD ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ //
  // All backends are oblivious to autograd; autograd is handled as a
  // layer which happens on top of all backends.  It inspects the autograd
  // metadata of all inputs, determines what autograd metadata should be
  // constructed by the output, and otherwise defers to the backend to
  // actually do the numeric computation.  Autograd contains
  // the bulk of this logic.
  Autograd,

  Profiler,

  Tracer,

  // Pre-autograd dispatch keys allow backends to override the autograd behavior
  // (aka Autograd) for operators which have a Variable kernel
  // already registered.  For example, XLA wants to define autograd for
  // einsum directly.  Registering a custom autograd implementation at the
  // XLA key won't work because we process Autograd
  // before XLA.  This key has higher priority and gets processed
  // first.  You generally should NOT redispatch after handling autograd
  // here (since that would result in execution of the Autograd
  // operator, which you're trying to skip).  In PreAutograd implementations,
  // you are responsible for handling autograd yourself, or deferring to other
  // operators which support autograd.
  XLAPreAutograd,

  // Autocasting precedes VariableTypeId, to ensure casts are autograd-exposed
  // and inputs are saved for backward in the post-autocast type.
  Autocast,

  // Here are some reserved pre-autograd keys for user-defined backends, see
  // Note [Private use DispatchKey]
  PrivateUse1_PreAutograd,
  PrivateUse2_PreAutograd,
  PrivateUse3_PreAutograd,

  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~ WRAPPERS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ //
  // There are a number of alternative modes which may want to handle before
  // autograd; for example, error checking, tracing, profiling or vmap.  They
  // go here.

  // This is the dispatch key for BatchedTensorImpl, which is used to implement
  // batching rules for vmap.
  Batched,

  // TESTING: This is intended to be a generic testing tensor type id.
  // Don't use it for anything real; its only acceptable use is within a single
  // process test.  Use it by creating a TensorImpl with this DispatchKey, and
  // then registering operators to operate on this type id.  See
  // aten/src/ATen/core/dispatch/backend_fallback_test.cpp for a usage example.
  TESTING_ONLY_GenericWrapper,

  // TESTING: This is intended to be a generic testing tensor type id.
  // Don't use it for anything real; its only acceptable use is within a ingle
  // process test.  Use it by toggling the mode on and off via
  // TESTING_ONLY_tls_generic_mode_set_enabled and then registering operators
  // to operate on this type id.  See
  // aten/src/ATen/core/dispatch/backend_fallback_test.cpp
  // for a usage example
  TESTING_ONLY_GenericMode,

  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ FIN ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ //
  NumDispatchKeys, // Sentinel

  // ~~~~~~~~~~~~~~~~~~~~~~~~~ BC ALIASES ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ //
  // The aliases exist for backwards compatibility reasons, they shouldn't
  // be used
  CPUTensorId = CPU,
  CUDATensorId = CUDA,
};

// Note [Private use DispatchKey]
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~
// Private use tensor IDs are preallocated tensor type IDs for use in user
// applications.  Similar to private use fields in HTTP, they can be used
// by end users for experimental or private applications, without needing
// to "standardize" the tensor ID (which would be done by submitting a PR
// to PyTorch to add your type ID).
//
// Private use tensor IDs are appropriate to use if you want to experiment
// with adding a new tensor type (without having to patch PyTorch first) or
// have a private, non-distributed application that needs to make use of a
// new tensor type.  Private use tensor IDs are NOT appropriate to use for
// libraries intended to be distributed to further users: please contact
// the PyTorch developers to get a type ID registered in this case.
//
// We provide two classes of private user tensor id: regular DispatchKeys
// and PreAutograd DispatchKeys.  DispatchKeys serve the role of ordinary "backend"
// DispatchKeys; if you were adding support for a new type of accelerator, you
// would use a DispatchKey, and reuse autograd definitions already defined in
// PyTorch for operators you define.  PreAutograd DispatchKeys serve as "wrapper"
// DispatchKeys: they are most appropriate for tensors that compose multiple
// internal tensors, and for cases when the built-in autograd formulas for
// operators are not appropriate.

static_assert(
  static_cast<uint8_t>(DispatchKey::NumDispatchKeys) < 64,
  "DispatchKey is used as index into 64-bit bitmask; you must have less than 64 entries");

C10_API const char* toString(DispatchKey);
C10_API std::ostream& operator<<(std::ostream&, DispatchKey);

// These are some convenience identifiers for dispatch keys which are
// shorter to type than their long counterparts.  Note that some of these
// dispatch keys directly correspond to DeviceType; and most APIs that
// accept DispatchKey also accept DeviceType; e.g.,
// torch::dispatch(torch::kCPU, ...) is also valid.
constexpr DispatchKey kAutograd = DispatchKey::Autograd;

} // namespace c10

namespace torch {
  // Expose the constant, but not the TYPE (DispatchKey is an implementation
  // detail!)
  using c10::kAutograd;
}

// NB: You really shouldn't use this instance; this enum is guaranteed
// to be pretty small so a regular array should be acceptable.
namespace std {
template <>
struct hash<c10::DispatchKey> {
  typedef size_t result_type;
  typedef c10::DispatchKey argument_type;

  size_t operator()(c10::DispatchKey x) const {
    return static_cast<size_t>(x);
  }
};
}