| #pragma once |
| |
| #include <c10/core/DeviceType.h> |
| #include <c10/macros/Export.h> |
| #include <cstdint> |
| #include <ostream> |
| #include <string> |
| |
| namespace c10 { |
| |
| // Semantically, each value of BackendComponent identifies a "backend" for our |
| // dispatch. Some functionalities that we may dispatch to are allowed to |
| // register different handlers for each backend. The BackendComponent is then |
| // used to figure out which backend implementation to dispatch to. |
| |
// In implementation terms, the backend component identifies a specific "bit" in
// a DispatchKeySet. The bottom ~15 bits of the DispatchKeySet are
// "BackendComponent" bits, while the remaining upper bits are assigned to
// functionalities. When we encounter a functionality bit that is known to be
// customizable per-backend, we also look at the lower BackendComponent bits
// and take the highest one to determine which backend's implementation to
// use.
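// As a rough sketch of that bit layout (illustrative only; the real logic
// lives in DispatchKeySet.h, and `full_backend_mask` is defined later in
// this file):
//
//   uint64_t repr = ...;  // raw 64-bit representation of a DispatchKeySet
//   uint64_t backend_bits = repr & full_backend_mask;  // low backend bits
//   // If both CPUBit and CUDABit are set in backend_bits, the highest set
//   // bit (CUDABit) wins, so the CUDA implementation is used.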
| |
// WARNING! If you add a new backend component to the end of this list,
// make sure you register it before Meta.
// Meta must be at the end so that the meta key in TLS triggers meta kernels.
// (But you shouldn't: private use keys should have higher precedence than all
// built-in keys.)
| |
| // If you add a new (non-privateuse) backend here, |
| // make sure to add an Autograd<Backend> fallthrough kernel |
| // in aten/src/ATen/core/VariableFallbackKernel.cpp |
| |
| #define C10_FORALL_BACKEND_COMPONENTS(_, extra) \ |
| _(CPU, extra) \ |
| _(CUDA, extra) \ |
| _(HIP, extra) \ |
| _(XLA, extra) \ |
| _(MPS, extra) \ |
| _(IPU, extra) \ |
| _(XPU, extra) \ |
| _(HPU, extra) \ |
| _(VE, extra) \ |
| _(Lazy, extra) \ |
| _(MTIA, extra) \ |
| _(PrivateUse1, extra) \ |
| _(PrivateUse2, extra) \ |
| _(PrivateUse3, extra) \ |
| _(Meta, extra) |
| |
| // WARNING! If we add a new per-backend functionality key that has higher |
| // priority than Autograd, then make sure you update EndOfRuntimeBackendKeys |
| |
| #define C10_FORALL_FUNCTIONALITY_KEYS(_) \ |
| _(Dense, ) \ |
| _(Quantized, Quantized) \ |
| _(Sparse, Sparse) \ |
| _(NestedTensor, NestedTensor) \ |
| _(AutogradFunctionality, Autograd) |
| |
| enum class BackendComponent : uint8_t { |
| |
| // A "backend" is colloquially used to refer to handlers for dispatch |
| // which actually implement the numerics of an operation in question. |
| // |
| // Due to the nature of the enum, these backends are specified in |
| // an ordered way, but for most backends this order is not semantically |
| // meaningful (e.g., it's valid to reorder these backends without changing |
| // semantics). The only situation when backend ordering is meaningful |
| // is when the backend participates in multiple dispatch with another |
// backend; e.g., CPU and CUDA (CUDA must have higher priority).
| |
| // These keys don't correspond to individual kernels. |
| // Instead, they represent the backends that are allowed to override specific |
| // pieces of functionality: |
| // - dense kernels (e.g. DispatchKey::CPU) |
| // - sparse kernels (e.g. DispatchKey::SparseCPU) |
| // - quantized kernels (e.g. DispatchKey::QuantizedCPU) |
| // - autograd kernels (e.g. DispatchKey::AutogradCPU) |
// We reserve space in the runtime operator table for the full cross product of
// [backends in this enum] x [keys below that are explicitly marked as having
// per-backend functionality].
| // |
// A meta tensor is a tensor without any data associated with it. (They
// have also colloquially been referred to as tensors on the "null" device.)
// A meta tensor can be used to dry run operators without actually doing any
// computation, e.g., adding two meta tensors would give you another meta
// tensor with the output shape and dtype, but wouldn't actually add anything.
| |
| InvalidBit = 0, |
| #define DEFINE_BACKEND_COMPONENT(n, _) n##Bit, |
| C10_FORALL_BACKEND_COMPONENTS(DEFINE_BACKEND_COMPONENT, unused) |
| #undef DEFINE_BACKEND_COMPONENT |
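// For reference, the macro expansion above yields, in order:
//
//   CPUBit = 1, CUDABit = 2, HIPBit = 3, ..., PrivateUse3Bit = 14,
//   MetaBit = 15
//
// so each backend component occupies a single bit index, with
// InvalidBit = 0 reserved.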
| |
// Define an alias to represent the end of backend dispatch keys.
// Meta must remain the last backend (see the warning above), so this alias
// should always point at MetaBit.
| EndOfBackendKeys = MetaBit, |
| }; |
| |
| // Semantically, a dispatch key identifies a possible "level" in our |
| // dispatch, for which a handler may be registered. Each handler corresponds |
| // to a type of functionality. |
| // |
| // In implementation terms, the dispatch key identifies a specific "bit" in a |
| // DispatchKeySet. Higher bit indexes get handled by dispatching first (because |
| // we "count leading zeros" when we extract the highest priority dispatch |
| // key.) |
| // |
// Note [DispatchKey Classification]
// This enum actually contains several types of keys, which are explained
// in more detail further down:
// (1) non-customizable backends (e.g. FPGA)
// (2) non-customizable functionalities (e.g. Functionalize)
// (3) functionalities that are customizable per backend
//     (e.g. Dense, Sparse, AutogradFunctionality)
// (4) per-backend instances of customizable functionalities
//     (e.g. CPU, SparseCPU, AutogradCPU)
// (5) alias keys (e.g. CompositeImplicitAutograd)
| // |
| // Of the categories above, it's important to note: |
| // (a) which keys are assigned individual bits in a DispatchKeySet |
| // (b) which keys are assigned individual slots in the runtime operator table |
| // ("Runtime keys") |
| // |
| // (1), (2) and (3) all get their own dedicated bits in the DispatchKeySet. |
| // (1), (2) and (4) all get their own dedicated slots in the runtime operator |
| // table. |
| |
| // See Note [DispatchKeySet Internal Representation] for more details. |
| // |
| // NOTE: Keep the list in sync with `DispatchKey` in torchgen/model.py |
| enum class DispatchKey : uint16_t { |
| |
| // ~~~~~~~~~~~~~~~~~~~~~~~~~~ UNDEFINED ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // |
| // This is not a "real" functionality, but it exists to give us a "nullopt" |
| // element we can return for cases when a DispatchKeySet contains no elements. |
// You can think of a more semantically accurate definition of DispatchKey as:
//
//    using DispatchKey = optional<RealDispatchKey>
//
// and Undefined == nullopt. We didn't actually represent
// it this way because optional<RealDispatchKey> would take two
// words, when DispatchKey fits in sixteen bits.
| |
| Undefined = 0, |
| |
| // Define an alias for Undefined to represent CatchAll (long term |
| // this will get eliminated, but for now it's convenient) |
| CatchAll = Undefined, |
| |
| // ~~~~~~~~~~~~~~~~~~~~~~~~~~ Functionality Keys ~~~~~~~~~~~~~~~~~~~~~~ // |
// Every value in the enum (up to EndOfFunctionalityKeys)
// corresponds to an individual "functionality" that can be dispatched to.
// This is represented in the DispatchKeySet by assigning each of these enum
// values to one of the remaining (64 - len(BackendComponent)) bits.
//
// Most of these functionalities have a single handler assigned to them,
// making them "runtime keys" that map to a single slot in the runtime
// operator table.
| // |
| // A few functionalities are allowed to be customizable per backend. |
| // See [Note: Per-Backend Functionality Dispatch Keys] for details. |
| |
| // See [Note: Per-Backend Functionality Dispatch Keys] |
| Dense, |
| |
// Below are non-extensible backends.
// These are backends that currently don't have their own overrides for
// Autograd/Sparse/Quantized kernels, so we don't waste space in the runtime
// operator table allocating slots for them.
// If any of these backends ever need to customize, e.g., Autograd, then we'll
// need to add a DispatchKey::*Bit for them.
| |
| // TODO: put this in BackendComponents |
| FPGA, // Xilinx support lives out of tree at |
| // https://gitlab.com/pytorch-complex/vitis_kernels |
| |
| // TODO: put this in BackendComponents |
| // ONNX Runtime, lives out of tree at https://github.com/pytorch/ort and |
| // https://github.com/microsoft/onnxruntime, and is also used to test general |
| // backend/extension machinery in the core. cf: |
| // - test/cpp_extensions/ort_extension.cpp |
| // - test/test_torch.py |
| // - aten/src/ATen/test/extension_backend_test.cpp |
| ORT, |
| |
| Vulkan, // TODO: put this in BackendComponents |
| Metal, // TODO: put this in BackendComponents |
| |
| // See [Note: Per-Backend Functionality Dispatch Keys] |
| Quantized, |
| |
// This key is to support custom RNGs; it lets you go
// to a different kernel if you pass in a generator that is not a
// traditional CPUGeneratorImpl/CUDAGeneratorImpl. To make use of this
// key:
// 1) set it as the second parameter of the at::Generator constructor call
//    in the user-defined PRNG class.
// 2) use it as a dispatch key while registering custom kernels
//    (templatized kernels specialized for the user-defined PRNG class).
// Intended for out-of-tree use; tested by aten/src/ATen/test/rng_test.cpp.
| CustomRNGKeyId, |
| |
| // TODO: Make Mkldnn a functionality key, so we can give it Meta |
| // support |
| // Here are backends which specify more specialized operators |
| // based on the layout of the tensor. Note that the sparse backends |
| // are one case where ordering matters: sparse multi-dispatches with |
| // the corresponding dense tensors, and must be handled before them. |
| MkldnnCPU, // registered at build/aten/src/ATen/RegisterMkldnnCPU.cpp |
| // NB: not to be confused with MKLDNN, which is Caffe2 only |
| |
| // See [Note: Per-Backend Functionality Dispatch Keys] |
| Sparse, |
| |
| // TODO: Make SparseCsr a functionality key |
| SparseCsrCPU, |
| SparseCsrCUDA, |
| |
| NestedTensor, |
| |
// In some situations, it is not immediately obvious what the correct
// backend for a function is, because the function in question doesn't
// have any "tensor" arguments. In this case, a BackendSelect function
// can be registered to implement the custom determination of the
// correct backend.
| BackendSelect, |
| |
| Python, |
| |
| // Out-of-core key for Fake Tensor in torchdistx. |
| // See https://pytorch.org/torchdistx/latest/fake_tensor.html |
| // TODO: delete this in favor of Python-implemented fake tensor |
| Fake, |
| // See Note [Out-of-tree vmap+grad prototype]. The purpose of this key |
| // is to insert code after the "autograd subsystem" runs, so this key should |
| // be directly after ADInplaceOrView and all of the autograd keys. |
| FuncTorchDynamicLayerBackMode, |
| |
// Alias and mutation removal.
// If some backends want to opt into only alias removal or only mutation
// removal, we can consider adding separate keys dedicated to those
// individual passes.
// See Note [Functionalization Pass In Core] for details.
| Functionalize, |
| |
// The named dispatch key is set for any tensors with named dimensions.
// Although we have a dispatch key for named tensors, for historical reasons,
// this dispatch key doesn't do any of the substantive functionality for named
// tensors (though, hypothetically, it could!) At the moment, it's just
// responsible for letting us give good error messages when operations
// don't support named tensors.
//
// NB: If you ever consider moving named tensor functionality into
// this dispatch key, note that it might be necessary to add another dispatch
// key that triggers before composite operators, in case a composite operator
// has named dimension propagation that doesn't match that of its
// constituent parts.
| // TODO: delete this once torchdim lands in functorch |
| Named, |
| |
// The Conjugate dispatch key is set for any tensors that need to perform
// conjugation.
// This is implemented at a dispatch level right before any backends run.
Conjugate,

// The Negative dispatch key is set for any tensors that need to perform
// negation.
// This is implemented at a dispatch level right before any backends run.
Negative,
| |
| ZeroTensor, // registered at build/aten/src/ATen/RegisterZeroTensor.cpp |
| |
| // Note [ADInplaceOrView key] |
| // ADInplaceOrView key is used by inplace or view ops to register a kernel |
| // that does additional setup for future autograd computation. |
| // |
// 1. For inplace ops this kernel does a version bump.
// 2. For view ops this kernel does `as_view` setup where we properly set up
//    DifferentiableViewMeta on the view tensors.
//
// For other ops it's a fallthrough kernel, since there's no extra
// work to do.
| // |
| // Note [Dream: skip VariableType kernel when requires_grad=false] |
| // |
| // In an ideal world where we can skip VariableType kernel for inputs |
| // with requires_grad=false, instead of a fallthrough kernel, we'll |
| // register a kernel shown below to all functional ops as well: |
| // torch::Tensor my_functional_op(...) { |
| // { |
| // // Note for every op in VariableType, you need to go through |
| // // `AutoDispatchBelowADInplaceOrView` guard exactly once to add the |
| // // key to TLS excluded set. If you don't go through it at all, |
| // // inplace/view ops called through `at::` inside your backend |
| // // kernel will dispatch to ADInplaceOrView kernels and do a lot |
| // // of extra work. |
| // at::AutoDispatchBelowADInplaceOrView guard; |
| // at::redispatch::my_functional_op(...); |
| // } |
| // } |
// But this work is currently blocked since it adds an extra dispatch
// for all ops and it's non-trivial overhead at the model level (a few
// percent). Thus our current approach takes advantage of the fact that every
// kernel goes through the VariableType kernel first, and pulls the
// `at::AutoDispatchBelowADInplaceOrView` guard for functional ops
// up into the `VariableType` kernel. Thus we only add the extra dispatch
// to view/inplace ops to minimize its perf impact on real models.
| ADInplaceOrView, |
| // Note [Alias Dispatch Key : Autograd] |
| // All backends are oblivious to autograd; autograd is handled as a |
| // layer which happens on top of all backends. It inspects the autograd |
| // metadata of all inputs, determines what autograd metadata should be |
| // constructed by the output, and otherwise defers to the backend to |
| // actually do the numeric computation. Autograd contains |
| // the bulk of this logic. |
| |
// Autograd is now an alias dispatch key which by default maps to all
// backend-specific autograd keys.
// Backend-specific keys allow backends to override the default kernel
// registered to the Autograd key as needed.
// For example, XLA wants to define autograd for einsum directly.
// Registering a custom autograd implementation at the XLA key won't work
// because we process Autograd before XLA: the AutogradXLA key has higher
// priority and gets processed first. You generally should NOT redispatch
// after handling autograd here (since that would result in execution of the
// Autograd operator, which you're trying to skip). In AutogradXLA
// implementations, you are responsible for handling autograd yourself, or
// deferring to other operators which support autograd.
| |
// Currently we only have backend-specific autograd keys for CPU/CUDA/XLA and
// reserved user-defined backends. All other in-tree backends share the
// AutogradOther key. We can add a specific autograd key for those backends
// upon request.
| AutogradOther, |
| |
| // See [Note: Per-Backend Functionality Dispatch Keys] |
| AutogradFunctionality, |
| |
| // NestedTensor is an example of something that isn't a "real backend" |
| // (because it mostly consists of redispatching kernels) |
| // but it would like to override autograd functionality in C++. |
| // We can handle cases like this by adding an extra functionality key |
| // exclusively for handling autograd for NestedTensor. |
// NestedTensor lives out of tree at
// https://github.com/pytorch/nestedtensor.
| AutogradNestedTensor, |
| |
| Tracer, |
| |
| // TODO: make Autocast a functionality key |
| // Autocasting precedes VariableTypeId, to ensure casts are autograd-exposed |
| // and inputs are saved for backward in the post-autocast type. |
| AutocastCPU, |
| AutocastXPU, |
| AutocastIPU, |
| AutocastHPU, |
| AutocastXLA, |
| // AutocastXLA is only being used for TPUs. XLA GPUs continue to use |
| // AutocastCUDA. |
| AutocastCUDA, |
| AutocastPrivateUse1, |
| |
| // ~~~~~~~~~~~~~~~~~~~~~~~~~~~ WRAPPERS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // |
// There are a number of alternative modes which may want to run before
// autograd; for example, error checking, tracing, profiling or vmap. They
// go here.
| |
| FuncTorchBatched, // See Note [Out-of-tree vmap+grad prototype] |
| |
| // Dispatch key for BatchedTensorImpl wrapping a nested tensor. |
| BatchedNestedTensor, |
| |
| FuncTorchVmapMode, // See Note [Out-of-tree vmap+grad prototype] |
| |
| // This is the dispatch key for BatchedTensorImpl, which is used to implement |
| // batching rules for vmap. |
| Batched, |
| |
| // When we are inside a vmap, all tensors dispatch on this key. |
| // See Note: [DispatchKey::VmapMode usage] for more details. |
| VmapMode, |
| |
| FuncTorchGradWrapper, // See Note [Out-of-tree vmap+grad prototype] |
| |
| // Out-of-core key for Deferred Module Initialization in torchdistx. |
| // See https://pytorch.org/torchdistx/latest/deferred_init.html |
| DeferredInit, |
| |
| // Used by Python key logic to know the set of tls on entry to the dispatcher |
| // This kernel assumes it is the top-most non-functorch-related DispatchKey. |
| // If you add a key above, make sure to update the fallback implementation for |
| // this. |
| PythonTLSSnapshot, |
| |
| // This key should be at the very top of the dispatcher |
| FuncTorchDynamicLayerFrontMode, // See Note [Out-of-tree vmap+grad prototype] |
| |
| // TESTING: This is intended to be a generic testing tensor type id. |
| // Don't use it for anything real; its only acceptable use is within a single |
| // process test. Use it by creating a TensorImpl with this DispatchKey, and |
| // then registering operators to operate on this type id. See |
| // aten/src/ATen/core/dispatch/backend_fallback_test.cpp for a usage example. |
| TESTING_ONLY_GenericWrapper, |
| |
| // TESTING: This is intended to be a generic testing tensor type id. |
// Don't use it for anything real; its only acceptable use is within a single
| // process test. Use it by toggling the mode on and off via |
| // TESTING_ONLY_tls_generic_mode_set_enabled and then registering operators |
| // to operate on this type id. See |
| // aten/src/ATen/core/dispatch/backend_fallback_test.cpp |
| // for a usage example |
| TESTING_ONLY_GenericMode, |
| |
| // This key is used for pre-dispatch tracing in make_fx. |
| // It has lower priority than the PythonDispatcher key |
| // because we use the PythonDispatcher to intercept the key from python, |
| // and avoid having to implement it in C++. |
| PreDispatch, |
| |
| // This is a bypass that allows you to skip running the C++ dispatcher |
| // entirely |
| PythonDispatcher, |
| |
| // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ FIN ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // |
| EndOfFunctionalityKeys, // End of functionality keys. |
| |
| // ~~~~~~~~~~~~~~ "Dense" Per-Backend Dispatch keys ~~~~~~~~~~~~~~~~~~~~ // |
| // Here are backends which you think of as traditionally specifying |
| // how to implement operations on some device. |
| |
| #define DEFINE_PER_BACKEND_KEYS_FOR_BACKEND(n, prefix) prefix##n, |
| |
| #define DEFINE_PER_BACKEND_KEYS(fullname, prefix) \ |
| StartOf##fullname##Backends, \ |
| C10_FORALL_BACKEND_COMPONENTS( \ |
| DEFINE_PER_BACKEND_KEYS_FOR_BACKEND, prefix) \ |
| EndOf##fullname##Backends = prefix##Meta, |
| |
| C10_FORALL_FUNCTIONALITY_KEYS(DEFINE_PER_BACKEND_KEYS) |
| |
| #undef DEFINE_PER_BACKEND_KEYS |
| #undef DEFINE_PER_BACKEND_KEYS_FOR_BACKEND |
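// For reference, the macros above expand to one contiguous block of runtime
// keys per customizable functionality, e.g.:
//
//   StartOfDenseBackends, CPU, CUDA, HIP, ..., Meta,
//   EndOfDenseBackends = Meta,
//   StartOfQuantizedBackends, QuantizedCPU, QuantizedCUDA, ...,
//   EndOfQuantizedBackends = QuantizedMeta,
//   ...
//
// with each block's backends in the same order as `BackendComponent`.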
| |
| EndOfRuntimeBackendKeys = EndOfAutogradFunctionalityBackends, |
| |
| // ~~~~~~~~~~~~~~~~~~~~~~ Alias Dispatch Keys ~~~~~~~~~~~~~~~~~~~~~~~~~~ // |
// Note [Alias Dispatch Keys]
// Alias dispatch keys are synthetic dispatch keys which map to multiple
// runtime dispatch keys. Alias keys have precedence, but they are always
// lower precedence than runtime keys. You can register a kernel to an
// alias key; the kernel might then be populated to the mapped runtime keys
// during dispatch table computation.
// If a runtime dispatch key has multiple kernels from alias keys, which
// kernel wins is decided based on the precedence of the alias keys (but
// runtime keys always have precedence over alias keys).
// Alias keys won't be directly called during runtime.
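// As an illustrative sketch (the registration API lives outside this
// header; "my_op" and my_autograd_kernel are hypothetical):
//
//   TORCH_LIBRARY_IMPL(aten, Autograd, m) {
//     m.impl("my_op", my_autograd_kernel);
//   }
//
// would populate that kernel into each runtime autograd key the alias maps
// to (AutogradCPU, AutogradCUDA, ...), except where a higher-precedence
// registration already covers a given runtime key.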
| |
| // See Note [Alias Dispatch Key : Autograd] |
| Autograd, |
| CompositeImplicitAutograd, // registered at |
| // build/aten/src/ATen/RegisterCompositeImplicitAutograd.cpp |
| |
// Note: The alias keyset for FuncTorchBatchedDecomposition is disjoint from
// all other alias keysets, and so precedence order doesn't matter.
| FuncTorchBatchedDecomposition, // registered at |
| // build/aten/src/ATen/RegisterFuncTorchBatchedDecomposition.cpp |
| // Note: The alias keyset for CompositeImplicitAutogradNestedTensor is |
| // disjoint from all other alias keysets |
| CompositeImplicitAutogradNestedTensor, // registered at |
| // build/aten/src/ATen/RegisterCompositeImplicitAutogradNestedTensor.cpp |
| CompositeExplicitAutograd, // registered at |
| // build/aten/src/ATen/RegisterCompositeExplicitAutograd.cpp |
| // See Note [CompositeExplicitAutogradNonFunctional Key] |
| CompositeExplicitAutogradNonFunctional, // registered at |
| // build/aten/src/ATen/RegisterCompositeExplicitAutograd.cpp |
| |
// Define an alias key to represent the end of alias dispatch keys.
// If you add new alias keys at the end of this block, please also update
// EndOfAliasKeys here.
StartOfAliasKeys = Autograd,
EndOfAliasKeys = CompositeExplicitAutogradNonFunctional,
| |
| // ~~~~~~~~~~~~~~~~~~~~~~~~~ BC ALIASES ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // |
// These aliases exist for backwards compatibility reasons; they shouldn't
// be used.
| CPUTensorId = CPU, |
| CUDATensorId = CUDA, |
| DefaultBackend = CompositeExplicitAutograd, |
| PrivateUse1_PreAutograd = AutogradPrivateUse1, |
| PrivateUse2_PreAutograd = AutogradPrivateUse2, |
| PrivateUse3_PreAutograd = AutogradPrivateUse3, |
| Autocast = AutocastCUDA, |
| }; |
| |
| // Note [Private use DispatchKey] |
| // ~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
| // Private use tensor IDs are preallocated tensor type IDs for use in user |
| // applications. Similar to private use fields in HTTP, they can be used |
| // by end users for experimental or private applications, without needing |
| // to "standardize" the tensor ID (which would be done by submitting a PR |
| // to PyTorch to add your type ID). |
| // |
| // Private use tensor IDs are appropriate to use if you want to experiment |
| // with adding a new tensor type (without having to patch PyTorch first) or |
| // have a private, non-distributed application that needs to make use of a |
| // new tensor type. Private use tensor IDs are NOT appropriate to use for |
| // libraries intended to be distributed to further users: please contact |
| // the PyTorch developers to get a type ID registered in this case. |
| // |
// We provide two classes of private use tensor id: regular DispatchKeys
| // and Autograd DispatchKeys. DispatchKeys serve the role of ordinary "backend" |
| // DispatchKeys; if you were adding support for a new type of accelerator, you |
| // would use a backend DispatchKey, and ideally automatically reuse |
| // AutogradOther definitions already defined in PyTorch. AutogradPrivateUse |
| // DispatchKeys serve as "wrapper" DispatchKeys: they are only necessary for |
| // tensors that compose multiple internal tensors, and for cases when the |
| // built-in autograd formulas for operators are not appropriate. |
| |
static_assert(
(static_cast<uint8_t>(BackendComponent::EndOfBackendKeys) +
static_cast<uint8_t>(DispatchKey::EndOfFunctionalityKeys)) <= 64,
"The BackendComponent and DispatchKey enums (below EndOfFunctionalityKeys)"
" both map to bits in a 64-bit bitmask;"
" you must have no more than 64 total entries between them");
| |
| // Check if a DispatchKey is an alias mapping to other runtime keys. |
| constexpr bool isAliasDispatchKey(DispatchKey k) { |
| return k >= DispatchKey::StartOfAliasKeys && k <= DispatchKey::EndOfAliasKeys; |
| } |
| |
| // [Note: Per-Backend Functionality Dispatch Keys] |
| // Check if a DispatchKey is a per-backend functionality key |
| // Any functionalities that can be customized per-backend should be added here. |
| // These keys correspond to functionalities that can be customized individually |
| // per backend. While they only take up one bit in the `DispatchKeySet` bitset, |
| // they map to (# backends) slots in the operator table. |
| // Each of these keys also has a separate set of "runtime keys" in the dispatch |
| // key enum, per backend, which *do* map to the individual operator table slots. |
| // For example, the "Sparse" key maps to an individual bit in the |
| // DispatchKeySet, while `SparseCPU`, `SparseCUDA`, etc all map to individual |
| // slots in the runtime operator table. |
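// As a worked example with the current enum sizes: the 5 per-backend
// functionality keys (Dense, Quantized, Sparse, NestedTensor,
// AutogradFunctionality) x 15 backends reserve 5 * 15 = 75 per-backend
// runtime slots, even though those keys occupy only 5 bits in the
// DispatchKeySet.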
| |
| constexpr bool isPerBackendFunctionalityKey(DispatchKey k) { |
| if (k == DispatchKey::Dense || k == DispatchKey::Quantized || |
| k == DispatchKey::Sparse || k == DispatchKey::AutogradFunctionality || |
| k == DispatchKey::NestedTensor) { |
| return true; |
| } else { |
| return false; |
| } |
| } |
| |
| // Note that this includes Undefined in the total count. |
| // BUT EndOfFunctionalityKeys is its own (placeholder) key. |
| // e.g. Undefined=0, Dense=1, Sparse=2, EndOfFunctionalityKeys=3. |
| // In the above example, there are 3 total functionality keys. |
| constexpr uint8_t num_functionality_keys = |
| static_cast<uint8_t>(DispatchKey::EndOfFunctionalityKeys); |
| |
| constexpr uint8_t num_backends = |
| static_cast<uint8_t>(BackendComponent::EndOfBackendKeys); |
| |
| // Note [No More Than 16 Backends] |
| // Search for this note to find places in the code where the "no more than 16 |
| // backends" invariant is baked in. |
| static_assert( |
| static_cast<uint8_t>(BackendComponent::EndOfBackendKeys) <= 16, |
| "BackendComponent currently only supports <= 16 backends. If we really need to extend this, \ |
| there are a few places where this invariant is baked in"); |
| |
| constexpr uint8_t numPerBackendFunctionalityKeys() { |
| uint8_t count = 0; |
| for (uint8_t k = 0; k <= num_functionality_keys; ++k) { |
| if (isPerBackendFunctionalityKey(static_cast<DispatchKey>(k))) |
| ++count; |
| } |
| return count; |
| } |
| |
| #if defined(C10_MOBILE_TRIM_DISPATCH_KEYS) |
| // See [Note: Trimmed Mobile Dispatch Keys] |
| constexpr uint16_t num_runtime_entries = 8; |
| #else |
| constexpr uint16_t num_runtime_entries = num_functionality_keys + |
| (numPerBackendFunctionalityKeys() * (num_backends - 1)); |
| #endif |
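// Worked example of the formula above: each per-backend functionality key is
// already counted once in num_functionality_keys and expands to num_backends
// runtime keys, so it contributes (num_backends - 1) extra entries. With 5
// such keys and 15 backends, that is 5 * 14 = 70 entries on top of
// num_functionality_keys.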
| |
| // See Note [No More Than 16 Backends] |
| constexpr uint16_t full_backend_mask = |
| (static_cast<uint16_t>(1) << num_backends) - 1; |
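// With num_backends == 15, full_backend_mask == (1 << 15) - 1 == 0x7fff,
// i.e. the mask selecting the low-order backend bits of a DispatchKeySet.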
| |
| C10_API const char* toString(DispatchKey); |
| C10_API const char* toString(BackendComponent); |
| C10_API std::ostream& operator<<(std::ostream&, DispatchKey); |
| C10_API std::ostream& operator<<(std::ostream&, BackendComponent); |
| |
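// Returns the per-backend runtime autograd key for a backend, e.g.
// getAutogradKeyFromBackend(BackendComponent::CUDABit) yields
// DispatchKey::AutogradCUDA.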
| C10_API DispatchKey getAutogradKeyFromBackend(BackendComponent k); |
| |
| // Parses a string into a dispatch key. |
| // If the string cannot be correctly parsed, throws an exception. |
| C10_API c10::DispatchKey parseDispatchKey(const std::string& k); |
| |
| // These are some convenience identifiers for dispatch keys which are |
| // shorter to type than their long counterparts. Note that some of these |
| // dispatch keys directly correspond to DeviceType; and most APIs that |
| // accept DispatchKey also accept DeviceType; e.g., |
| // torch::dispatch(torch::kCPU, ...) is also valid. |
| constexpr DispatchKey kAutograd = DispatchKey::Autograd; |
| |
| // See Note [The Ordering of Per-Backend Dispatch Keys Matters!] |
| // This function relies on the invariant that the dispatch keys between |
| // StartOfDenseBackends and EndOfRuntimeBackendKeys are ordered by backend |
| // in the same order as `BackendComponent`. |
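// For example, toBackendComponent(DispatchKey::SparseCUDA) returns
// BackendComponent::CUDABit: SparseCUDA sits at offset 2 (== CUDABit) from
// StartOfSparseBackends, mirroring CUDA's position in BackendComponent.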
| constexpr BackendComponent toBackendComponent(DispatchKey k) { |
| if (k >= DispatchKey::StartOfDenseBackends && |
| k <= DispatchKey::EndOfDenseBackends) { |
| return static_cast<BackendComponent>( |
| static_cast<uint8_t>(k) - |
| static_cast<uint8_t>(DispatchKey::StartOfDenseBackends)); |
| } else if ( |
| k >= DispatchKey::StartOfQuantizedBackends && |
| k <= DispatchKey::EndOfQuantizedBackends) { |
| return static_cast<BackendComponent>( |
| static_cast<uint8_t>(k) - |
| static_cast<uint8_t>(DispatchKey::StartOfQuantizedBackends)); |
| } else if ( |
| k >= DispatchKey::StartOfSparseBackends && |
| k <= DispatchKey::EndOfSparseBackends) { |
| return static_cast<BackendComponent>( |
| static_cast<uint8_t>(k) - |
| static_cast<uint8_t>(DispatchKey::StartOfSparseBackends)); |
| } else if ( |
| k >= DispatchKey::StartOfNestedTensorBackends && |
| k <= DispatchKey::EndOfNestedTensorBackends) { |
| return static_cast<BackendComponent>( |
| static_cast<uint8_t>(k) - |
| static_cast<uint8_t>(DispatchKey::StartOfNestedTensorBackends)); |
| } else if ( |
| k >= DispatchKey::StartOfAutogradFunctionalityBackends && |
| k <= DispatchKey::EndOfAutogradFunctionalityBackends) { |
| return static_cast<BackendComponent>( |
| static_cast<uint8_t>(k) - |
| static_cast<uint8_t>( |
| DispatchKey::StartOfAutogradFunctionalityBackends)); |
| } else { |
| return BackendComponent::InvalidBit; |
| } |
| } |
| |
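// Maps a runtime per-backend key back to its functionality key, e.g.
// toFunctionalityKey(DispatchKey::SparseCPU) == DispatchKey::Sparse.
// Plain functionality keys (k <= EndOfFunctionalityKeys) map to themselves.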
| constexpr DispatchKey toFunctionalityKey(DispatchKey k) { |
| if (k <= DispatchKey::EndOfFunctionalityKeys) { |
| return k; |
| } else if (k <= DispatchKey::EndOfDenseBackends) { |
| return DispatchKey::Dense; |
| } else if (k <= DispatchKey::EndOfQuantizedBackends) { |
| return DispatchKey::Quantized; |
| } else if (k <= DispatchKey::EndOfSparseBackends) { |
| return DispatchKey::Sparse; |
| } else if (k <= DispatchKey::EndOfNestedTensorBackends) { |
| return DispatchKey::NestedTensor; |
| } else if (k <= DispatchKey::EndOfAutogradFunctionalityBackends) { |
| return DispatchKey::AutogradFunctionality; |
| } else { |
| return DispatchKey::Undefined; |
| } |
| } |
| |
| BackendComponent toBackendComponent(DeviceType device_type); |
| |
| // Given (DispatchKey::Dense, BackendComponent::CUDABit), returns |
| // DispatchKey::CUDA. |
| // See Note [The Ordering of Per-Backend Dispatch Keys Matters!] |
| // This function relies on the invariant that the dispatch keys between |
| // StartOfDenseBackends and EndOfRuntimeBackendKeys are ordered by backend |
| // in the same order as `BackendComponent`. |
| constexpr DispatchKey toRuntimePerBackendFunctionalityKey( |
| DispatchKey functionality_k, |
| BackendComponent backend_k) { |
| if (functionality_k == DispatchKey::Dense) { |
| return static_cast<DispatchKey>( |
| static_cast<uint8_t>(DispatchKey::StartOfDenseBackends) + |
| static_cast<uint8_t>(backend_k)); |
| } |
| if (functionality_k == DispatchKey::Sparse) { |
| return static_cast<DispatchKey>( |
| static_cast<uint8_t>(DispatchKey::StartOfSparseBackends) + |
| static_cast<uint8_t>(backend_k)); |
| } |
| if (functionality_k == DispatchKey::Quantized) { |
| return static_cast<DispatchKey>( |
| static_cast<uint8_t>(DispatchKey::StartOfQuantizedBackends) + |
| static_cast<uint8_t>(backend_k)); |
| } |
| if (functionality_k == DispatchKey::NestedTensor) { |
| return static_cast<DispatchKey>( |
| static_cast<uint8_t>(DispatchKey::StartOfNestedTensorBackends) + |
| static_cast<uint8_t>(backend_k)); |
| } |
| if (functionality_k == DispatchKey::AutogradFunctionality) { |
| return static_cast<DispatchKey>( |
| static_cast<uint8_t>( |
| DispatchKey::StartOfAutogradFunctionalityBackends) + |
| static_cast<uint8_t>(backend_k)); |
| } |
| return DispatchKey::Undefined; |
| } |
| |
| } // namespace c10 |
| |
| namespace torch { |
| // Expose the constant, but not the TYPE (DispatchKey is an implementation |
| // detail!) |
| using c10::kAutograd; |
| } // namespace torch |
| |
| // NB: You really shouldn't use this instance; this enum is guaranteed |
| // to be pretty small so a regular array should be acceptable. |
| namespace std { |
| template <> |
| struct hash<c10::DispatchKey> { |
| typedef size_t result_type; |
| typedef c10::DispatchKey argument_type; |
| |
| size_t operator()(c10::DispatchKey x) const { |
| return static_cast<size_t>(x); |
| } |
| }; |
| } // namespace std |