#include <c10/core/DispatchKeySet.h>
#include <c10/util/irange.h>

namespace c10 {

// backend_dispatch_keyset includes all dispatch keys that map to backends.
// Alias key DispatchKey::CompositeExplicitAutograd maps to
// backend_dispatch_keyset
constexpr DispatchKeySet backend_dispatch_keyset =
    autogradother_backends | DispatchKeySet(DispatchKey::Dense);
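// A rough illustration of membership in this keyset (a sketch; it assumes the
// usual contents of autogradother_backends from DispatchKeySet.h):
//
//   backend_dispatch_keyset.has(DispatchKey::CPU);         // true: Dense + CPU bit
//   backend_dispatch_keyset.has(DispatchKey::Sparse);      // true: via autogradother_backends
//   backend_dispatch_keyset.has(DispatchKey::AutogradCPU); // false: autograd is not a backend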

// See Note [CompositeExplicitAutogradNonFunctional Key]
// We have several types of decompositions in ATen, each of which has its own
// alias key. You should register your decomposition to the
// `CompositeExplicitAutogradNonFunctional` key if:
// (1) It's an out-of-place op
// (2) It decomposes into one or more mutation ops
// (3) It has a derivative formula
// (In theory we could also have a separate key for
// "CompositeImplicitAutogradNonFunctional", but there isn't much of a use
// case for it currently).
// This key is important for "functional" backends like LazyTensor / XLA.
// If you're a backend that only expects to deal with "functional ops",
// then you don't want to decompose a functional op into an op that causes
// aliasing. You should just directly write a kernel for that functional op
// instead!
constexpr DispatchKeySet non_functional_backend_dispatch_keyset =
    backend_dispatch_keyset
        // XLA and LazyTensor are currently the only 2 backends in core
        // that use the functionalization pass in eager mode.
        .remove(DispatchKey::Sparse)
        .remove_backend(BackendComponent::XLABit)
        .remove_backend(BackendComponent::LazyBit);
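// A minimal sketch of the effect (assuming the has()/has_backend() accessors
// declared in DispatchKeySet.h): kernels registered to
// CompositeExplicitAutogradNonFunctional are not fanned out to the
// functionalized backends removed above.
//
//   non_functional_backend_dispatch_keyset.has_backend(BackendComponent::XLABit);  // false
//   non_functional_backend_dispatch_keyset.has_backend(BackendComponent::LazyBit); // false
//   non_functional_backend_dispatch_keyset.has(DispatchKey::CPU);                  // still true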

bool isBackendDispatchKey(DispatchKey t) {
  return t != DispatchKey::Undefined
      // See Note [No Alias Keys in DispatchKeySet]
      && !isAliasDispatchKey(t)
      // Note [NestedTensor Not Included in Backend Keys]
      // NestedTensor has been explicitly removed from the "backend keyset" due
      // to incompatibility with some kernels, so we don't want it to be
      // included in CompositeExplicitAutograd kernels.
      && t != DispatchKey::NestedTensor && backend_dispatch_keyset.has(t);
}
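// For example (illustrative):
//   isBackendDispatchKey(DispatchKey::CPU);                       // true
//   isBackendDispatchKey(DispatchKey::CompositeImplicitAutograd); // false (alias key)
//   isBackendDispatchKey(DispatchKey::NestedTensor);              // false (see note above)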

// math_dispatch_keyset contains all keys in backend_dispatch_keyset and
// autograd_dispatch_keyset. Alias key DispatchKey::CompositeImplicitAutograd
// maps to [math_dispatch_keyset x full_backend_mask]
constexpr DispatchKeySet math_dispatch_keyset = backend_dispatch_keyset |
    autograd_dispatch_keyset |
    // See Note [NestedTensor Not Included in Backend Keys]
    // The caveat to that note is that nested_tensor is a special case
    // where we would like to support composite implicit kernels but not
    // explicit kernels, so we manually add the key to the
    // math_dispatch_keyset.
    DispatchKeySet{DispatchKey::NestedTensor} |
    // Functionalize should always reuse CompositeImplicit decomps.
    DispatchKeySet{DispatchKey::Functionalize};
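// A consequence of the special-casing above (illustrative; see
// runtimeDispatchKeySetHas() below): NestedTensor participates in
// CompositeImplicitAutograd but not in CompositeExplicitAutograd.
//
//   isIncludedInAlias(
//       DispatchKey::NestedTensor, DispatchKey::CompositeImplicitAutograd); // true
//   isIncludedInAlias(
//       DispatchKey::NestedTensor, DispatchKey::CompositeExplicitAutograd); // false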

constexpr DispatchKeySet nested_dispatch_keyset =
    DispatchKeySet(
        {DispatchKey::AutogradNestedTensor, DispatchKey::NestedTensor}) |
    DispatchKeySet(DispatchKeySet::RAW, full_backend_mask);

DispatchKeySet getRuntimeDispatchKeySet(DispatchKey t) {
  TORCH_INTERNAL_ASSERT(t != DispatchKey::Undefined);
  switch (t) {
    case DispatchKey::Autograd:
      // See Note [autograd_dispatch_keyset Does Not Include Backend Bits]
      // That's why we OR it with a mask of the backend bits here.
      // getRuntimeDispatchKeySet() expects to return a keyset of runtime
      // dispatch keys, like AutogradCPU, but that requires having backend
      // bits.
      return autograd_dispatch_keyset |
          DispatchKeySet(DispatchKeySet::RAW, full_backend_mask);
    case DispatchKey::CompositeImplicitAutograd:
      return math_dispatch_keyset;
    case DispatchKey::CompositeImplicitAutogradNestedTensor:
      return nested_dispatch_keyset;
    case DispatchKey::CompositeExplicitAutograd:
      return backend_dispatch_keyset;
    case DispatchKey::CompositeExplicitAutogradNonFunctional:
      return non_functional_backend_dispatch_keyset;
    default:
      return DispatchKeySet(t);
  }
}
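// For example (illustrative):
//   getRuntimeDispatchKeySet(DispatchKey::CompositeExplicitAutograd)
//       == backend_dispatch_keyset
//   getRuntimeDispatchKeySet(DispatchKey::CPU) == DispatchKeySet(DispatchKey::CPU)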

bool runtimeDispatchKeySetHas(DispatchKey t, DispatchKey k) {
  TORCH_INTERNAL_ASSERT(t != DispatchKey::Undefined);
  switch (t) {
    case DispatchKey::Autograd:
      return autograd_dispatch_keyset.has(toFunctionalityKey(k));
    case DispatchKey::CompositeImplicitAutograd:
      // See Note [NestedTensor Not Included in Backend Keys]
      return math_dispatch_keyset.has(k);
    case DispatchKey::CompositeImplicitAutogradNestedTensor:
      // See Note [NestedTensor Not Included in Backend Keys]
      return nested_dispatch_keyset.has(k);
    case DispatchKey::CompositeExplicitAutograd:
      // See Note [NestedTensor Not Included in Backend Keys]
      return k != DispatchKey::NestedTensor && backend_dispatch_keyset.has(k);
    case DispatchKey::CompositeExplicitAutogradNonFunctional:
      // See Note [NestedTensor Not Included in Backend Keys]
      return k != DispatchKey::NestedTensor &&
          non_functional_backend_dispatch_keyset.has(k);
    case DispatchKey::FuncTorchBatchedDecomposition:
      return functorch_batched_ks.has(k);
    default:
      return t == k;
  }
}
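// For example (illustrative):
//   // true: AutogradCPU's functionality bit is covered by the autograd keyset
//   runtimeDispatchKeySetHas(DispatchKey::Autograd, DispatchKey::AutogradCPU);
//   // false: NestedTensor is explicitly excluded from CompositeExplicitAutograd
//   runtimeDispatchKeySetHas(
//       DispatchKey::CompositeExplicitAutograd, DispatchKey::NestedTensor);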

// For a given autograd key, return the (guaranteed nonempty) set of associated
// backend keys. For a non-autograd key, return the empty keyset.
DispatchKeySet getBackendKeySetFromAutograd(DispatchKey t) {
  switch (t) {
    case DispatchKey::AutogradCPU:
      return DispatchKeySet(DispatchKey::CPU);
    case DispatchKey::AutogradCUDA:
      return DispatchKeySet(DispatchKey::CUDA);
    case DispatchKey::AutogradXLA:
      return DispatchKeySet(DispatchKey::XLA);
    case DispatchKey::AutogradLazy:
      return DispatchKeySet(DispatchKey::Lazy);
    case DispatchKey::AutogradMeta:
      return DispatchKeySet(DispatchKey::Meta);
    case DispatchKey::AutogradMPS:
      return DispatchKeySet(DispatchKey::MPS);
    case DispatchKey::AutogradHPU:
      return DispatchKeySet(DispatchKey::HPU);
    case DispatchKey::AutogradIPU:
      return DispatchKeySet(DispatchKey::IPU);
    case DispatchKey::AutogradXPU:
      return DispatchKeySet(DispatchKey::XPU);
    case DispatchKey::AutogradPrivateUse1:
      return DispatchKeySet(DispatchKey::PrivateUse1);
    case DispatchKey::AutogradPrivateUse2:
      return DispatchKeySet(DispatchKey::PrivateUse2);
    case DispatchKey::AutogradPrivateUse3:
      return DispatchKeySet(DispatchKey::PrivateUse3);
    case DispatchKey::AutogradNestedTensor:
      return DispatchKeySet(DispatchKey::NestedTensor) |
          DispatchKeySet(DispatchKeySet::RAW, full_backend_mask);
    case DispatchKey::AutogradOther:
      return autogradother_backends;
    default:
      return DispatchKeySet();
  }
}
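// For example (illustrative):
//   getBackendKeySetFromAutograd(DispatchKey::AutogradCUDA); // DispatchKeySet(DispatchKey::CUDA)
//   getBackendKeySetFromAutograd(DispatchKey::CPU);          // empty set (not an autograd key)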

bool isIncludedInAlias(DispatchKey k, DispatchKey alias) {
  return k != DispatchKey::Undefined && runtimeDispatchKeySetHas(alias, k);
}
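// For example (illustrative):
//   // true: CPU is a backend key covered by CompositeExplicitAutograd
//   isIncludedInAlias(DispatchKey::CPU, DispatchKey::CompositeExplicitAutograd);
//   // false: the XLA backend bit was removed from the NonFunctional keyset above
//   isIncludedInAlias(
//       DispatchKey::XLA, DispatchKey::CompositeExplicitAutogradNonFunctional);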

std::string toString(DispatchKeySet ts) {
  std::stringstream ss;
  ss << ts;
  return ss.str();
}

std::ostream& operator<<(std::ostream& os, DispatchKeySet ts) {
  if (ts.empty()) {
    os << "DispatchKeySet()";
    return os;
  }
  os << "DispatchKeySet(";
  bool first = true;
  for (auto k : ts) {
    if (!first) {
      os << ", ";
    }
    os << k;
    first = false;
  }
  os << ")";
  return os;
}
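// For example (illustrative): the range-for above yields runtime keys, so a
// keyset holding the Dense and AutogradFunctionality bits plus the CPU backend
// bit would print roughly as
//   DispatchKeySet(CPU, AutogradCPU)
// while an empty keyset prints as "DispatchKeySet()".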

DispatchKeySet::iterator& DispatchKeySet::iterator::operator++() {
  TORCH_INTERNAL_ASSERT(next_functionality_ <= iterator::end_iter_mask_val);
  TORCH_INTERNAL_ASSERT(next_backend_ <= num_backends, next_backend_);

  // Create a masked version of the set representation to ignore previous
  // keys that we've iterated through.
  uint64_t masked_functionality_bits =
      llvm::maskTrailingZeros<uint64_t>(next_functionality_) & *data_ptr_;
  uint64_t masked_backend_bits =
      llvm::maskTrailingZeros<uint64_t>(next_backend_) & full_backend_mask &
      *data_ptr_;

  uint64_t first_functionality_idx =
      llvm::findFirstSet(masked_functionality_bits);
  uint64_t first_backendcomponent_idx = llvm::findFirstSet(masked_backend_bits);

  // If there are no keys, set to end iterator value
  if (first_functionality_idx == std::numeric_limits<uint64_t>::max() ||
      next_functionality_ == iterator::end_iter_mask_val) {
    // Set up state to be the same as end()
    next_functionality_ = iterator::end_iter_mask_val;
    current_dispatchkey_idx_ = iterator::end_iter_key_val;
    next_backend_ = 0;
    current_backendcomponent_idx_ = iterator::end_iter_key_val;
    return *this;
  }

  // The +1 is because of DispatchKey::Undefined and
  // BackendComponent::InvalidBit
  auto new_next_functionality = first_functionality_idx + 1;
  auto new_backendcomponent_idx = first_backendcomponent_idx + 1;
  // and the -num_backends is because the first <num_backends> bits in the
  // keyset are not Dispatch Keys.
  auto next_dispatchkey_idx = new_next_functionality - num_backends;

  // If the current functionality bit is a per-backend bit, we need special
  // handling
  if (isPerBackendFunctionalityKey(
          static_cast<DispatchKey>(next_dispatchkey_idx))) {
    // case 1: if the current backend is undefined, then there is no valid
    // backend instance of this functionality key so we can skip it.
    if (first_backendcomponent_idx == std::numeric_limits<uint64_t>::max()) {
      // increment the functionality mask so we skip the current functionality
      // bit on the next increment.
      next_functionality_ = new_next_functionality;
      ++(*this);
      return *this;
    }

    // Otherwise, at this point we know what the current backend and
    // functionality bits are.
    current_dispatchkey_idx_ = next_dispatchkey_idx;
    current_backendcomponent_idx_ = new_backendcomponent_idx;

    // Next, we need to set up the masks for the next increment.
    uint64_t next_backendcomponent_bits =
        llvm::maskTrailingZeros<uint64_t>(first_backendcomponent_idx + 1) &
        full_backend_mask & *data_ptr_;
    uint64_t next_backendcomponent_idx =
        llvm::findFirstSet(next_backendcomponent_bits);
    if (next_backendcomponent_idx == std::numeric_limits<uint64_t>::max()) {
      // case 2: the current backend is valid, but there is not another backend
      // in the keyset. In this case, we need to bump the functionality mask
      // and reset the backend mask for the next increment.
      next_functionality_ = new_next_functionality;
      next_backend_ = 0;
    } else {
      // case 3: we have another backend to iterate over. We want to iterate
      // over the same functionality bit next time, but a different backend
      // bit.
      next_backend_ = first_backendcomponent_idx + 1;
    }
  } else {
    // Functionality bits that aren't per backend are simpler to handle. We can
    // ignore the backend bits.
    TORCH_INTERNAL_ASSERT(next_backend_ == 0);
    current_dispatchkey_idx_ = next_dispatchkey_idx;
    next_functionality_ = new_next_functionality;
  }
  return *this;
}
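// An illustrative walk-through of the increment logic above: for a keyset
// holding the Dense and AutogradFunctionality bits together with the CPU and
// CUDA backend bits, iteration visits the runtime keys from the lowest set bit
// upward:
//   CPU, CUDA, AutogradCPU, AutogradCUDA
// i.e. each per-backend functionality bit is expanded against every backend
// bit in the set before moving on to the next functionality bit.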

std::array<FunctionalityOffsetAndMask, num_functionality_keys>
initializeFunctionalityOffsetsAndMasks() {
  std::array<FunctionalityOffsetAndMask, num_functionality_keys>
      offsets_and_masks;
  // manually set the first entry, which corresponds to Undefined.
  offsets_and_masks[0] = FunctionalityOffsetAndMask(0, 0);
  // loop through every functionality key (aside from Undefined).
  for (const auto functionality_idx : c10::irange(1, num_functionality_keys)) {
    // functionality_idx should be Dense -> 1, ...
    auto prev_offset_and_mask = offsets_and_masks[functionality_idx - 1];
    auto k = static_cast<DispatchKey>(functionality_idx);

    // If the previous functionality was not per-backend, then we can just
    // increment the previous offset. Otherwise, the next offset =
    // previous_offset + num_backends.
    auto next_offset = prev_offset_and_mask.offset +
        (prev_offset_and_mask.mask == 0 ? 1 : num_backends);
    // the mask is used in the runtime index calculation to find the offset of
    // the backend. For non-per-backend functionalities, this offset should
    // always be 0. Otherwise, we need to get the index of the backend (which
    // we can do using a backend mask).
    auto next_mask = isPerBackendFunctionalityKey(k) ? full_backend_mask : 0;
    offsets_and_masks[functionality_idx] =
        FunctionalityOffsetAndMask(next_offset, next_mask);
  }
  // Sanity check that the computed offset index of the last functionality key
  // is correct. This assumes that the highest priority functionality key is
  // not per backend.
  TORCH_INTERNAL_ASSERT(
      offsets_and_masks[num_functionality_keys - 1].offset ==
          (num_runtime_entries - 1),
      "num_runtime_entries: ",
      num_runtime_entries,
      "last_offset: ",
      offsets_and_masks[num_functionality_keys - 1].offset);
  return offsets_and_masks;
}
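// A worked example of the table computed above (a sketch; exact indices depend
// on the key ordering in DispatchKey.h): Undefined occupies entry 0, and Dense,
// being per-backend, reserves num_backends runtime slots:
//
//   offsets_and_masks[0]; // {offset = 0, mask = 0}                  (Undefined)
//   offsets_and_masks[1]; // {offset = 1, mask = full_backend_mask}  (Dense)
//   offsets_and_masks[2]; // {offset = 1 + num_backends, mask = ...} (next key)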

} // namespace c10