| // Copyright 2022 Google LLC |
| // |
| // This source code is licensed under the BSD-style license found in the |
| // LICENSE file in the root directory of this source tree. |
| |
| #include "xnnpack/cache.h" |
| |
#include <assert.h>   // For assert.
#include <stdbool.h>  // For bool.
#include <stddef.h>   // For size_t.
#include <stdint.h>   // For uint32_t, uintptr_t, UINT32_C, and SIZE_MAX.
#include <string.h>   // For memset and memcmp.
| |
| #include "xnnpack.h" |
| #include "xnnpack/allocator.h" |
| #include "xnnpack/log.h" |
| #include "xnnpack/math.h" |
| #include "xnnpack/mutex.h" |
| |
| #define XNN_CACHE_HASH_SEED 7 |
| #define XNN_CACHE_INITIAL_BUCKETS 32 |
#define XNN_CACHE_MAX_LOAD 0.75
// The cache grows once the load factor exceeds XNN_CACHE_MAX_LOAD (3/4), i.e. when
// num_entries / num_buckets > 3 / 4. The check is done in integer arithmetic using
// the two multipliers below: num_entries * 4 > num_buckets * 3.
#define XNN_CACHE_MAX_LOAD_ENTRIES_MULTIPLIER 4
#define XNN_CACHE_MAX_LOAD_BUCKETS_MULTIPLIER 3
| #define XNN_CACHE_GROWTH_FACTOR 2 |
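
// For example (illustrative arithmetic only): with the initial 32 buckets, the
// table grows while inserting the 26th entry, since at that point num_entries
// is 25 and 25 * 4 = 100 > 32 * 3 = 96, i.e. the load factor would exceed 0.75.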
| |
| // MurmurHash3 implementation, copied from smhasher, with minor modifications in |
| // style and main loop. |
| |
| static inline uint32_t fmix32(uint32_t h) |
| { |
| h ^= h >> 16; |
| h *= UINT32_C(0x85EBCA6B); |
| h ^= h >> 13; |
| h *= UINT32_C(0xC2B2AE35); |
| h ^= h >> 16; |
| |
| return h; |
| } |
| |
| static uint32_t murmur_hash3(const void* key, size_t len, uint32_t seed) |
| { |
| const uint8_t* data = (const uint8_t*) key; |
| |
| uint32_t h1 = seed; |
| |
| const uint32_t c1 = UINT32_C(0xCC9E2D51); |
| const uint32_t c2 = UINT32_C(0x1B873593); |
| |
  // Consume the key in 32-bit blocks. Note that reading through a uint32_t pointer
  // assumes the key is 4-byte aligned or that the target tolerates unaligned loads.
  const uint32_t* blocks = (const uint32_t*) data;
| for (; len >= sizeof(uint32_t); len -= sizeof(uint32_t)) { |
| uint32_t k1 = *blocks++; |
| |
| k1 *= c1; |
| k1 = math_rotl_u32(k1, 15); |
| k1 *= c2; |
| |
| h1 ^= k1; |
| h1 = math_rotl_u32(h1, 13); |
| h1 = h1 * 5 + UINT32_C(0xE6546B64); |
| } |
| |
| const uint8_t* tail = (const uint8_t*) blocks; |
| |
| uint32_t k1 = 0; |
| |
  switch (len & 3) {
    case 3:
      k1 ^= tail[2] << 16;
      // fallthrough
    case 2:
      k1 ^= tail[1] << 8;
      // fallthrough
    case 1:
      k1 ^= tail[0];
      k1 *= c1;
      k1 = math_rotl_u32(k1, 15);
      k1 *= c2;
      h1 ^= k1;
  }
| |
  // Note: the main loop above decremented len, so this mixes in the tail length
  // (0-3) rather than the full key length. This deviates from reference
  // MurmurHash3, but is harmless: the hash only needs to be internally
  // consistent within this cache.
  h1 ^= len;
| |
| return fmix32(h1); |
| } |
| |
| #ifndef NDEBUG |
| // This function is only used by an assert, so do not include it in non-debug |
| // builds. |
| static inline size_t cache_size(struct xnn_cache* cache) { |
| switch (cache->type) { |
| case xnn_cache_type_code: |
| return cache->code.size; |
| case xnn_cache_type_weights: |
| return cache->weights.size; |
| default: |
| XNN_UNREACHABLE; |
| } |
  return SIZE_MAX;  // Unreachable; silences missing-return warnings.
| } |
| #endif |
| |
| static inline void* cache_start(struct xnn_cache* cache) { |
| switch (cache->type) { |
| case xnn_cache_type_code: |
| return cache->code.start; |
| case xnn_cache_type_weights: |
| return cache->weights.start; |
| default: |
| XNN_UNREACHABLE; |
| } |
  return NULL;  // Unreachable; silences missing-return warnings.
| } |
| |
| enum xnn_status xnn_init_cache_with_size(struct xnn_cache* cache, size_t num_buckets, enum xnn_cache_type cache_type) |
| { |
| memset(cache, 0, sizeof(struct xnn_cache)); |
| cache->buckets = (struct xnn_cache_bucket*) xnn_allocate_zero_memory(num_buckets * sizeof(struct xnn_cache_bucket)); |
| if (cache->buckets == NULL) { |
| xnn_log_error("fail to allocate memory for cache buckets"); |
| return xnn_status_out_of_memory; |
| } |
| |
| cache->type = cache_type; |
| cache->num_buckets = num_buckets; |
| return xnn_status_success; |
| } |
| |
| enum xnn_status xnn_init_code_cache_with_size(struct xnn_code_cache* cache, size_t num_buckets) |
| { |
| memset(cache, 0, sizeof(struct xnn_code_cache)); |
| enum xnn_status status = xnn_status_success; |
| status = xnn_init_cache_with_size(&cache->cache, num_buckets, xnn_cache_type_code); |
| if (status != xnn_status_success) { |
| goto error; |
| } |
| |
| status = xnn_allocate_code_memory(&cache->cache.code, XNN_DEFAULT_CODE_BUFFER_SIZE); |
| if (status != xnn_status_success) { |
| goto error; |
| } |
| |
| return xnn_status_success; |
| |
| error: |
| xnn_release_code_cache(cache); |
| return status; |
| } |
| |
| enum xnn_status xnn_init_code_cache(struct xnn_code_cache* cache) |
| { |
| return xnn_init_code_cache_with_size(cache, XNN_CACHE_INITIAL_BUCKETS); |
| } |
| |
| static bool cache_buckets_grow(struct xnn_cache* cache) |
| { |
| const size_t new_num_buckets = cache->num_buckets * XNN_CACHE_GROWTH_FACTOR; |
| assert(is_po2(new_num_buckets)); |
| struct xnn_cache tmp_cache; |
  if (xnn_init_cache_with_size(&tmp_cache, new_num_buckets, cache->type) != xnn_status_success) {
    // Failed to allocate the larger bucket array; keep the old table intact.
    return false;
  }
| |
| for (size_t i = 0; i < cache->num_buckets; i++) { |
| struct xnn_cache_bucket b = cache->buckets[i]; |
| if (b.size == 0) { |
| continue; |
| } |
| |
    // Find the first empty slot by linear probing and insert there. No need to
    // compare keys: we are not looking anything up, just rehashing the existing
    // entries into a bigger table.
| const size_t mask = tmp_cache.num_buckets - 1; |
| size_t idx = b.hash & mask; |
| while (tmp_cache.buckets[idx].size != 0) { |
| idx = (idx + 1) & mask; |
| } |
| tmp_cache.buckets[idx].hash = b.hash; |
| tmp_cache.buckets[idx].size = b.size; |
| tmp_cache.buckets[idx].offset = b.offset; |
| } |
| |
| xnn_release_memory(cache->buckets); |
| |
| cache->buckets = tmp_cache.buckets; |
| cache->num_buckets = tmp_cache.num_buckets; |
| return true; |
| } |
| |
| static inline bool bytes_equal(struct xnn_cache* cache, void* ptr, size_t size, size_t offset) |
| { |
| return memcmp(ptr, (void*) ((uintptr_t) cache_start(cache) + offset), size) == 0; |
| } |
| |
| static bool lookup(struct xnn_cache* cache, void* ptr, size_t size, uint32_t hash, size_t* index) |
| { |
| assert(is_po2(cache->num_buckets)); |
| const size_t mask = cache->num_buckets - 1; |
| size_t idx = hash & mask; |
| const struct xnn_cache_bucket* buckets = cache->buckets; |
| |
| // Linear probing. |
| while (buckets[idx].size != 0 && |
| !(buckets[idx].hash == hash && |
| size == buckets[idx].size && |
| bytes_equal(cache, ptr, buckets[idx].size, buckets[idx].offset))) { |
| idx = (idx + 1) & mask; |
| } |
| *index = idx; |
  // An empty bucket terminated the probe scan, so the key is not in the cache.
  return buckets[idx].size != 0;
| } |
| |
| static bool insert(struct xnn_cache* cache, void* ptr, size_t size) |
| { |
| const uint32_t hash = murmur_hash3(ptr, size, /*seed=*/XNN_CACHE_HASH_SEED); |
| size_t idx; |
| const bool found = lookup(cache, ptr, size, hash, &idx); |
| if (found) { |
| return false; |
| } |
| |
| // Ensure we have enough buckets to keep under our load limit. |
| if (cache->num_entries * XNN_CACHE_MAX_LOAD_ENTRIES_MULTIPLIER > |
| cache->num_buckets * XNN_CACHE_MAX_LOAD_BUCKETS_MULTIPLIER) { |
| if (!cache_buckets_grow(cache)) { |
| // Can't grow hash table anymore. |
| xnn_log_error("failed to grow cache buckets"); |
| return false; |
| } |
| xnn_log_debug("successfully grew cache buckets"); |
| |
    // If the cache grew, idx is stale: it was computed with the old num_buckets. Look the entry up again.
| const bool found_in_grown_cache = lookup(cache, ptr, size, hash, &idx); |
| assert(!found_in_grown_cache); |
| (void) found_in_grown_cache; // Silence unused variable warnings. |
| } |
| |
| // Check that ptr points into cache's buffer. |
| assert((uintptr_t) ptr >= (uintptr_t) cache_start(cache)); |
| if (cache->type == xnn_cache_type_code) { |
| assert((uintptr_t) ptr < (uintptr_t) cache_start(cache) + cache_size(cache)); |
| } |
| |
| const size_t offset = (uintptr_t) ptr - (uintptr_t) cache_start(cache); |
| |
| // Insert the entry. |
| cache->buckets[idx].size = size; |
| cache->buckets[idx].hash = hash; |
| cache->buckets[idx].offset = offset; |
| cache->num_entries++; |
| return true; |
| } |
| |
// Checks if an entry (generated microkernel code or packed weights) is already
// in the cache. Returns the offset of the entry within the cache buffer if
// found, XNN_CACHE_NOT_FOUND otherwise.
| static size_t lookup_cache(struct xnn_cache* cache, void* ptr, size_t size) |
| { |
| const uint32_t hash = murmur_hash3(ptr, size, /*seed=*/XNN_CACHE_HASH_SEED); |
| size_t bucket_idx; |
| if (lookup(cache, ptr, size, hash, &bucket_idx)) { |
| cache->hits++; |
| return cache->buckets[bucket_idx].offset; |
| } else { |
| cache->misses++; |
| return XNN_CACHE_NOT_FOUND; |
| } |
| } |
| |
| size_t xnn_get_or_insert_cache(struct xnn_cache* cache, void* ptr, size_t size) |
| { |
| const size_t found_offset = lookup_cache(cache, ptr, size); |
| if (found_offset != XNN_CACHE_NOT_FOUND) { |
| if (cache->type == xnn_cache_type_code) { |
      // Found in the cache. The code generator has already advanced code.size
      // past this duplicate kernel, so rewind the buffer to reclaim the space.
| cache->code.size -= size; |
| } |
| return found_offset; |
| } |
| |
| if (cache->type == xnn_cache_type_weights) { |
    // Cache miss. Weights packing functions do not update the buffer size themselves, so update it here.
| cache->weights.size += size; |
| } |
| |
| const size_t offset = (uintptr_t) ptr - (uintptr_t) cache_start(cache); |
| if (!insert(cache, ptr, size)) { |
| return XNN_CACHE_NOT_FOUND; |
| } |
| return offset; |
| } |
| |
| size_t xnn_get_or_insert_code_cache(struct xnn_code_cache* cache, void* ptr, size_t size) |
| { |
| return xnn_get_or_insert_cache(&cache->cache, ptr, size); |
| } |
| |
| enum xnn_status xnn_release_code_cache(struct xnn_code_cache* cache) |
| { |
| if XNN_LIKELY(cache != NULL) { |
| assert(cache->cache.type == xnn_cache_type_code); |
| xnn_release_code_memory(&cache->cache.code); |
| xnn_release_memory(cache->cache.buckets); |
| } |
| return xnn_status_success; |
| } |
| |
| enum xnn_status xnn_internal_init_weights_cache( |
| struct xnn_weights_cache* cache, |
| size_t num_buckets, |
| size_t buffer_size) |
| { |
| memset(cache, 0, sizeof(struct xnn_weights_cache)); |
| |
| enum xnn_status status = xnn_status_success; |
| status = xnn_init_cache_with_size(&cache->cache, num_buckets, xnn_cache_type_weights); |
| if (status != xnn_status_success) { |
| goto error; |
| } |
| |
| status = xnn_allocate_weights_memory(&cache->cache.weights, buffer_size); |
| if (status != xnn_status_success) { |
| goto error; |
| } |
| |
| status = xnn_mutex_init(&cache->mutex); |
| if (status != xnn_status_success) { |
| goto error; |
| } |
| |
| return xnn_status_success; |
| |
| error: |
| xnn_release_weights_cache(cache); |
| return status; |
| } |
| |
| enum xnn_status xnn_init_weights_cache_with_size(struct xnn_weights_cache* cache, size_t size) |
| { |
| return xnn_internal_init_weights_cache(cache, XNN_CACHE_INITIAL_BUCKETS, size); |
| } |
| |
| enum xnn_status xnn_init_weights_cache(struct xnn_weights_cache* cache) |
| { |
| return xnn_init_weights_cache_with_size(cache, XNN_DEFAULT_WEIGHTS_BUFFER_SIZE); |
| } |
| |
| enum xnn_status xnn_finalize_weights_cache( |
| struct xnn_weights_cache* cache, |
| enum xnn_weights_cache_finalization_kind finalization_kind) |
| { |
| switch (cache->finalization_state) { |
| case xnn_cache_state_hard_finalized: |
| case xnn_cache_state_soft_finalized: |
| xnn_log_error("failed to finalize an already final weights cache"); |
| return xnn_status_invalid_state; |
| case xnn_cache_state_not_finalized: { |
| enum xnn_status status; |
| enum xnn_cache_state finalized_state; |
| |
| if (finalization_kind == xnn_weights_cache_finalization_kind_hard) { |
| xnn_log_debug("hard finalizing weights cache"); |
| status = xnn_finalize_weights_memory(&cache->cache.weights); |
        // Also release the memory used by the hash table (but not the weights memory).
| xnn_release_memory(cache->cache.buckets); |
| cache->cache.buckets = NULL; |
| finalized_state = xnn_cache_state_hard_finalized; |
| } else { |
| xnn_log_debug("soft finalizing weights cache"); |
| assert(finalization_kind == xnn_weights_cache_finalization_kind_soft); |
        // Soft-finalize the weights cache by reserving enough space for the insertion of the largest cached weights.
        // This ensures that packed weights can be written (to check for cache hits) without growing and moving the
        // memory. The overhead can be as large as the size of the largest cached weights, rounded up to the page size.
| status = xnn_reserve_weights_memory(&cache->cache.weights, cache->max_weights_size); |
| finalized_state = xnn_cache_state_soft_finalized; |
| } |
| if (status != xnn_status_success) { |
| xnn_log_error("failed to finalize weights cache memory"); |
| return xnn_status_invalid_state; |
| } |
| |
| cache->finalization_state = finalized_state; |
| return xnn_status_success; |
| } |
| } |
| } |
| |
| enum xnn_status xnn_release_weights_cache(struct xnn_weights_cache* cache) |
| { |
| if XNN_LIKELY(cache != NULL) { |
| assert(cache->cache.type == xnn_cache_type_weights); |
| xnn_release_weights_memory(&cache->cache.weights); |
| if (cache->cache.buckets != NULL) { |
| xnn_release_memory(cache->cache.buckets); |
| } |
| const enum xnn_status status = xnn_mutex_destroy(&cache->mutex); |
| if (status != xnn_status_success) { |
| return status; |
| } |
| } |
| return xnn_status_success; |
| } |
| |
| static inline bool cache_has_space(struct xnn_weights_cache* cache, size_t n) |
| { |
| const struct xnn_weights_buffer buf = cache->cache.weights; |
| return buf.size + n <= buf.capacity; |
| } |
| |
| void* xnn_reserve_space_in_weights_cache(struct xnn_weights_cache* cache, size_t n) { |
| switch (cache->finalization_state) { |
| case xnn_cache_state_hard_finalized: |
| xnn_log_error("cannot reserve additional space in a finalized compact weights cache"); |
| return NULL; |
| case xnn_cache_state_soft_finalized: |
| if (!cache_has_space(cache, n)) { |
| xnn_log_error("cannot reserve additional space in a finalized weights cache"); |
| return NULL; |
| } |
      // The cache is soft-finalized and has space for `n` bytes, but we still need to lock the mutex because
      // multiple writers may attempt to write to this space concurrently.
| break; |
| default: |
| break; |
| } |
| |
| enum xnn_status status = xnn_mutex_lock(&cache->mutex); |
| if (status != xnn_status_success) { |
| return NULL; |
| } |
| |
| struct xnn_weights_buffer* buffer = &cache->cache.weights; |
| status = xnn_reserve_weights_memory(buffer, n); |
| if (status != xnn_status_success) { |
| xnn_mutex_unlock(&cache->mutex); |
| return NULL; |
| } |
| |
  // Return a pointer just past the current end of the buffer. Note that the mutex stays locked: the matching
  // xnn_get_or_insert_weights_cache call is responsible for unlocking it.
  return (void*) ((uintptr_t) buffer->start + buffer->size);
| } |
| |
| size_t xnn_get_or_insert_weights_cache(struct xnn_weights_cache* cache, void* ptr, size_t size) |
| { |
| size_t offset = XNN_CACHE_NOT_FOUND; |
| |
| switch (cache->finalization_state) { |
| case xnn_cache_state_hard_finalized: { |
| xnn_log_error("cannot insert into a finalized compact weights cache"); |
| return XNN_CACHE_NOT_FOUND; |
| } |
| case xnn_cache_state_soft_finalized: { |
      // Inserting into a soft-finalized weights cache is okay as long as either:
      // 1. there is sufficient space in the buffer (to write the incoming packed weights), or
      // 2. the incoming packed weights are already in the cache.
| if (!cache_has_space(cache, size)) { |
| xnn_log_error("insufficient extra space in finalized weights cache buffer"); |
| return XNN_CACHE_NOT_FOUND; |
| } |
| |
      // From this point on we must unlock the mutex before returning: xnn_reserve_space_in_weights_cache returned
      // non-NULL for this insertion, which means it locked the mutex.
| const size_t found_offset = lookup_cache(&cache->cache, ptr, size); |
| if (found_offset == XNN_CACHE_NOT_FOUND) { |
| xnn_log_error("packed weights not found in finalized weights cache"); |
| } |
| |
| offset = found_offset; |
| break; |
| } |
| case xnn_cache_state_not_finalized: { |
| offset = xnn_get_or_insert_cache(&cache->cache, ptr, size); |
| if (offset != XNN_CACHE_NOT_FOUND) { |
        // Found or inserted packed weights. Track the largest entry size seen so far; soft finalization uses it to
        // reserve extra space at the end of the buffer for future cache checks.
| cache->max_weights_size = max(size, cache->max_weights_size); |
| } |
| break; |
| } |
| } |
| |
  // The mutex was locked by xnn_reserve_space_in_weights_cache when it returned non-NULL, i.e. when the cache is not
  // finalized, or when it is xnn_cache_state_soft_finalized and has sufficient space.
| const enum xnn_status status = xnn_mutex_unlock(&cache->mutex); |
| (void) status; |
| assert(status == xnn_status_success); |
| return offset; |
| } |
| |
| bool xnn_weights_cache_is_finalized(struct xnn_weights_cache* cache) { |
| return cache->finalization_state != xnn_cache_state_not_finalized; |
| } |