// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

use alloc::vec;
use alloc::vec::Vec;
use core::hash::{Hash, Hasher};
use twox_hash::XxHash64;

// Const seed to be used with [`XxHash64::with_seed`].
const SEED: u64 = 0xaabbccdd;

/// Split the 64-bit `hash` into (g, f0, f1).
///
/// g denotes the highest 16 bits of the hash modulo `m`, and is referred to as the first-level hash.
/// f0 and f1 denote the middle and lowest 24 bits of the hash, respectively.
/// (f0, f1) are used to distribute keys with the same g into distinct slots.
///
/// # Arguments
///
/// * `hash` - The hash to split.
/// * `m` - The modulus used to reduce the first-level hash.
pub const fn split_hash64(hash: u64, m: usize) -> (usize, u32, u32) {
    (
        ((hash >> 48) as usize % m),
        ((hash >> 24) as u32 & 0xffffff),
        ((hash & 0xffffff) as u32),
    )
}
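
// An illustrative check (sketch) of how `split_hash64` decomposes a hash; the concrete hash value
// and modulus below are arbitrary and chosen only for the example.
#[cfg(test)]
#[test]
fn split_hash64_example() {
    let hash: u64 = 0x1234_5678_9abc_def0;
    let (g, f0, f1) = split_hash64(hash, 10);
    // g is the top 16 bits (0x1234) reduced modulo m = 10.
    assert_eq!(g, 0x1234 % 10);
    // f0 is the middle 24 bits (bits 24..48), f1 the lowest 24 bits (bits 0..24).
    assert_eq!(f0, 0x56789a);
    assert_eq!(f1, 0xbcdef0);
}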

/// Compute the hash of `key` using [`XxHash64`] with the constant `SEED`.
pub fn compute_hash<K: Hash + ?Sized>(key: &K) -> u64 {
    let mut hasher = XxHash64::with_seed(SEED);
    key.hash(&mut hasher);
    hasher.finish()
}
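
// An illustrative check (sketch) of `compute_hash`; the keys below are arbitrary.
#[cfg(test)]
#[test]
fn compute_hash_example() {
    // Hashing is deterministic because the seed is the fixed constant `SEED`.
    let h = compute_hash("key");
    assert_eq!(compute_hash("key"), h);
    // Any `Hash` type can be hashed, e.g. integers.
    let n = compute_hash(&42u32);
    assert_eq!(compute_hash(&42u32), n);
}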

/// Calculate the slot index from (f0, f1) and the displacements (d0, d1), modulo m.
/// Returns [`None`] if d is (0, 0) or m is 0,
/// else returns the index computed as (f0 + f1 * d0 + d1) mod m, using wrapping arithmetic.
pub fn compute_index(f: (u32, u32), d: (u32, u32), m: u32) -> Option<usize> {
    if d == (0, 0) || m == 0 {
        None
    } else {
        Some((f.1.wrapping_mul(d.0).wrapping_add(f.0).wrapping_add(d.1) % m) as usize)
    }
}
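
// An illustrative check (sketch) of `compute_index`; the degenerate inputs yield `None`, and the
// concrete values below are arbitrary.
#[cfg(test)]
#[test]
fn compute_index_example() {
    assert_eq!(compute_index((1, 2), (0, 0), 7), None);
    assert_eq!(compute_index((1, 2), (1, 1), 0), None);
    // (f0 + f1 * d0 + d1) mod m = (1 + 2 * 3 + 4) mod 7 = 4
    assert_eq!(compute_index((1, 2), (3, 4), 7), Some(4));
}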

/// Compute displacements for the given `key_hashes`, which place the keys into distinct slots via a
/// two-level hashing scheme.
///
/// Returns a tuple where the first item is the displacement array and the second item is the
/// reverse mapping used to permute the keys and values into their slots.
///
/// 1. Split the hashes into (g, f0, f1).
/// 2. Bucket the split hashes by g and sort the buckets by size in descending order.
/// 3. For each bucket, in decreasing order of size, try (d0, d1) pairs until one is found that
///    places the bucket's keys into distinct, unoccupied slots.
/// 4. Mark the slots for the current bucket as occupied and store the reverse mapping.
/// 5. Repeat until all the keys have been assigned distinct slots.
///
/// # Arguments
///
/// * `key_hashes` - [`ExactSizeIterator`] over the hashed key values
#[allow(clippy::indexing_slicing, clippy::unwrap_used)]
pub fn compute_displacements(
    key_hashes: impl ExactSizeIterator<Item = u64>,
) -> (Vec<(u32, u32)>, Vec<usize>) {
    let len = key_hashes.len();

    // A vector to track the size of buckets for sorting.
    let mut bucket_sizes = vec![0; len];

    // A flattened representation of items in the buckets after applying the first-level hash function.
    let mut bucket_flatten = Vec::with_capacity(len);

    // Compute the split hashes and initial bucket sizes.
    key_hashes.enumerate().for_each(|(i, kh)| {
        let h = split_hash64(kh, len);
        bucket_sizes[h.0] += 1;
        bucket_flatten.push((h, i))
    });

    // Sort by decreasing order of bucket_sizes.
    bucket_flatten.sort_by(|&(ha, _), &(hb, _)| {
        // ha.0, hb.0 are always within bounds of `bucket_sizes`
        (bucket_sizes[hb.0], hb).cmp(&(bucket_sizes[ha.0], ha))
    });

    // Generation count while iterating buckets.
    // Each trial of ((d0, d1), bucket chain) is a new generation.
    // We use this to track which slots have been tentatively assigned during the current trial.
    let mut generation = 0;

    // Whether a slot has been occupied by previous buckets with a different first-level hash
    // (a different bucket chain).
    let mut occupied = vec![false; len];

    // Track the generation count for each slot.
    // A slot is free for the current trial if it is unoccupied by the previous bucket chains and
    // its assignment is not equal to the current generation.
    let mut assignments = vec![0; len];

    // Slot indices computed for the current bucket (saves us a recomputation of the hash while
    // assigning slots).
    let mut current_displacements = Vec::with_capacity(16);

    // The (d0, d1) per first-level hash g that splits the bucket's keys into distinct slots.
    let mut displacements = vec![(0, 0); len];

    // Vec to store the mapping back to the original order of the keys.
    // This is a permutation which will be applied to the keys and values at the end.
    let mut reverse_mapping = vec![0; len];

    let mut start = 0;
    while start < len {
        // Bucket span with the same first-level hash
        // start is always within bounds of `bucket_flatten`
        let g = bucket_flatten[start].0 .0;
        // g is always within bounds of `bucket_sizes`
        let end = start + bucket_sizes[g];
        // start..end is always within bounds of `bucket_flatten`
        let buckets = &bucket_flatten[start..end];

        'd0: for d0 in 0..len as u32 {
            'd1: for d1 in 0..len as u32 {
                if (d0, d1) == (0, 0) {
                    continue;
                }
                current_displacements.clear();
                generation += 1;

                for ((_, f0, f1), _) in buckets {
                    let displacement_idx = compute_index((*f0, *f1), (d0, d1), len as u32).unwrap();

                    // displacement_idx is always within bounds
                    if occupied[displacement_idx] || assignments[displacement_idx] == generation {
                        continue 'd1;
                    }
                    assignments[displacement_idx] = generation;
                    current_displacements.push(displacement_idx);
                }

                // Successfully found a (d0, d1); store it at index g.
                // g < displacements.len() due to the modulo operation
                displacements[g] = (d0, d1);

                for (i, displacement_idx) in current_displacements.iter().enumerate() {
                    // `current_displacements` has the same size as `buckets`
                    let (_, idx) = &buckets[i];

                    // displacement_idx is always within bounds
                    occupied[*displacement_idx] = true;
                    reverse_mapping[*displacement_idx] = *idx;
                }
                break 'd0;
            }
        }

        start = end;
    }

    (displacements, reverse_mapping)
}
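
// An illustrative sanity check (sketch) of `compute_displacements`: the key set is arbitrary, and
// the assertions mirror the documented invariants, namely that every key resolves to a distinct
// slot and that the reverse mapping is a permutation of the key indices.
#[cfg(test)]
#[test]
#[allow(clippy::indexing_slicing, clippy::unwrap_used)]
fn compute_displacements_example() {
    let keys = ["a", "b", "c", "d", "e"];
    let hashes: Vec<u64> = keys.iter().map(compute_hash).collect();

    let (displacements, reverse_mapping) = compute_displacements(hashes.iter().copied());
    assert_eq!(displacements.len(), keys.len());

    let expected: Vec<usize> = (0..keys.len()).collect();

    // The reverse mapping must be a permutation of 0..len.
    let mut sorted_mapping = reverse_mapping;
    sorted_mapping.sort_unstable();
    assert_eq!(sorted_mapping, expected);

    // Every key must resolve to a distinct slot via its bucket's displacements.
    let mut slots: Vec<usize> = hashes
        .iter()
        .map(|&kh| {
            let (g, f0, f1) = split_hash64(kh, keys.len());
            let (d0, d1) = displacements[g];
            compute_index((f0, f1), (d0, d1), keys.len() as u32).unwrap()
        })
        .collect();
    slots.sort_unstable();
    assert_eq!(slots, expected);
}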