blob: 42d9079c63d3e812c3a052c9f6e761005f05fae2 [file] [log] [blame] [edit]
//! A speedy, non-cryptographic hashing algorithm used by `rustc`.
//!
//! # Example
//!
//! ```rust
//! # #[cfg(feature = "std")]
//! # fn main() {
//! use rustc_hash::FxHashMap;
//!
//! let mut map: FxHashMap<u32, u32> = FxHashMap::default();
//! map.insert(22, 44);
//! # }
//! # #[cfg(not(feature = "std"))]
//! # fn main() { }
//! ```
#![no_std]
#![cfg_attr(feature = "nightly", feature(hasher_prefixfree_extras))]
#[cfg(feature = "std")]
extern crate std;
#[cfg(feature = "rand")]
extern crate rand;
#[cfg(feature = "rand")]
mod random_state;
mod seeded_state;
use core::default::Default;
use core::hash::{BuildHasher, Hasher};
#[cfg(feature = "std")]
use std::collections::{HashMap, HashSet};
/// Type alias for a hash map that uses the Fx hashing algorithm.
#[cfg(feature = "std")]
pub type FxHashMap<K, V> = HashMap<K, V, FxBuildHasher>;
/// Type alias for a hash set that uses the Fx hashing algorithm.
#[cfg(feature = "std")]
pub type FxHashSet<V> = HashSet<V, FxBuildHasher>;
#[cfg(feature = "rand")]
pub use random_state::{FxHashMapRand, FxHashSetRand, FxRandomState};
pub use seeded_state::FxSeededState;
#[cfg(feature = "std")]
pub use seeded_state::{FxHashMapSeed, FxHashSetSeed};
/// A speedy hash algorithm for use within rustc. The hashmap in liballoc
/// by default uses SipHash which isn't quite as speedy as we want. In the
/// compiler we're not really worried about DOS attempts, so we use a fast
/// non-cryptographic hash.
///
/// The current implementation is a fast polynomial hash with a single
/// bit rotation as a finishing step designed by Orson Peters.
#[derive(Clone)]
pub struct FxHasher {
hash: usize,
}
// One might view a polynomial hash
// m[0] * k + m[1] * k^2 + m[2] * k^3 + ...
// as a multilinear hash with keystream k[..]
// m[0] * k[0] + m[1] * k[1] + m[2] * k[2] + ...
// where keystream k just happens to be generated using a multiplicative
// congrential pseudorandom number generator (MCG). For that reason we chose a
// constant that was found to be good for a MCG in:
// "Computationally Easy, Spectrally Good Multipliers for Congruential
// Pseudorandom Number Generators" by Guy Steele and Sebastiano Vigna.
#[cfg(target_pointer_width = "64")]
const K: usize = 0xf1357aea2e62a9c5;
#[cfg(target_pointer_width = "32")]
const K: usize = 0x93d765dd;
impl FxHasher {
/// Creates a `fx` hasher with a given seed.
pub const fn with_seed(seed: usize) -> FxHasher {
FxHasher { hash: seed }
}
/// Creates a default `fx` hasher.
pub const fn default() -> FxHasher {
FxHasher { hash: 0 }
}
}
impl Default for FxHasher {
#[inline]
fn default() -> FxHasher {
Self::default()
}
}
impl FxHasher {
#[inline]
fn add_to_hash(&mut self, i: usize) {
self.hash = self.hash.wrapping_add(i).wrapping_mul(K);
}
}
impl Hasher for FxHasher {
#[inline]
fn write(&mut self, bytes: &[u8]) {
// Compress the byte string to a single u64 and add to our hash.
self.write_u64(hash_bytes(bytes));
}
#[inline]
fn write_u8(&mut self, i: u8) {
self.add_to_hash(i as usize);
}
#[inline]
fn write_u16(&mut self, i: u16) {
self.add_to_hash(i as usize);
}
#[inline]
fn write_u32(&mut self, i: u32) {
self.add_to_hash(i as usize);
}
#[inline]
fn write_u64(&mut self, i: u64) {
self.add_to_hash(i as usize);
#[cfg(target_pointer_width = "32")]
self.add_to_hash((i >> 32) as usize);
}
#[inline]
fn write_u128(&mut self, i: u128) {
self.add_to_hash(i as usize);
#[cfg(target_pointer_width = "32")]
self.add_to_hash((i >> 32) as usize);
self.add_to_hash((i >> 64) as usize);
#[cfg(target_pointer_width = "32")]
self.add_to_hash((i >> 96) as usize);
}
#[inline]
fn write_usize(&mut self, i: usize) {
self.add_to_hash(i);
}
#[cfg(feature = "nightly")]
#[inline]
fn write_length_prefix(&mut self, _len: usize) {
// Most cases will specialize hash_slice to call write(), which encodes
// the length already in a more efficient manner than we could here. For
// HashDoS-resistance you would still need to include this for the
// non-slice collection hashes, but for the purposes of rustc we do not
// care and do not wish to pay the performance penalty of mixing in len
// for those collections.
}
#[cfg(feature = "nightly")]
#[inline]
fn write_str(&mut self, s: &str) {
// Similarly here, write already encodes the length, so nothing special
// is needed.
self.write(s.as_bytes())
}
#[inline]
fn finish(&self) -> u64 {
// Since we used a multiplicative hash our top bits have the most
// entropy (with the top bit having the most, decreasing as you go).
// As most hash table implementations (including hashbrown) compute
// the bucket index from the bottom bits we want to move bits from the
// top to the bottom. Ideally we'd rotate left by exactly the hash table
// size, but as we don't know this we'll choose 20 bits, giving decent
// entropy up until 2^20 table sizes. On 32-bit hosts we'll dial it
// back down a bit to 15 bits.
#[cfg(target_pointer_width = "64")]
const ROTATE: u32 = 20;
#[cfg(target_pointer_width = "32")]
const ROTATE: u32 = 15;
self.hash.rotate_left(ROTATE) as u64
// A bit reversal would be even better, except hashbrown also expects
// good entropy in the top 7 bits and a bit reverse would fill those
// bits with low entropy. More importantly, bit reversals are very slow
// on x86-64. A byte reversal is relatively fast, but still has a 2
// cycle latency on x86-64 compared to the 1 cycle latency of a rotate.
// It also suffers from the hashbrown-top-7-bit-issue.
}
}
// Nothing special, digits of pi.
const SEED1: u64 = 0x243f6a8885a308d3;
const SEED2: u64 = 0x13198a2e03707344;
const PREVENT_TRIVIAL_ZERO_COLLAPSE: u64 = 0xa4093822299f31d0;
#[inline]
fn multiply_mix(x: u64, y: u64) -> u64 {
#[cfg(target_pointer_width = "64")]
{
// We compute the full u64 x u64 -> u128 product, this is a single mul
// instruction on x86-64, one mul plus one mulhi on ARM64.
let full = (x as u128) * (y as u128);
let lo = full as u64;
let hi = (full >> 64) as u64;
// The middle bits of the full product fluctuate the most with small
// changes in the input. This is the top bits of lo and the bottom bits
// of hi. We can thus make the entire output fluctuate with small
// changes to the input by XOR'ing these two halves.
lo ^ hi
// Unfortunately both 2^64 + 1 and 2^64 - 1 have small prime factors,
// otherwise combining with + or - could result in a really strong hash, as:
// x * y = 2^64 * hi + lo = (-1) * hi + lo = lo - hi, (mod 2^64 + 1)
// x * y = 2^64 * hi + lo = 1 * hi + lo = lo + hi, (mod 2^64 - 1)
// Multiplicative hashing is universal in a field (like mod p).
}
#[cfg(target_pointer_width = "32")]
{
// u64 x u64 -> u128 product is prohibitively expensive on 32-bit.
// Decompose into 32-bit parts.
let lx = x as u32;
let ly = y as u32;
let hx = (x >> 32) as u32;
let hy = (y >> 32) as u32;
// u32 x u32 -> u64 the low bits of one with the high bits of the other.
let afull = (lx as u64) * (hy as u64);
let bfull = (hx as u64) * (ly as u64);
// Combine, swapping low/high of one of them so the upper bits of the
// product of one combine with the lower bits of the other.
afull ^ bfull.rotate_right(32)
}
}
/// A wyhash-inspired non-collision-resistant hash for strings/slices designed
/// by Orson Peters, with a focus on small strings and small codesize.
///
/// The 64-bit version of this hash passes the SMHasher3 test suite on the full
/// 64-bit output, that is, f(hash_bytes(b) ^ f(seed)) for some good avalanching
/// permutation f() passed all tests with zero failures. When using the 32-bit
/// version of multiply_mix this hash has a few non-catastrophic failures where
/// there are a handful more collisions than an optimal hash would give.
///
/// We don't bother avalanching here as we'll feed this hash into a
/// multiplication after which we take the high bits, which avalanches for us.
#[inline]
fn hash_bytes(bytes: &[u8]) -> u64 {
let len = bytes.len();
let mut s0 = SEED1;
let mut s1 = SEED2;
if len <= 16 {
// XOR the input into s0, s1.
if len >= 8 {
s0 ^= u64::from_le_bytes(bytes[0..8].try_into().unwrap());
s1 ^= u64::from_le_bytes(bytes[len - 8..].try_into().unwrap());
} else if len >= 4 {
s0 ^= u32::from_le_bytes(bytes[0..4].try_into().unwrap()) as u64;
s1 ^= u32::from_le_bytes(bytes[len - 4..].try_into().unwrap()) as u64;
} else if len > 0 {
let lo = bytes[0];
let mid = bytes[len / 2];
let hi = bytes[len - 1];
s0 ^= lo as u64;
s1 ^= ((hi as u64) << 8) | mid as u64;
}
} else {
// Handle bulk (can partially overlap with suffix).
let mut off = 0;
while off < len - 16 {
let x = u64::from_le_bytes(bytes[off..off + 8].try_into().unwrap());
let y = u64::from_le_bytes(bytes[off + 8..off + 16].try_into().unwrap());
// Replace s1 with a mix of s0, x, and y, and s0 with s1.
// This ensures the compiler can unroll this loop into two
// independent streams, one operating on s0, the other on s1.
//
// Since zeroes are a common input we prevent an immediate trivial
// collapse of the hash function by XOR'ing a constant with y.
let t = multiply_mix(s0 ^ x, PREVENT_TRIVIAL_ZERO_COLLAPSE ^ y);
s0 = s1;
s1 = t;
off += 16;
}
let suffix = &bytes[len - 16..];
s0 ^= u64::from_le_bytes(suffix[0..8].try_into().unwrap());
s1 ^= u64::from_le_bytes(suffix[8..16].try_into().unwrap());
}
multiply_mix(s0, s1) ^ (len as u64)
}
/// An implementation of [`BuildHasher`] that produces [`FxHasher`]s.
///
/// ```
/// use std::hash::BuildHasher;
/// use rustc_hash::FxBuildHasher;
/// assert_ne!(FxBuildHasher.hash_one(1), FxBuildHasher.hash_one(2));
/// ```
#[derive(Copy, Clone, Default)]
pub struct FxBuildHasher;
impl BuildHasher for FxBuildHasher {
type Hasher = FxHasher;
fn build_hasher(&self) -> FxHasher {
FxHasher::default()
}
}
#[cfg(test)]
mod tests {
#[cfg(not(any(target_pointer_width = "64", target_pointer_width = "32")))]
compile_error!("The test suite only supports 64 bit and 32 bit usize");
use crate::{FxBuildHasher, FxHasher};
use core::hash::{BuildHasher, Hash, Hasher};
macro_rules! test_hash {
(
$(
hash($value:expr) == $result:expr,
)*
) => {
$(
assert_eq!(FxBuildHasher.hash_one($value), $result);
)*
};
}
const B32: bool = cfg!(target_pointer_width = "32");
#[test]
fn unsigned() {
test_hash! {
hash(0_u8) == 0,
hash(1_u8) == if B32 { 3001993707 } else { 12583873379513078615 },
hash(100_u8) == if B32 { 3844759569 } else { 4008740938959785536 },
hash(u8::MAX) == if B32 { 999399879 } else { 17600987023830959190 },
hash(0_u16) == 0,
hash(1_u16) == if B32 { 3001993707 } else { 12583873379513078615 },
hash(100_u16) == if B32 { 3844759569 } else { 4008740938959785536 },
hash(u16::MAX) == if B32 { 3440503042 } else { 4001367065645062987 },
hash(0_u32) == 0,
hash(1_u32) == if B32 { 3001993707 } else { 12583873379513078615 },
hash(100_u32) == if B32 { 3844759569 } else { 4008740938959785536 },
hash(u32::MAX) == if B32 { 1293006356 } else { 17126373362251322066 },
hash(0_u64) == 0,
hash(1_u64) == if B32 { 275023839 } else { 12583873379513078615 },
hash(100_u64) == if B32 { 1732383522 } else { 4008740938959785536 },
hash(u64::MAX) == if B32 { 1017982517 } else { 5862870694197521576 },
hash(0_u128) == 0,
hash(1_u128) == if B32 { 1860738631 } else { 12885773367358079611 },
hash(100_u128) == if B32 { 1389515751 } else { 15751995649841559633 },
hash(u128::MAX) == if B32 { 2156022013 } else { 11423841400550042156 },
hash(0_usize) == 0,
hash(1_usize) == if B32 { 3001993707 } else { 12583873379513078615 },
hash(100_usize) == if B32 { 3844759569 } else { 4008740938959785536 },
hash(usize::MAX) == if B32 { 1293006356 } else { 5862870694197521576 },
}
}
#[test]
fn signed() {
test_hash! {
hash(i8::MIN) == if B32 { 2000713177 } else { 5869058164817243095 },
hash(0_i8) == 0,
hash(1_i8) == if B32 { 3001993707 } else { 12583873379513078615 },
hash(100_i8) == if B32 { 3844759569 } else { 4008740938959785536 },
hash(i8::MAX) == if B32 { 3293686765 } else { 11731928859014764671 },
hash(i16::MIN) == if B32 { 1073764727 } else { 8292620222579070801 },
hash(0_i16) == 0,
hash(1_i16) == if B32 { 3001993707 } else { 12583873379513078615 },
hash(100_i16) == if B32 { 3844759569 } else { 4008740938959785536 },
hash(i16::MAX) == if B32 { 2366738315 } else { 14155490916776592377 },
hash(i32::MIN) == if B32 { 16384 } else { 5631751334026900245 },
hash(0_i32) == 0,
hash(1_i32) == if B32 { 3001993707 } else { 12583873379513078615 },
hash(100_i32) == if B32 { 3844759569 } else { 4008740938959785536 },
hash(i32::MAX) == if B32 { 1293022740 } else { 11494622028224421821 },
hash(i64::MIN) == if B32 { 16384 } else { 524288 },
hash(0_i64) == 0,
hash(1_i64) == if B32 { 275023839 } else { 12583873379513078615 },
hash(100_i64) == if B32 { 1732383522 } else { 4008740938959785536 },
hash(i64::MAX) == if B32 { 1017998901 } else { 5862870694198045864 },
hash(i128::MIN) == if B32 { 16384 } else { 524288 },
hash(0_i128) == 0,
hash(1_i128) == if B32 { 1860738631 } else { 12885773367358079611 },
hash(100_i128) == if B32 { 1389515751 } else { 15751995649841559633 },
hash(i128::MAX) == if B32 { 2156005629 } else { 11423841400549517868 },
hash(isize::MIN) == if B32 { 16384 } else { 524288 },
hash(0_isize) == 0,
hash(1_isize) == if B32 { 3001993707 } else { 12583873379513078615 },
hash(100_isize) == if B32 { 3844759569 } else { 4008740938959785536 },
hash(isize::MAX) == if B32 { 1293022740 } else { 5862870694198045864 },
}
}
// Avoid relying on any `Hash` implementations in the standard library.
struct HashBytes(&'static [u8]);
impl Hash for HashBytes {
fn hash<H: core::hash::Hasher>(&self, state: &mut H) {
state.write(self.0);
}
}
#[test]
fn bytes() {
test_hash! {
hash(HashBytes(&[])) == if B32 { 2673204745 } else { 5175017818631658678 },
hash(HashBytes(&[0])) == if B32 { 2948228584 } else { 11037888512829180254 },
hash(HashBytes(&[0, 0, 0, 0, 0, 0])) == if B32 { 3223252423 } else { 6891281800865632452 },
hash(HashBytes(&[1])) == if B32 { 2943445104 } else { 4127763515449136980 },
hash(HashBytes(&[2])) == if B32 { 1055423297 } else { 11322700005987241762 },
hash(HashBytes(b"uwu")) == if B32 { 2699662140 } else { 2129615206728903013 },
hash(HashBytes(b"These are some bytes for testing rustc_hash.")) == if B32 { 2303640537 } else { 5513083560975408889 },
}
}
#[test]
fn with_seed_actually_different() {
let seeds = [
[1, 2],
[42, 17],
[124436707, 99237],
[usize::MIN, usize::MAX],
];
for [a_seed, b_seed] in seeds {
let a = || FxHasher::with_seed(a_seed);
let b = || FxHasher::with_seed(b_seed);
for x in u8::MIN..=u8::MAX {
let mut a = a();
let mut b = b();
x.hash(&mut a);
x.hash(&mut b);
assert_ne!(a.finish(), b.finish())
}
}
}
}