| // This file is part of ICU4X. For terms of use, please see the file |
| // called LICENSE at the top level of the ICU4X source tree |
| // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). |
| |
| //! Varint spec for ZeroTrie: |
| //! |
| //! - Lead byte: top M (2 or 3) bits are metadata; next is varint extender; rest is value |
| //! - Trail bytes: top bit is varint extender; rest are low bits of value |
| //! - Guaranteed uniqueness of varint by adding "latent value" for each extender byte |
| //! - No maximum, but high bits will be dropped if they don't fit in the platform's `usize` |
| //! |
| //! This is best shown by examples. |
| //! |
| //! ```txt |
| //! xxx0'1010 = 10 |
| //! xxx0'1111 = 15 (largest single-byte value with M=3) |
| //! xxx1'0000 0000'0000 must be 16 (smallest two-byte value with M=3) |
| //! xxx1'0000 0000'0001 = 17 |
| //! xxx1'1111 0111'1111 = 2063 (largest two-byte value with M=3) |
| //! xxx1'0000 1000'0000 0000'0000 must be 2064 (smallest three-byte value with M=3) |
| //! xxx1'0000 1000'0000 0000'0001 = 2065 |
| //! ``` |
| //! |
| //! The latent values by number of bytes for M=3 are: |
| //! |
| //! - 1 byte: 0 |
| //! - 2 bytes: 16 = 0x10 = 0b10000 |
| //! - 3 bytes: 2064 = 0x810 = 0b100000010000 |
| //! - 4 bytes: 264208 = 0x40810 = 0b1000000100000010000 |
| //! - 5 bytes: 33818640 = 0x2040810 = 0b10000001000000100000010000 |
| //! - … |
| //! |
| //! For M=2, the latent values are: |
| //! |
| //! - 1 byte: 0 |
| //! - 2 bytes: 32 = 0x20 = 0b100000 |
| //! - 3 bytes: 4128 = 0x1020 = 0b1000000100000 |
| //! - 4 bytes: 524320 = 0x81020 = 0b10000001000000100000 |
| //! - 5 bytes: 67637280 = 0x4081020 = 0b100000010000001000000100000 |
| //! - … |
| |
| use crate::builder::konst::ConstArrayBuilder; |
| |
| #[cfg(feature = "alloc")] |
| use crate::builder::nonconst::TrieBuilderStore; |
| |
| /// Reads a varint with 2 bits of metadata in the lead byte. |
| /// |
| /// Returns the varint value and a subslice of `remainder` with the varint bytes removed. |
| /// |
| /// If the varint spills off the end of the slice, a debug assertion will fail, |
| /// and the function will return the value up to that point. |
| pub const fn read_varint_meta2(start: u8, remainder: &[u8]) -> (usize, &[u8]) { |
| let mut value = (start & 0b00011111) as usize; |
| let mut remainder = remainder; |
| if (start & 0b00100000) != 0 { |
| loop { |
| let next; |
| (next, remainder) = debug_unwrap!(remainder.split_first(), break, "invalid varint"); |
| // Note: value << 7 could drop high bits. The first addition can't overflow. |
| // The second addition could overflow; in such a case we just inform the |
| // developer via the debug assertion. |
| value = (value << 7) + ((*next & 0b01111111) as usize) + 32; |
| if (*next & 0b10000000) == 0 { |
| break; |
| } |
| } |
| } |
| (value, remainder) |
| } |
| |
| /// Reads a varint with 3 bits of metadata in the lead byte. |
| /// |
| /// Returns the varint value and a subslice of `remainder` with the varint bytes removed. |
| /// |
| /// If the varint spills off the end of the slice, a debug assertion will fail, |
| /// and the function will return the value up to that point. |
| pub const fn read_varint_meta3(start: u8, remainder: &[u8]) -> (usize, &[u8]) { |
| let mut value = (start & 0b00001111) as usize; |
| let mut remainder = remainder; |
| if (start & 0b00010000) != 0 { |
| loop { |
| let next; |
| (next, remainder) = debug_unwrap!(remainder.split_first(), break, "invalid varint"); |
| // Note: value << 7 could drop high bits. The first addition can't overflow. |
| // The second addition could overflow; in such a case we just inform the |
| // developer via the debug assertion. |
| value = (value << 7) + ((*next & 0b01111111) as usize) + 16; |
| if (*next & 0b10000000) == 0 { |
| break; |
| } |
| } |
| } |
| (value, remainder) |
| } |
| |
| /// Reads and removes a varint with 3 bits of metadata from a [`TrieBuilderStore`]. |
| /// |
| /// Returns the varint value. |
| #[cfg(feature = "alloc")] |
| pub(crate) fn try_read_varint_meta3_from_tstore<S: TrieBuilderStore>( |
| start: u8, |
| remainder: &mut S, |
| ) -> Option<usize> { |
| let mut value = (start & 0b00001111) as usize; |
| if (start & 0b00010000) != 0 { |
| loop { |
| let next = remainder.atbs_pop_front()?; |
| // Note: value << 7 could drop high bits. The first addition can't overflow. |
| // The second addition could overflow; in such a case we just inform the |
| // developer via the debug assertion. |
| value = (value << 7) + ((next & 0b01111111) as usize) + 16; |
| if (next & 0b10000000) == 0 { |
| break; |
| } |
| } |
| } |
| Some(value) |
| } |
| |
| #[cfg(test)] |
| const MAX_VARINT: usize = usize::MAX; |
| |
| // *Upper Bound:* Each trail byte stores 7 bits of data, plus the latent value. |
| // Add an extra 1 since the lead byte holds only 5 bits of data. |
| const MAX_VARINT_LENGTH: usize = 1 + core::mem::size_of::<usize>() * 8 / 7; |
| |
| /// Returns a new [`ConstArrayBuilder`] containing a varint with 2 bits of metadata. |
| pub(crate) const fn write_varint_meta2(value: usize) -> ConstArrayBuilder<MAX_VARINT_LENGTH, u8> { |
| let mut result = [0; MAX_VARINT_LENGTH]; |
| let mut i = MAX_VARINT_LENGTH - 1; |
| let mut value = value; |
| let mut last = true; |
| loop { |
| if value < 32 { |
| result[i] = value as u8; |
| if !last { |
| result[i] |= 0b00100000; |
| } |
| break; |
| } |
| value -= 32; |
| result[i] = (value as u8) & 0b01111111; |
| if !last { |
| result[i] |= 0b10000000; |
| } else { |
| last = false; |
| } |
| value >>= 7; |
| i -= 1; |
| } |
| // The bytes are from i to the end. |
| ConstArrayBuilder::from_manual_slice(result, i, MAX_VARINT_LENGTH) |
| } |
| |
| /// Returns a new [`ConstArrayBuilder`] containing a varint with 3 bits of metadata. |
| pub(crate) const fn write_varint_meta3(value: usize) -> ConstArrayBuilder<MAX_VARINT_LENGTH, u8> { |
| let mut result = [0; MAX_VARINT_LENGTH]; |
| let mut i = MAX_VARINT_LENGTH - 1; |
| let mut value = value; |
| let mut last = true; |
| loop { |
| if value < 16 { |
| result[i] = value as u8; |
| if !last { |
| result[i] |= 0b00010000; |
| } |
| break; |
| } |
| value -= 16; |
| result[i] = (value as u8) & 0b01111111; |
| if !last { |
| result[i] |= 0b10000000; |
| } else { |
| last = false; |
| } |
| value >>= 7; |
| i -= 1; |
| } |
| // The bytes are from i to the end. |
| ConstArrayBuilder::from_manual_slice(result, i, MAX_VARINT_LENGTH) |
| } |
| |
| /// A secondary implementation that separates the latent value while computing the varint. |
| #[cfg(test)] |
| pub(crate) const fn write_varint_reference( |
| value: usize, |
| ) -> ConstArrayBuilder<MAX_VARINT_LENGTH, u8> { |
| let mut result = [0; MAX_VARINT_LENGTH]; |
| if value < 32 { |
| result[0] = value as u8; |
| return ConstArrayBuilder::from_manual_slice(result, 0, 1); |
| } |
| result[0] = 32; |
| let mut latent = 32; |
| let mut steps = 2; |
| loop { |
| let next_latent = (latent << 7) + 32; |
| if value < next_latent || next_latent == latent { |
| break; |
| } |
| latent = next_latent; |
| steps += 1; |
| } |
| let mut value = value - latent; |
| let mut i = steps; |
| while i > 0 { |
| i -= 1; |
| result[i] |= (value as u8) & 0b01111111; |
| value >>= 7; |
| if i > 0 && i < steps - 1 { |
| result[i] |= 0b10000000; |
| } |
| } |
| // The bytes are from 0 to `steps`. |
| ConstArrayBuilder::from_manual_slice(result, 0, steps) |
| } |
| |
| #[cfg(test)] |
| mod tests { |
| use super::*; |
| |
| #[derive(Debug)] |
| struct TestCase<'a> { |
| bytes: &'a [u8], |
| remainder: &'a [u8], |
| value: usize, |
| } |
| static CASES: &[TestCase] = &[ |
| TestCase { |
| bytes: &[0b00000000], |
| remainder: &[], |
| value: 0, |
| }, |
| TestCase { |
| bytes: &[0b00001010], |
| remainder: &[], |
| value: 10, |
| }, |
| TestCase { |
| bytes: &[0b00011111], |
| remainder: &[], |
| value: 31, |
| }, |
| TestCase { |
| bytes: &[0b00011111, 0b10101010], |
| remainder: &[0b10101010], |
| value: 31, |
| }, |
| TestCase { |
| bytes: &[0b00100000, 0b00000000], |
| remainder: &[], |
| value: 32, |
| }, |
| TestCase { |
| bytes: &[0b00100000, 0b00000001], |
| remainder: &[], |
| value: 33, |
| }, |
| TestCase { |
| bytes: &[0b00100000, 0b00100000], |
| remainder: &[], |
| value: 64, |
| }, |
| TestCase { |
| bytes: &[0x20, 0x44], |
| remainder: &[], |
| value: 100, |
| }, |
| TestCase { |
| bytes: &[0b00100000, 0b01111111], |
| remainder: &[], |
| value: 159, |
| }, |
| TestCase { |
| bytes: &[0b00100001, 0b00000000], |
| remainder: &[], |
| value: 160, |
| }, |
| TestCase { |
| bytes: &[0b00100001, 0b00000001], |
| remainder: &[], |
| value: 161, |
| }, |
| TestCase { |
| bytes: &[0x23, 0x54], |
| remainder: &[], |
| value: 500, |
| }, |
| TestCase { |
| bytes: &[0b00111111, 0b01111111], |
| remainder: &[], |
| value: 4127, // 32 + (1 << 12) - 1 |
| }, |
| TestCase { |
| bytes: &[0b00100000, 0b10000000, 0b00000000], |
| remainder: &[], |
| value: 4128, // 32 + (1 << 12) |
| }, |
| TestCase { |
| bytes: &[0b00100000, 0b10000000, 0b00000001], |
| remainder: &[], |
| value: 4129, // 32 + (1 << 12) + 1 |
| }, |
| TestCase { |
| bytes: &[0b00100000, 0b10000000, 0b01111111], |
| remainder: &[], |
| value: 4255, // 32 + (1 << 12) + 127 |
| }, |
| TestCase { |
| bytes: &[0b00100000, 0b10000001, 0b00000000], |
| remainder: &[], |
| value: 4256, // 32 + (1 << 12) + 128 |
| }, |
| TestCase { |
| bytes: &[0b00100000, 0b10000001, 0b00000001], |
| remainder: &[], |
| value: 4257, // 32 + (1 << 12) + 129 |
| }, |
| TestCase { |
| bytes: &[0x20, 0x86, 0x68], |
| remainder: &[], |
| value: 5000, |
| }, |
| TestCase { |
| bytes: &[0b00100000, 0b11111111, 0b01111111], |
| remainder: &[], |
| value: 20511, // 32 + (1 << 12) + (1 << 14) - 1 |
| }, |
| TestCase { |
| bytes: &[0b00100001, 0b10000000, 0b00000000], |
| remainder: &[], |
| value: 20512, // 32 + (1 << 12) + (1 << 14) |
| }, |
| TestCase { |
| bytes: &[0b00111111, 0b11111111, 0b01111111], |
| remainder: &[], |
| value: 528415, // 32 + (1 << 12) + (1 << 19) - 1 |
| }, |
| TestCase { |
| bytes: &[0b00100000, 0b10000000, 0b10000000, 0b00000000], |
| remainder: &[], |
| value: 528416, // 32 + (1 << 12) + (1 << 19) |
| }, |
| TestCase { |
| bytes: &[0b00100000, 0b10000000, 0b10000000, 0b00000001], |
| remainder: &[], |
| value: 528417, // 32 + (1 << 12) + (1 << 19) + 1 |
| }, |
| TestCase { |
| bytes: &[0b00111111, 0b11111111, 0b11111111, 0b01111111], |
| remainder: &[], |
| value: 67637279, // 32 + (1 << 12) + (1 << 19) + (1 << 26) - 1 |
| }, |
| TestCase { |
| bytes: &[0b00100000, 0b10000000, 0b10000000, 0b10000000, 0b00000000], |
| remainder: &[], |
| value: 67637280, // 32 + (1 << 12) + (1 << 19) + (1 << 26) |
| }, |
| ]; |
| |
| #[test] |
| fn test_read() { |
| for cas in CASES { |
| let recovered = read_varint_meta2(cas.bytes[0], &cas.bytes[1..]); |
| assert_eq!(recovered, (cas.value, cas.remainder), "{:?}", cas); |
| } |
| } |
| |
| #[test] |
| fn test_read_write() { |
| for cas in CASES { |
| let reference_bytes = write_varint_reference(cas.value); |
| assert_eq!( |
| reference_bytes.len(), |
| cas.bytes.len() - cas.remainder.len(), |
| "{:?}", |
| cas |
| ); |
| assert_eq!( |
| reference_bytes.as_slice(), |
| &cas.bytes[0..reference_bytes.len()], |
| "{:?}", |
| cas |
| ); |
| let recovered = read_varint_meta2(cas.bytes[0], &cas.bytes[1..]); |
| assert_eq!(recovered, (cas.value, cas.remainder), "{:?}", cas); |
| let write_bytes = write_varint_meta2(cas.value); |
| assert_eq!( |
| reference_bytes.as_slice(), |
| write_bytes.as_slice(), |
| "{:?}", |
| cas |
| ); |
| } |
| } |
| |
| #[test] |
| fn test_roundtrip() { |
| let mut i = 0usize; |
| while i < MAX_VARINT { |
| let bytes = write_varint_meta2(i); |
| let recovered = read_varint_meta2(bytes.as_slice()[0], &bytes.as_slice()[1..]); |
| assert_eq!(i, recovered.0, "{:?}", bytes.as_slice()); |
| i <<= 1; |
| i += 1; |
| } |
| } |
| |
| #[test] |
| fn test_extended_roundtrip() { |
| let mut i = 0usize; |
| while i < MAX_VARINT { |
| let bytes = write_varint_meta3(i); |
| let recovered = read_varint_meta3(bytes.as_slice()[0], &bytes.as_slice()[1..]); |
| assert_eq!(i, recovered.0, "{:?}", bytes.as_slice()); |
| i <<= 1; |
| i += 1; |
| } |
| } |
| |
| #[test] |
| fn test_max() { |
| let reference_bytes = write_varint_reference(MAX_VARINT); |
| let write_bytes = write_varint_meta2(MAX_VARINT); |
| assert_eq!(reference_bytes.len(), MAX_VARINT_LENGTH); |
| assert_eq!(reference_bytes.as_slice(), write_bytes.as_slice()); |
| let subarray = write_bytes |
| .as_const_slice() |
| .get_subslice_or_panic(1, write_bytes.len()); |
| let (recovered_value, remainder) = read_varint_meta2( |
| *write_bytes.as_const_slice().first().unwrap(), |
| subarray.as_slice(), |
| ); |
| assert!(remainder.is_empty()); |
| assert_eq!(recovered_value, MAX_VARINT); |
| assert_eq!( |
| write_bytes.as_slice(), |
| &[ |
| 0b00100001, // |
| 0b11011111, // |
| 0b11011111, // |
| 0b11011111, // |
| 0b11011111, // |
| 0b11011111, // |
| 0b11011111, // |
| 0b11011111, // |
| 0b11011111, // |
| 0b01011111, // |
| ] |
| ); |
| } |
| |
| #[test] |
| fn text_extended_max() { |
| let write_bytes = write_varint_meta3(MAX_VARINT); |
| assert_eq!(write_bytes.len(), MAX_VARINT_LENGTH); |
| let (lead, trailing) = write_bytes.as_slice().split_first().unwrap(); |
| let (recovered_value, remainder) = read_varint_meta3(*lead, trailing); |
| assert!(remainder.is_empty()); |
| assert_eq!(recovered_value, MAX_VARINT); |
| assert_eq!( |
| write_bytes.as_slice(), |
| &[ |
| 0b00010001, // |
| 0b11101111, // |
| 0b11101111, // |
| 0b11101111, // |
| 0b11101111, // |
| 0b11101111, // |
| 0b11101111, // |
| 0b11101111, // |
| 0b11101111, // |
| 0b01101111, // |
| ] |
| ); |
| } |
| |
| #[test] |
| fn test_latent_values() { |
| // Same values documented in the module docs: M=2 |
| let m2 = read_varint_meta2; |
| assert_eq!(m2(0, &[]).0, 0); |
| assert_eq!(m2(0x20, &[0x00]).0, 32); |
| assert_eq!(m2(0x20, &[0x80, 0x00]).0, 4128); |
| assert_eq!(m2(0x20, &[0x80, 0x80, 0x00]).0, 528416); |
| assert_eq!(m2(0x20, &[0x80, 0x80, 0x80, 0x00]).0, 67637280); |
| |
| // Same values documented in the module docs: M=3 |
| let m3 = read_varint_meta3; |
| assert_eq!(m3(0, &[]).0, 0); |
| assert_eq!(m3(0x10, &[0x00]).0, 16); |
| assert_eq!(m3(0x10, &[0x80, 0x00]).0, 2064); |
| assert_eq!(m3(0x10, &[0x80, 0x80, 0x00]).0, 264208); |
| assert_eq!(m3(0x10, &[0x80, 0x80, 0x80, 0x00]).0, 33818640); |
| } |
| } |