Henri Chataing | 1e09924 | 2022-02-08 11:32:43 +0100 | [diff] [blame] | 1 | /*! |
| 2 | The ucd-trie crate provides a compressed trie set specifically tailored for |
| 3 | Unicode codepoints. The principle use case for such a trie is to represent |
| 4 | properties defined by Unicode that correspond to sets of Unicode codepoints. |
| 5 | (These properties are formally called boolean properties or "single valued" |
| 6 | properties. See |
Jeff Vander Stoep | 8820631 | 2022-12-19 11:12:50 +0100 | [diff] [blame] | 7 | [UTR#23 S3.3](https://www.unicode.org/reports/tr23/#PropertyTypeDefinitions) |
Henri Chataing | 1e09924 | 2022-02-08 11:32:43 +0100 | [diff] [blame] | 8 | for more details.) |
| 9 | |
| 10 | This crate has two principle types: `TrieSetOwned` and `TrieSetSlice`, |
| 11 | corresponding to a similar split as there is between `Vec<T>` and `&[T]`. |
| 12 | `TrieSetOwned` is the only way to construct a trie from a set of Unicode |
| 13 | codepoints. |
| 14 | |
| 15 | The intended use of this library is to embed a static instance of |
| 16 | `TrieSetSlice` into your source code, and then use its methods as defined in |
| 17 | this crate to test membership. (The `ucd-generate` tool can likely generate |
| 18 | this code for you.) |
| 19 | |
| 20 | Finally, while this crate uses the standard library by default, it provides |
| 21 | `no_std` functionality by disabling the `std` feature. When `no_std` is |
| 22 | enabled, then `TrieSetOwned` is not provided. Instead, only `TrieSetSlice` is |
| 23 | provided, which means `no_std` crates can still embed tries into their code. |
| 24 | */ |
| 25 | |
| 26 | #![deny(missing_docs)] |
| 27 | #![cfg_attr(not(feature = "std"), no_std)] |
| 28 | |
| 29 | use core::fmt; |
| 30 | |
| 31 | #[cfg(feature = "std")] |
| 32 | pub use crate::owned::{Error, Result, TrieSetOwned}; |
| 33 | |
| 34 | #[cfg(test)] |
| 35 | #[allow(dead_code)] |
| 36 | mod general_category; |
| 37 | #[cfg(feature = "std")] |
| 38 | mod owned; |
| 39 | |
| 40 | const CHUNK_SIZE: usize = 64; |
| 41 | |
| 42 | /// A type alias for `TrieSetSlice<'static>`. |
| 43 | pub type TrieSet = TrieSetSlice<'static>; |
| 44 | |
| 45 | /// A borrowed trie set. |
| 46 | #[derive(Clone, Copy)] |
| 47 | pub struct TrieSetSlice<'a> { |
| 48 | /// first tree, one level |
| 49 | #[doc(hidden)] |
| 50 | pub tree1_level1: &'a [u64], |
| 51 | /// second tree, first level |
| 52 | #[doc(hidden)] |
| 53 | pub tree2_level1: &'a [u8], |
| 54 | /// second tree, second level |
| 55 | #[doc(hidden)] |
| 56 | pub tree2_level2: &'a [u64], |
| 57 | /// third tree, first level |
| 58 | #[doc(hidden)] |
| 59 | pub tree3_level1: &'a [u8], |
| 60 | /// third tree, second level |
| 61 | #[doc(hidden)] |
| 62 | pub tree3_level2: &'a [u8], |
| 63 | /// third tree, third level |
| 64 | #[doc(hidden)] |
| 65 | pub tree3_level3: &'a [u64], |
| 66 | } |
| 67 | |
| 68 | impl<'a> fmt::Debug for TrieSetSlice<'a> { |
| 69 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| 70 | write!(f, "TrieSetSlice(...)") |
| 71 | } |
| 72 | } |
| 73 | |
| 74 | impl<'a> TrieSetSlice<'a> { |
| 75 | /// Returns true if and only if the given Unicode scalar value is in this |
| 76 | /// set. |
| 77 | pub fn contains_char(&self, c: char) -> bool { |
| 78 | self.contains(c as usize) |
| 79 | } |
| 80 | |
| 81 | /// Returns true if and only if the given codepoint is in this set. |
| 82 | /// |
| 83 | /// If the given value exceeds the codepoint range (i.e., it's greater |
| 84 | /// than `0x10FFFF`), then this returns false. |
| 85 | pub fn contains_u32(&self, cp: u32) -> bool { |
| 86 | if cp > 0x10FFFF { |
| 87 | return false; |
| 88 | } |
| 89 | self.contains(cp as usize) |
| 90 | } |
| 91 | |
| 92 | #[inline(always)] |
| 93 | fn contains(&self, cp: usize) -> bool { |
| 94 | if cp < 0x800 { |
| 95 | self.chunk_contains(cp, self.tree1_level1[cp >> 6]) |
| 96 | } else if cp < 0x10000 { |
| 97 | let leaf = match self.tree2_level1.get((cp >> 6) - 0x20) { |
| 98 | None => return false, |
| 99 | Some(&leaf) => leaf, |
| 100 | }; |
| 101 | self.chunk_contains(cp, self.tree2_level2[leaf as usize]) |
| 102 | } else { |
| 103 | let child = match self.tree3_level1.get((cp >> 12) - 0x10) { |
| 104 | None => return false, |
| 105 | Some(&child) => child, |
| 106 | }; |
| 107 | let i = ((child as usize) * CHUNK_SIZE) + ((cp >> 6) & 0b111111); |
| 108 | let leaf = self.tree3_level2[i]; |
| 109 | self.chunk_contains(cp, self.tree3_level3[leaf as usize]) |
| 110 | } |
| 111 | } |
| 112 | |
| 113 | #[inline(always)] |
| 114 | fn chunk_contains(&self, cp: usize, chunk: u64) -> bool { |
| 115 | ((chunk >> (cp & 0b111111)) & 1) == 1 |
| 116 | } |
| 117 | } |