| use alloc::{ |
| string::{String, ToString}, |
| vec::Vec, |
| }; |
| |
| use crate::hir; |
| |
| /// An inclusive range of codepoints from a generated file (hence the static |
| /// lifetime). |
| type Range = &'static [(char, char)]; |
| |
| /// An error that occurs when dealing with Unicode. |
| /// |
| /// We don't impl the Error trait here because these always get converted |
| /// into other public errors. (This error type isn't exported.) |
| #[derive(Debug)] |
| pub enum Error { |
| PropertyNotFound, |
| PropertyValueNotFound, |
| // Not used when unicode-perl is enabled. |
| #[allow(dead_code)] |
| PerlClassNotFound, |
| } |
| |
| /// An error that occurs when Unicode-aware simple case folding fails. |
| /// |
| /// This error can occur when the case mapping tables necessary for Unicode |
| /// aware case folding are unavailable. This only occurs when the |
| /// `unicode-case` feature is disabled. (The feature is enabled by default.) |
| #[derive(Debug)] |
| pub struct CaseFoldError(()); |
| |
| #[cfg(feature = "std")] |
| impl std::error::Error for CaseFoldError {} |
| |
| impl core::fmt::Display for CaseFoldError { |
| fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { |
| write!( |
| f, |
| "Unicode-aware case folding is not available \ |
| (probably because the unicode-case feature is not enabled)" |
| ) |
| } |
| } |
| |
| /// An error that occurs when the Unicode-aware `\w` class is unavailable. |
| /// |
| /// This error can occur when the data tables necessary for the Unicode aware |
| /// Perl character class `\w` are unavailable. This only occurs when the |
| /// `unicode-perl` feature is disabled. (The feature is enabled by default.) |
| #[derive(Debug)] |
| pub struct UnicodeWordError(()); |
| |
| #[cfg(feature = "std")] |
| impl std::error::Error for UnicodeWordError {} |
| |
| impl core::fmt::Display for UnicodeWordError { |
| fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { |
| write!( |
| f, |
| "Unicode-aware \\w class is not available \ |
| (probably because the unicode-perl feature is not enabled)" |
| ) |
| } |
| } |
| |
| /// A state oriented traverser of the simple case folding table. |
| /// |
| /// A case folder can be constructed via `SimpleCaseFolder::new()`, which will |
| /// return an error if the underlying case folding table is unavailable. |
| /// |
| /// After construction, it is expected that callers will use |
| /// `SimpleCaseFolder::mapping` by calling it with codepoints in strictly |
| /// increasing order. For example, calling it on `b` and then on `a` is illegal |
| /// and will result in a panic. |
| /// |
| /// The main idea of this type is that it tries hard to make mapping lookups |
| /// fast by exploiting the structure of the underlying table, and the ordering |
| /// assumption enables this. |
| #[derive(Debug)] |
| pub struct SimpleCaseFolder { |
| /// The simple case fold table. It's a sorted association list, where the |
| /// keys are Unicode scalar values and the values are the corresponding |
| /// equivalence class (not including the key) of the "simple" case folded |
| /// Unicode scalar values. |
| table: &'static [(char, &'static [char])], |
| /// The last codepoint that was used for a lookup. |
| last: Option<char>, |
| /// The index to the entry in `table` corresponding to the smallest key `k` |
| /// such that `k > k0`, where `k0` is the most recent key lookup. Note that |
| /// in particular, `k0` may not be in the table! |
| next: usize, |
| } |
| |
| impl SimpleCaseFolder { |
| /// Create a new simple case folder, returning an error if the underlying |
| /// case folding table is unavailable. |
| pub fn new() -> Result<SimpleCaseFolder, CaseFoldError> { |
| #[cfg(not(feature = "unicode-case"))] |
| { |
| Err(CaseFoldError(())) |
| } |
| #[cfg(feature = "unicode-case")] |
| { |
| Ok(SimpleCaseFolder { |
| table: crate::unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE, |
| last: None, |
| next: 0, |
| }) |
| } |
| } |
| |
| /// Return the equivalence class of case folded codepoints for the given |
| /// codepoint. The equivalence class returned never includes the codepoint |
| /// given. If the given codepoint has no case folded codepoints (i.e., |
| /// no entry in the underlying case folding table), then this returns an |
| /// empty slice. |
| /// |
| /// # Panics |
| /// |
| /// This panics when called with a `c` that is less than or equal to the |
| /// previous call. In other words, callers need to use this method with |
| /// strictly increasing values of `c`. |
| pub fn mapping(&mut self, c: char) -> &'static [char] { |
| if let Some(last) = self.last { |
| assert!( |
| last < c, |
| "got codepoint U+{:X} which occurs before \ |
| last codepoint U+{:X}", |
| u32::from(c), |
| u32::from(last), |
| ); |
| } |
| self.last = Some(c); |
| if self.next >= self.table.len() { |
| return &[]; |
| } |
| let (k, v) = self.table[self.next]; |
| if k == c { |
| self.next += 1; |
| return v; |
| } |
| match self.get(c) { |
| Err(i) => { |
| self.next = i; |
| &[] |
| } |
| Ok(i) => { |
| // Since we require lookups to proceed |
| // in order, anything we find should be |
| // after whatever we thought might be |
| // next. Otherwise, the caller is either |
| // going out of order or we would have |
| // found our next key at 'self.next'. |
| assert!(i > self.next); |
| self.next = i + 1; |
| self.table[i].1 |
| } |
| } |
| } |
| |
| /// Returns true if and only if the given range overlaps with any region |
| /// of the underlying case folding table. That is, when true, there exists |
| /// at least one codepoint in the inclusive range `[start, end]` that has |
| /// a non-trivial equivalence class of case folded codepoints. Conversely, |
| /// when this returns false, all codepoints in the range `[start, end]` |
| /// correspond to the trivial equivalence class of case folded codepoints, |
| /// i.e., itself. |
| /// |
| /// This is useful to call before iterating over the codepoints in the |
| /// range and looking up the mapping for each. If you know none of the |
| /// mappings will return anything, then you might be able to skip doing it |
| /// altogether. |
| /// |
| /// # Panics |
| /// |
| /// This panics when `end < start`. |
| pub fn overlaps(&self, start: char, end: char) -> bool { |
| use core::cmp::Ordering; |
| |
| assert!(start <= end); |
| self.table |
| .binary_search_by(|&(c, _)| { |
| if start <= c && c <= end { |
| Ordering::Equal |
| } else if c > end { |
| Ordering::Greater |
| } else { |
| Ordering::Less |
| } |
| }) |
| .is_ok() |
| } |
| |
| /// Returns the index at which `c` occurs in the simple case fold table. If |
| /// `c` does not occur, then this returns an `i` such that `table[i-1].0 < |
| /// c` and `table[i].0 > c`. |
| fn get(&self, c: char) -> Result<usize, usize> { |
| self.table.binary_search_by_key(&c, |&(c1, _)| c1) |
| } |
| } |
| |
| /// A query for finding a character class defined by Unicode. This supports |
| /// either use of a property name directly, or lookup by property value. The |
| /// former generally refers to Binary properties (see UTS#44, Table 8), but |
| /// as a special exception (see UTS#18, Section 1.2) both general categories |
| /// (an enumeration) and scripts (a catalog) are supported as if each of their |
| /// possible values were a binary property. |
| /// |
| /// In all circumstances, property names and values are normalized and |
| /// canonicalized. That is, `GC == gc == GeneralCategory == general_category`. |
| /// |
| /// The lifetime `'a` refers to the shorter of the lifetimes of property name |
| /// and property value. |
| #[derive(Debug)] |
| pub enum ClassQuery<'a> { |
| /// Return a class corresponding to a Unicode binary property, named by |
| /// a single letter. |
| OneLetter(char), |
| /// Return a class corresponding to a Unicode binary property. |
| /// |
| /// Note that, by special exception (see UTS#18, Section 1.2), both |
| /// general category values and script values are permitted here as if |
| /// they were a binary property. |
| Binary(&'a str), |
| /// Return a class corresponding to all codepoints whose property |
| /// (identified by `property_name`) corresponds to the given value |
| /// (identified by `property_value`). |
| ByValue { |
| /// A property name. |
| property_name: &'a str, |
| /// A property value. |
| property_value: &'a str, |
| }, |
| } |
| |
| impl<'a> ClassQuery<'a> { |
| fn canonicalize(&self) -> Result<CanonicalClassQuery, Error> { |
| match *self { |
| ClassQuery::OneLetter(c) => self.canonical_binary(&c.to_string()), |
| ClassQuery::Binary(name) => self.canonical_binary(name), |
| ClassQuery::ByValue { property_name, property_value } => { |
| let property_name = symbolic_name_normalize(property_name); |
| let property_value = symbolic_name_normalize(property_value); |
| |
| let canon_name = match canonical_prop(&property_name)? { |
| None => return Err(Error::PropertyNotFound), |
| Some(canon_name) => canon_name, |
| }; |
| Ok(match canon_name { |
| "General_Category" => { |
| let canon = match canonical_gencat(&property_value)? { |
| None => return Err(Error::PropertyValueNotFound), |
| Some(canon) => canon, |
| }; |
| CanonicalClassQuery::GeneralCategory(canon) |
| } |
| "Script" => { |
| let canon = match canonical_script(&property_value)? { |
| None => return Err(Error::PropertyValueNotFound), |
| Some(canon) => canon, |
| }; |
| CanonicalClassQuery::Script(canon) |
| } |
| _ => { |
| let vals = match property_values(canon_name)? { |
| None => return Err(Error::PropertyValueNotFound), |
| Some(vals) => vals, |
| }; |
| let canon_val = |
| match canonical_value(vals, &property_value) { |
| None => { |
| return Err(Error::PropertyValueNotFound) |
| } |
| Some(canon_val) => canon_val, |
| }; |
| CanonicalClassQuery::ByValue { |
| property_name: canon_name, |
| property_value: canon_val, |
| } |
| } |
| }) |
| } |
| } |
| } |
| |
| fn canonical_binary( |
| &self, |
| name: &str, |
| ) -> Result<CanonicalClassQuery, Error> { |
| let norm = symbolic_name_normalize(name); |
| |
| // This is a special case where 'cf' refers to the 'Format' general |
| // category, but where the 'cf' abbreviation is also an abbreviation |
| // for the 'Case_Folding' property. But we want to treat it as |
| // a general category. (Currently, we don't even support the |
| // 'Case_Folding' property. But if we do in the future, users will be |
| // required to spell it out.) |
| // |
| // Also 'sc' refers to the 'Currency_Symbol' general category, but is |
| // also the abbreviation for the 'Script' property. So we avoid calling |
| // 'canonical_prop' for it too, which would erroneously normalize it |
| // to 'Script'. |
| // |
| // Another case: 'lc' is an abbreviation for the 'Cased_Letter' |
| // general category, but is also an abbreviation for the 'Lowercase_Mapping' |
| // property. We don't currently support the latter, so as with 'cf' |
| // above, we treat 'lc' as 'Cased_Letter'. |
| if norm != "cf" && norm != "sc" && norm != "lc" { |
| if let Some(canon) = canonical_prop(&norm)? { |
| return Ok(CanonicalClassQuery::Binary(canon)); |
| } |
| } |
| if let Some(canon) = canonical_gencat(&norm)? { |
| return Ok(CanonicalClassQuery::GeneralCategory(canon)); |
| } |
| if let Some(canon) = canonical_script(&norm)? { |
| return Ok(CanonicalClassQuery::Script(canon)); |
| } |
| Err(Error::PropertyNotFound) |
| } |
| } |
| |
| /// Like ClassQuery, but its parameters have been canonicalized. This also |
| /// differentiates binary properties from flattened general categories and |
| /// scripts. |
| #[derive(Debug, Eq, PartialEq)] |
| enum CanonicalClassQuery { |
| /// The canonical binary property name. |
| Binary(&'static str), |
| /// The canonical general category name. |
| GeneralCategory(&'static str), |
| /// The canonical script name. |
| Script(&'static str), |
| /// An arbitrary association between property and value, both of which |
| /// have been canonicalized. |
| /// |
| /// Note that by construction, the property name of ByValue will never |
| /// be General_Category or Script. Those two cases are subsumed by the |
| /// eponymous variants. |
| ByValue { |
| /// The canonical property name. |
| property_name: &'static str, |
| /// The canonical property value. |
| property_value: &'static str, |
| }, |
| } |
| |
| /// Looks up a Unicode class given a query. If one doesn't exist, then |
| /// `None` is returned. |
| pub fn class(query: ClassQuery<'_>) -> Result<hir::ClassUnicode, Error> { |
| use self::CanonicalClassQuery::*; |
| |
| match query.canonicalize()? { |
| Binary(name) => bool_property(name), |
| GeneralCategory(name) => gencat(name), |
| Script(name) => script(name), |
| ByValue { property_name: "Age", property_value } => { |
| let mut class = hir::ClassUnicode::empty(); |
| for set in ages(property_value)? { |
| class.union(&hir_class(set)); |
| } |
| Ok(class) |
| } |
| ByValue { property_name: "Script_Extensions", property_value } => { |
| script_extension(property_value) |
| } |
| ByValue { |
| property_name: "Grapheme_Cluster_Break", |
| property_value, |
| } => gcb(property_value), |
| ByValue { property_name: "Sentence_Break", property_value } => { |
| sb(property_value) |
| } |
| ByValue { property_name: "Word_Break", property_value } => { |
| wb(property_value) |
| } |
| _ => { |
| // What else should we support? |
| Err(Error::PropertyNotFound) |
| } |
| } |
| } |
| |
| /// Returns a Unicode aware class for \w. |
| /// |
| /// This returns an error if the data is not available for \w. |
| pub fn perl_word() -> Result<hir::ClassUnicode, Error> { |
| #[cfg(not(feature = "unicode-perl"))] |
| fn imp() -> Result<hir::ClassUnicode, Error> { |
| Err(Error::PerlClassNotFound) |
| } |
| |
| #[cfg(feature = "unicode-perl")] |
| fn imp() -> Result<hir::ClassUnicode, Error> { |
| use crate::unicode_tables::perl_word::PERL_WORD; |
| Ok(hir_class(PERL_WORD)) |
| } |
| |
| imp() |
| } |
| |
| /// Returns a Unicode aware class for \s. |
| /// |
| /// This returns an error if the data is not available for \s. |
| pub fn perl_space() -> Result<hir::ClassUnicode, Error> { |
| #[cfg(not(any(feature = "unicode-perl", feature = "unicode-bool")))] |
| fn imp() -> Result<hir::ClassUnicode, Error> { |
| Err(Error::PerlClassNotFound) |
| } |
| |
| #[cfg(all(feature = "unicode-perl", not(feature = "unicode-bool")))] |
| fn imp() -> Result<hir::ClassUnicode, Error> { |
| use crate::unicode_tables::perl_space::WHITE_SPACE; |
| Ok(hir_class(WHITE_SPACE)) |
| } |
| |
| #[cfg(feature = "unicode-bool")] |
| fn imp() -> Result<hir::ClassUnicode, Error> { |
| use crate::unicode_tables::property_bool::WHITE_SPACE; |
| Ok(hir_class(WHITE_SPACE)) |
| } |
| |
| imp() |
| } |
| |
| /// Returns a Unicode aware class for \d. |
| /// |
| /// This returns an error if the data is not available for \d. |
| pub fn perl_digit() -> Result<hir::ClassUnicode, Error> { |
| #[cfg(not(any(feature = "unicode-perl", feature = "unicode-gencat")))] |
| fn imp() -> Result<hir::ClassUnicode, Error> { |
| Err(Error::PerlClassNotFound) |
| } |
| |
| #[cfg(all(feature = "unicode-perl", not(feature = "unicode-gencat")))] |
| fn imp() -> Result<hir::ClassUnicode, Error> { |
| use crate::unicode_tables::perl_decimal::DECIMAL_NUMBER; |
| Ok(hir_class(DECIMAL_NUMBER)) |
| } |
| |
| #[cfg(feature = "unicode-gencat")] |
| fn imp() -> Result<hir::ClassUnicode, Error> { |
| use crate::unicode_tables::general_category::DECIMAL_NUMBER; |
| Ok(hir_class(DECIMAL_NUMBER)) |
| } |
| |
| imp() |
| } |
| |
| /// Build a Unicode HIR class from a sequence of Unicode scalar value ranges. |
| pub fn hir_class(ranges: &[(char, char)]) -> hir::ClassUnicode { |
| let hir_ranges: Vec<hir::ClassUnicodeRange> = ranges |
| .iter() |
| .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e)) |
| .collect(); |
| hir::ClassUnicode::new(hir_ranges) |
| } |
| |
| /// Returns true only if the given codepoint is in the `\w` character class. |
| /// |
| /// If the `unicode-perl` feature is not enabled, then this returns an error. |
| pub fn is_word_character(c: char) -> Result<bool, UnicodeWordError> { |
| #[cfg(not(feature = "unicode-perl"))] |
| fn imp(_: char) -> Result<bool, UnicodeWordError> { |
| Err(UnicodeWordError(())) |
| } |
| |
| #[cfg(feature = "unicode-perl")] |
| fn imp(c: char) -> Result<bool, UnicodeWordError> { |
| use crate::{is_word_byte, unicode_tables::perl_word::PERL_WORD}; |
| |
| if u8::try_from(c).map_or(false, is_word_byte) { |
| return Ok(true); |
| } |
| Ok(PERL_WORD |
| .binary_search_by(|&(start, end)| { |
| use core::cmp::Ordering; |
| |
| if start <= c && c <= end { |
| Ordering::Equal |
| } else if start > c { |
| Ordering::Greater |
| } else { |
| Ordering::Less |
| } |
| }) |
| .is_ok()) |
| } |
| |
| imp(c) |
| } |
| |
| /// A mapping of property values for a specific property. |
| /// |
| /// The first element of each tuple is a normalized property value while the |
| /// second element of each tuple is the corresponding canonical property |
| /// value. |
| type PropertyValues = &'static [(&'static str, &'static str)]; |
| |
| fn canonical_gencat( |
| normalized_value: &str, |
| ) -> Result<Option<&'static str>, Error> { |
| Ok(match normalized_value { |
| "any" => Some("Any"), |
| "assigned" => Some("Assigned"), |
| "ascii" => Some("ASCII"), |
| _ => { |
| let gencats = property_values("General_Category")?.unwrap(); |
| canonical_value(gencats, normalized_value) |
| } |
| }) |
| } |
| |
| fn canonical_script( |
| normalized_value: &str, |
| ) -> Result<Option<&'static str>, Error> { |
| let scripts = property_values("Script")?.unwrap(); |
| Ok(canonical_value(scripts, normalized_value)) |
| } |
| |
| /// Find the canonical property name for the given normalized property name. |
| /// |
| /// If no such property exists, then `None` is returned. |
| /// |
| /// The normalized property name must have been normalized according to |
| /// UAX44 LM3, which can be done using `symbolic_name_normalize`. |
| /// |
| /// If the property names data is not available, then an error is returned. |
| fn canonical_prop( |
| normalized_name: &str, |
| ) -> Result<Option<&'static str>, Error> { |
| #[cfg(not(any( |
| feature = "unicode-age", |
| feature = "unicode-bool", |
| feature = "unicode-gencat", |
| feature = "unicode-perl", |
| feature = "unicode-script", |
| feature = "unicode-segment", |
| )))] |
| fn imp(_: &str) -> Result<Option<&'static str>, Error> { |
| Err(Error::PropertyNotFound) |
| } |
| |
| #[cfg(any( |
| feature = "unicode-age", |
| feature = "unicode-bool", |
| feature = "unicode-gencat", |
| feature = "unicode-perl", |
| feature = "unicode-script", |
| feature = "unicode-segment", |
| ))] |
| fn imp(name: &str) -> Result<Option<&'static str>, Error> { |
| use crate::unicode_tables::property_names::PROPERTY_NAMES; |
| |
| Ok(PROPERTY_NAMES |
| .binary_search_by_key(&name, |&(n, _)| n) |
| .ok() |
| .map(|i| PROPERTY_NAMES[i].1)) |
| } |
| |
| imp(normalized_name) |
| } |
| |
| /// Find the canonical property value for the given normalized property |
| /// value. |
| /// |
| /// The given property values should correspond to the values for the property |
| /// under question, which can be found using `property_values`. |
| /// |
| /// If no such property value exists, then `None` is returned. |
| /// |
| /// The normalized property value must have been normalized according to |
| /// UAX44 LM3, which can be done using `symbolic_name_normalize`. |
| fn canonical_value( |
| vals: PropertyValues, |
| normalized_value: &str, |
| ) -> Option<&'static str> { |
| vals.binary_search_by_key(&normalized_value, |&(n, _)| n) |
| .ok() |
| .map(|i| vals[i].1) |
| } |
| |
| /// Return the table of property values for the given property name. |
| /// |
| /// If the property values data is not available, then an error is returned. |
| fn property_values( |
| canonical_property_name: &'static str, |
| ) -> Result<Option<PropertyValues>, Error> { |
| #[cfg(not(any( |
| feature = "unicode-age", |
| feature = "unicode-bool", |
| feature = "unicode-gencat", |
| feature = "unicode-perl", |
| feature = "unicode-script", |
| feature = "unicode-segment", |
| )))] |
| fn imp(_: &'static str) -> Result<Option<PropertyValues>, Error> { |
| Err(Error::PropertyValueNotFound) |
| } |
| |
| #[cfg(any( |
| feature = "unicode-age", |
| feature = "unicode-bool", |
| feature = "unicode-gencat", |
| feature = "unicode-perl", |
| feature = "unicode-script", |
| feature = "unicode-segment", |
| ))] |
| fn imp(name: &'static str) -> Result<Option<PropertyValues>, Error> { |
| use crate::unicode_tables::property_values::PROPERTY_VALUES; |
| |
| Ok(PROPERTY_VALUES |
| .binary_search_by_key(&name, |&(n, _)| n) |
| .ok() |
| .map(|i| PROPERTY_VALUES[i].1)) |
| } |
| |
| imp(canonical_property_name) |
| } |
| |
| // This is only used in some cases, but small enough to just let it be dead |
| // instead of figuring out (and maintaining) the right set of features. |
| #[allow(dead_code)] |
| fn property_set( |
| name_map: &'static [(&'static str, Range)], |
| canonical: &'static str, |
| ) -> Option<Range> { |
| name_map |
| .binary_search_by_key(&canonical, |x| x.0) |
| .ok() |
| .map(|i| name_map[i].1) |
| } |
| |
| /// Returns an iterator over Unicode Age sets. Each item corresponds to a set |
| /// of codepoints that were added in a particular revision of Unicode. The |
| /// iterator yields items in chronological order. |
| /// |
| /// If the given age value isn't valid or if the data isn't available, then an |
| /// error is returned instead. |
| fn ages(canonical_age: &str) -> Result<impl Iterator<Item = Range>, Error> { |
| #[cfg(not(feature = "unicode-age"))] |
| fn imp(_: &str) -> Result<impl Iterator<Item = Range>, Error> { |
| use core::option::IntoIter; |
| Err::<IntoIter<Range>, _>(Error::PropertyNotFound) |
| } |
| |
| #[cfg(feature = "unicode-age")] |
| fn imp(canonical_age: &str) -> Result<impl Iterator<Item = Range>, Error> { |
| use crate::unicode_tables::age; |
| |
| const AGES: &[(&str, Range)] = &[ |
| ("V1_1", age::V1_1), |
| ("V2_0", age::V2_0), |
| ("V2_1", age::V2_1), |
| ("V3_0", age::V3_0), |
| ("V3_1", age::V3_1), |
| ("V3_2", age::V3_2), |
| ("V4_0", age::V4_0), |
| ("V4_1", age::V4_1), |
| ("V5_0", age::V5_0), |
| ("V5_1", age::V5_1), |
| ("V5_2", age::V5_2), |
| ("V6_0", age::V6_0), |
| ("V6_1", age::V6_1), |
| ("V6_2", age::V6_2), |
| ("V6_3", age::V6_3), |
| ("V7_0", age::V7_0), |
| ("V8_0", age::V8_0), |
| ("V9_0", age::V9_0), |
| ("V10_0", age::V10_0), |
| ("V11_0", age::V11_0), |
| ("V12_0", age::V12_0), |
| ("V12_1", age::V12_1), |
| ("V13_0", age::V13_0), |
| ("V14_0", age::V14_0), |
| ("V15_0", age::V15_0), |
| ]; |
| assert_eq!(AGES.len(), age::BY_NAME.len(), "ages are out of sync"); |
| |
| let pos = AGES.iter().position(|&(age, _)| canonical_age == age); |
| match pos { |
| None => Err(Error::PropertyValueNotFound), |
| Some(i) => Ok(AGES[..=i].iter().map(|&(_, classes)| classes)), |
| } |
| } |
| |
| imp(canonical_age) |
| } |
| |
| /// Returns the Unicode HIR class corresponding to the given general category. |
| /// |
| /// Name canonicalization is assumed to be performed by the caller. |
| /// |
| /// If the given general category could not be found, or if the general |
| /// category data is not available, then an error is returned. |
| fn gencat(canonical_name: &'static str) -> Result<hir::ClassUnicode, Error> { |
| #[cfg(not(feature = "unicode-gencat"))] |
| fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> { |
| Err(Error::PropertyNotFound) |
| } |
| |
| #[cfg(feature = "unicode-gencat")] |
| fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> { |
| use crate::unicode_tables::general_category::BY_NAME; |
| match name { |
| "ASCII" => Ok(hir_class(&[('\0', '\x7F')])), |
| "Any" => Ok(hir_class(&[('\0', '\u{10FFFF}')])), |
| "Assigned" => { |
| let mut cls = gencat("Unassigned")?; |
| cls.negate(); |
| Ok(cls) |
| } |
| name => property_set(BY_NAME, name) |
| .map(hir_class) |
| .ok_or(Error::PropertyValueNotFound), |
| } |
| } |
| |
| match canonical_name { |
| "Decimal_Number" => perl_digit(), |
| name => imp(name), |
| } |
| } |
| |
| /// Returns the Unicode HIR class corresponding to the given script. |
| /// |
| /// Name canonicalization is assumed to be performed by the caller. |
| /// |
| /// If the given script could not be found, or if the script data is not |
| /// available, then an error is returned. |
| fn script(canonical_name: &'static str) -> Result<hir::ClassUnicode, Error> { |
| #[cfg(not(feature = "unicode-script"))] |
| fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> { |
| Err(Error::PropertyNotFound) |
| } |
| |
| #[cfg(feature = "unicode-script")] |
| fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> { |
| use crate::unicode_tables::script::BY_NAME; |
| property_set(BY_NAME, name) |
| .map(hir_class) |
| .ok_or(Error::PropertyValueNotFound) |
| } |
| |
| imp(canonical_name) |
| } |
| |
| /// Returns the Unicode HIR class corresponding to the given script extension. |
| /// |
| /// Name canonicalization is assumed to be performed by the caller. |
| /// |
| /// If the given script extension could not be found, or if the script data is |
| /// not available, then an error is returned. |
| fn script_extension( |
| canonical_name: &'static str, |
| ) -> Result<hir::ClassUnicode, Error> { |
| #[cfg(not(feature = "unicode-script"))] |
| fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> { |
| Err(Error::PropertyNotFound) |
| } |
| |
| #[cfg(feature = "unicode-script")] |
| fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> { |
| use crate::unicode_tables::script_extension::BY_NAME; |
| property_set(BY_NAME, name) |
| .map(hir_class) |
| .ok_or(Error::PropertyValueNotFound) |
| } |
| |
| imp(canonical_name) |
| } |
| |
| /// Returns the Unicode HIR class corresponding to the given Unicode boolean |
| /// property. |
| /// |
| /// Name canonicalization is assumed to be performed by the caller. |
| /// |
| /// If the given boolean property could not be found, or if the boolean |
| /// property data is not available, then an error is returned. |
| fn bool_property( |
| canonical_name: &'static str, |
| ) -> Result<hir::ClassUnicode, Error> { |
| #[cfg(not(feature = "unicode-bool"))] |
| fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> { |
| Err(Error::PropertyNotFound) |
| } |
| |
| #[cfg(feature = "unicode-bool")] |
| fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> { |
| use crate::unicode_tables::property_bool::BY_NAME; |
| property_set(BY_NAME, name) |
| .map(hir_class) |
| .ok_or(Error::PropertyNotFound) |
| } |
| |
| match canonical_name { |
| "Decimal_Number" => perl_digit(), |
| "White_Space" => perl_space(), |
| name => imp(name), |
| } |
| } |
| |
| /// Returns the Unicode HIR class corresponding to the given grapheme cluster |
| /// break property. |
| /// |
| /// Name canonicalization is assumed to be performed by the caller. |
| /// |
| /// If the given property could not be found, or if the corresponding data is |
| /// not available, then an error is returned. |
| fn gcb(canonical_name: &'static str) -> Result<hir::ClassUnicode, Error> { |
| #[cfg(not(feature = "unicode-segment"))] |
| fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> { |
| Err(Error::PropertyNotFound) |
| } |
| |
| #[cfg(feature = "unicode-segment")] |
| fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> { |
| use crate::unicode_tables::grapheme_cluster_break::BY_NAME; |
| property_set(BY_NAME, name) |
| .map(hir_class) |
| .ok_or(Error::PropertyValueNotFound) |
| } |
| |
| imp(canonical_name) |
| } |
| |
| /// Returns the Unicode HIR class corresponding to the given word break |
| /// property. |
| /// |
| /// Name canonicalization is assumed to be performed by the caller. |
| /// |
| /// If the given property could not be found, or if the corresponding data is |
| /// not available, then an error is returned. |
| fn wb(canonical_name: &'static str) -> Result<hir::ClassUnicode, Error> { |
| #[cfg(not(feature = "unicode-segment"))] |
| fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> { |
| Err(Error::PropertyNotFound) |
| } |
| |
| #[cfg(feature = "unicode-segment")] |
| fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> { |
| use crate::unicode_tables::word_break::BY_NAME; |
| property_set(BY_NAME, name) |
| .map(hir_class) |
| .ok_or(Error::PropertyValueNotFound) |
| } |
| |
| imp(canonical_name) |
| } |
| |
| /// Returns the Unicode HIR class corresponding to the given sentence |
| /// break property. |
| /// |
| /// Name canonicalization is assumed to be performed by the caller. |
| /// |
| /// If the given property could not be found, or if the corresponding data is |
| /// not available, then an error is returned. |
| fn sb(canonical_name: &'static str) -> Result<hir::ClassUnicode, Error> { |
| #[cfg(not(feature = "unicode-segment"))] |
| fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> { |
| Err(Error::PropertyNotFound) |
| } |
| |
| #[cfg(feature = "unicode-segment")] |
| fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> { |
| use crate::unicode_tables::sentence_break::BY_NAME; |
| property_set(BY_NAME, name) |
| .map(hir_class) |
| .ok_or(Error::PropertyValueNotFound) |
| } |
| |
| imp(canonical_name) |
| } |
| |
| /// Like symbolic_name_normalize_bytes, but operates on a string. |
| fn symbolic_name_normalize(x: &str) -> String { |
| let mut tmp = x.as_bytes().to_vec(); |
| let len = symbolic_name_normalize_bytes(&mut tmp).len(); |
| tmp.truncate(len); |
| // This should always succeed because `symbolic_name_normalize_bytes` |
| // guarantees that `&tmp[..len]` is always valid UTF-8. |
| // |
| // N.B. We could avoid the additional UTF-8 check here, but it's unlikely |
| // to be worth skipping the additional safety check. A benchmark must |
| // justify it first. |
| String::from_utf8(tmp).unwrap() |
| } |
| |
| /// Normalize the given symbolic name in place according to UAX44-LM3. |
| /// |
| /// A "symbolic name" typically corresponds to property names and property |
| /// value aliases. Note, though, that it should not be applied to property |
| /// string values. |
| /// |
| /// The slice returned is guaranteed to be valid UTF-8 for all possible values |
| /// of `slice`. |
| /// |
| /// See: https://unicode.org/reports/tr44/#UAX44-LM3 |
| fn symbolic_name_normalize_bytes(slice: &mut [u8]) -> &mut [u8] { |
| // I couldn't find a place in the standard that specified that property |
| // names/aliases had a particular structure (unlike character names), but |
| // we assume that it's ASCII only and drop anything that isn't ASCII. |
| let mut start = 0; |
| let mut starts_with_is = false; |
| if slice.len() >= 2 { |
| // Ignore any "is" prefix. |
| starts_with_is = slice[0..2] == b"is"[..] |
| || slice[0..2] == b"IS"[..] |
| || slice[0..2] == b"iS"[..] |
| || slice[0..2] == b"Is"[..]; |
| if starts_with_is { |
| start = 2; |
| } |
| } |
| let mut next_write = 0; |
| for i in start..slice.len() { |
| // VALIDITY ARGUMENT: To guarantee that the resulting slice is valid |
| // UTF-8, we ensure that the slice contains only ASCII bytes. In |
| // particular, we drop every non-ASCII byte from the normalized string. |
| let b = slice[i]; |
| if b == b' ' || b == b'_' || b == b'-' { |
| continue; |
| } else if b'A' <= b && b <= b'Z' { |
| slice[next_write] = b + (b'a' - b'A'); |
| next_write += 1; |
| } else if b <= 0x7F { |
| slice[next_write] = b; |
| next_write += 1; |
| } |
| } |
| // Special case: ISO_Comment has a 'isc' abbreviation. Since we generally |
| // ignore 'is' prefixes, the 'isc' abbreviation gets caught in the cross |
| // fire and ends up creating an alias for 'c' to 'ISO_Comment', but it |
| // is actually an alias for the 'Other' general category. |
| if starts_with_is && next_write == 1 && slice[0] == b'c' { |
| slice[0] = b'i'; |
| slice[1] = b's'; |
| slice[2] = b'c'; |
| next_write = 3; |
| } |
| &mut slice[..next_write] |
| } |
| |
| #[cfg(test)] |
| mod tests { |
| use super::*; |
| |
| #[cfg(feature = "unicode-case")] |
| fn simple_fold_ok(c: char) -> impl Iterator<Item = char> { |
| SimpleCaseFolder::new().unwrap().mapping(c).iter().copied() |
| } |
| |
| #[cfg(feature = "unicode-case")] |
| fn contains_case_map(start: char, end: char) -> bool { |
| SimpleCaseFolder::new().unwrap().overlaps(start, end) |
| } |
| |
| #[test] |
| #[cfg(feature = "unicode-case")] |
| fn simple_fold_k() { |
| let xs: Vec<char> = simple_fold_ok('k').collect(); |
| assert_eq!(xs, alloc::vec!['K', 'K']); |
| |
| let xs: Vec<char> = simple_fold_ok('K').collect(); |
| assert_eq!(xs, alloc::vec!['k', 'K']); |
| |
| let xs: Vec<char> = simple_fold_ok('K').collect(); |
| assert_eq!(xs, alloc::vec!['K', 'k']); |
| } |
| |
| #[test] |
| #[cfg(feature = "unicode-case")] |
| fn simple_fold_a() { |
| let xs: Vec<char> = simple_fold_ok('a').collect(); |
| assert_eq!(xs, alloc::vec!['A']); |
| |
| let xs: Vec<char> = simple_fold_ok('A').collect(); |
| assert_eq!(xs, alloc::vec!['a']); |
| } |
| |
| #[test] |
| #[cfg(not(feature = "unicode-case"))] |
| fn simple_fold_disabled() { |
| assert!(SimpleCaseFolder::new().is_err()); |
| } |
| |
| #[test] |
| #[cfg(feature = "unicode-case")] |
| fn range_contains() { |
| assert!(contains_case_map('A', 'A')); |
| assert!(contains_case_map('Z', 'Z')); |
| assert!(contains_case_map('A', 'Z')); |
| assert!(contains_case_map('@', 'A')); |
| assert!(contains_case_map('Z', '[')); |
| assert!(contains_case_map('☃', 'Ⰰ')); |
| |
| assert!(!contains_case_map('[', '[')); |
| assert!(!contains_case_map('[', '`')); |
| |
| assert!(!contains_case_map('☃', '☃')); |
| } |
| |
| #[test] |
| #[cfg(feature = "unicode-gencat")] |
| fn regression_466() { |
| use super::{CanonicalClassQuery, ClassQuery}; |
| |
| let q = ClassQuery::OneLetter('C'); |
| assert_eq!( |
| q.canonicalize().unwrap(), |
| CanonicalClassQuery::GeneralCategory("Other") |
| ); |
| } |
| |
| #[test] |
| fn sym_normalize() { |
| let sym_norm = symbolic_name_normalize; |
| |
| assert_eq!(sym_norm("Line_Break"), "linebreak"); |
| assert_eq!(sym_norm("Line-break"), "linebreak"); |
| assert_eq!(sym_norm("linebreak"), "linebreak"); |
| assert_eq!(sym_norm("BA"), "ba"); |
| assert_eq!(sym_norm("ba"), "ba"); |
| assert_eq!(sym_norm("Greek"), "greek"); |
| assert_eq!(sym_norm("isGreek"), "greek"); |
| assert_eq!(sym_norm("IS_Greek"), "greek"); |
| assert_eq!(sym_norm("isc"), "isc"); |
| assert_eq!(sym_norm("is c"), "isc"); |
| assert_eq!(sym_norm("is_c"), "isc"); |
| } |
| |
| #[test] |
| fn valid_utf8_symbolic() { |
| let mut x = b"abc\xFFxyz".to_vec(); |
| let y = symbolic_name_normalize_bytes(&mut x); |
| assert_eq!(y, b"abcxyz"); |
| } |
| } |