| // This file is part of ICU4X. For terms of use, please see the file |
| // called LICENSE at the top level of the ICU4X source tree |
| // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). |
| |
| use core::cmp::Ordering; |
| use core::str::FromStr; |
| |
| use crate::ordering::SubtagOrderingResult; |
| use crate::parser::{ |
| parse_language_identifier, parse_language_identifier_with_single_variant, ParserError, |
| ParserMode, SubtagIterator, |
| }; |
| use crate::subtags; |
| use alloc::string::String; |
| use writeable::Writeable; |
| |
| /// A core struct representing a [`Unicode BCP47 Language Identifier`]. |
| /// |
| /// # Examples |
| /// |
| /// ``` |
| /// use icu::locid::{ |
| /// langid, subtags_language as language, subtags_region as region, |
| /// }; |
| /// |
| /// let li = langid!("en-US"); |
| /// |
| /// assert_eq!(li.language, language!("en")); |
| /// assert_eq!(li.script, None); |
| /// assert_eq!(li.region, Some(region!("US"))); |
| /// assert_eq!(li.variants.len(), 0); |
| /// ``` |
| /// |
| /// # Parsing |
| /// |
| /// Unicode recognizes three levels of standard conformance for any language identifier: |
| /// |
| /// * *well-formed* - syntactically correct |
| /// * *valid* - well-formed and only uses registered language, region, script and variant subtags... |
| /// * *canonical* - valid and no deprecated codes or structure. |
| /// |
| /// At the moment parsing normalizes a well-formed language identifier converting |
| /// `_` separators to `-` and adjusting casing to conform to the Unicode standard. |
| /// |
| /// Any bogus subtags will cause the parsing to fail with an error. |
| /// No subtag validation is performed. |
| /// |
| /// # Examples |
| /// |
| /// ``` |
| /// use icu::locid::{ |
| /// langid, subtags_language as language, subtags_region as region, |
| /// subtags_script as script, subtags_variant as variant, |
| /// }; |
| /// |
| /// let li = langid!("eN_latn_Us-Valencia"); |
| /// |
| /// assert_eq!(li.language, language!("en")); |
| /// assert_eq!(li.script, Some(script!("Latn"))); |
| /// assert_eq!(li.region, Some(region!("US"))); |
| /// assert_eq!(li.variants.get(0), Some(&variant!("valencia"))); |
| /// ``` |
| /// |
| /// [`Unicode BCP47 Language Identifier`]: https://unicode.org/reports/tr35/tr35.html#Unicode_language_identifier |
| #[derive(Default, PartialEq, Eq, Clone, Hash)] |
| #[allow(clippy::exhaustive_structs)] // This struct is stable (and invoked by a macro) |
| pub struct LanguageIdentifier { |
| /// Language subtag of the language identifier. |
| pub language: subtags::Language, |
| /// Script subtag of the language identifier. |
| pub script: Option<subtags::Script>, |
| /// Region subtag of the language identifier. |
| pub region: Option<subtags::Region>, |
| /// Variant subtags of the language identifier. |
| pub variants: subtags::Variants, |
| } |
| |
| impl LanguageIdentifier { |
| /// A constructor which takes a utf8 slice, parses it and |
| /// produces a well-formed [`LanguageIdentifier`]. |
| /// |
| /// # Examples |
| /// |
| /// ``` |
| /// use icu::locid::LanguageIdentifier; |
| /// |
| /// LanguageIdentifier::try_from_bytes(b"en-US").expect("Parsing failed"); |
| /// ``` |
| pub fn try_from_bytes(v: &[u8]) -> Result<Self, ParserError> { |
| parse_language_identifier(v, ParserMode::LanguageIdentifier) |
| } |
| |
| #[doc(hidden)] |
| #[allow(clippy::type_complexity)] |
| // The return type should be `Result<Self, ParserError>` once the `const_precise_live_drops` |
| // is stabilized ([rust-lang#73255](https://github.com/rust-lang/rust/issues/73255)). |
| pub const fn try_from_bytes_with_single_variant( |
| v: &[u8], |
| ) -> Result< |
| ( |
| subtags::Language, |
| Option<subtags::Script>, |
| Option<subtags::Region>, |
| Option<subtags::Variant>, |
| ), |
| ParserError, |
| > { |
| parse_language_identifier_with_single_variant(v, ParserMode::LanguageIdentifier) |
| } |
| |
| /// A constructor which takes a utf8 slice which may contain extension keys, |
| /// parses it and produces a well-formed [`LanguageIdentifier`]. |
| /// |
| /// # Examples |
| /// |
| /// ``` |
| /// use icu::locid::{langid, LanguageIdentifier}; |
| /// |
| /// let li = LanguageIdentifier::try_from_locale_bytes(b"en-US-x-posix") |
| /// .expect("Parsing failed."); |
| /// |
| /// assert_eq!(li, langid!("en-US")); |
| /// ``` |
| /// |
| /// This method should be used for input that may be a locale identifier. |
| /// All extensions will be lost. |
| pub fn try_from_locale_bytes(v: &[u8]) -> Result<Self, ParserError> { |
| parse_language_identifier(v, ParserMode::Locale) |
| } |
| |
| /// The default undefined language "und". Same as [`default()`](Default::default()). |
| /// |
| /// # Examples |
| /// |
| /// ``` |
| /// use icu::locid::LanguageIdentifier; |
| /// |
| /// assert_eq!(LanguageIdentifier::default(), LanguageIdentifier::UND); |
| /// ``` |
| pub const UND: Self = Self { |
| language: subtags::Language::UND, |
| script: None, |
| region: None, |
| variants: subtags::Variants::new(), |
| }; |
| |
| /// This is a best-effort operation that performs all available levels of canonicalization. |
| /// |
| /// At the moment the operation will normalize casing and the separator, but in the future |
| /// it may also validate and update from deprecated subtags to canonical ones. |
| /// |
| /// # Examples |
| /// |
| /// ``` |
| /// use icu::locid::LanguageIdentifier; |
| /// |
| /// assert_eq!( |
| /// LanguageIdentifier::canonicalize("pL_latn_pl").as_deref(), |
| /// Ok("pl-Latn-PL") |
| /// ); |
| /// ``` |
| pub fn canonicalize<S: AsRef<[u8]>>(input: S) -> Result<String, ParserError> { |
| let lang_id = Self::try_from_bytes(input.as_ref())?; |
| Ok(lang_id.write_to_string().into_owned()) |
| } |
| |
| /// Compare this [`LanguageIdentifier`] with BCP-47 bytes. |
| /// |
| /// The return value is equivalent to what would happen if you first converted this |
| /// [`LanguageIdentifier`] to a BCP-47 string and then performed a byte comparison. |
| /// |
| /// This function is case-sensitive and results in a *total order*, so it is appropriate for |
| /// binary search. The only argument producing [`Ordering::Equal`] is `self.to_string()`. |
| /// |
| /// # Examples |
| /// |
| /// ``` |
| /// use icu::locid::LanguageIdentifier; |
| /// use std::cmp::Ordering; |
| /// |
| /// let bcp47_strings: &[&str] = &[ |
| /// "pl-Latn-PL", |
| /// "und", |
| /// "und-Adlm", |
| /// "und-GB", |
| /// "und-ZA", |
| /// "und-fonipa", |
| /// "zh", |
| /// ]; |
| /// |
| /// for ab in bcp47_strings.windows(2) { |
| /// let a = ab[0]; |
| /// let b = ab[1]; |
| /// assert!(a.cmp(b) == Ordering::Less); |
| /// let a_langid = a.parse::<LanguageIdentifier>().unwrap(); |
| /// assert!(a_langid.strict_cmp(a.as_bytes()) == Ordering::Equal); |
| /// assert!(a_langid.strict_cmp(b.as_bytes()) == Ordering::Less); |
| /// } |
| /// ``` |
| pub fn strict_cmp(&self, other: &[u8]) -> Ordering { |
| self.strict_cmp_iter(other.split(|b| *b == b'-')).end() |
| } |
| |
| /// Compare this [`LanguageIdentifier`] with an iterator of BCP-47 subtags. |
| /// |
| /// This function has the same equality semantics as [`LanguageIdentifier::strict_cmp`]. It is intended as |
| /// a more modular version that allows multiple subtag iterators to be chained together. |
| /// |
| /// For an additional example, see [`SubtagOrderingResult`]. |
| /// |
| /// # Examples |
| /// |
| /// ``` |
| /// use icu::locid::LanguageIdentifier; |
| /// use std::cmp::Ordering; |
| /// |
| /// let subtags: &[&[u8]] = &[b"ca", b"ES", b"valencia"]; |
| /// |
| /// let loc = "ca-ES-valencia".parse::<LanguageIdentifier>().unwrap(); |
| /// assert_eq!( |
| /// Ordering::Equal, |
| /// loc.strict_cmp_iter(subtags.iter().copied()).end() |
| /// ); |
| /// |
| /// let loc = "ca-ES".parse::<LanguageIdentifier>().unwrap(); |
| /// assert_eq!( |
| /// Ordering::Less, |
| /// loc.strict_cmp_iter(subtags.iter().copied()).end() |
| /// ); |
| /// |
| /// let loc = "ca-ZA".parse::<LanguageIdentifier>().unwrap(); |
| /// assert_eq!( |
| /// Ordering::Greater, |
| /// loc.strict_cmp_iter(subtags.iter().copied()).end() |
| /// ); |
| /// ``` |
| pub fn strict_cmp_iter<'l, I>(&self, mut subtags: I) -> SubtagOrderingResult<I> |
| where |
| I: Iterator<Item = &'l [u8]>, |
| { |
| let r = self.for_each_subtag_str(&mut |subtag| { |
| if let Some(other) = subtags.next() { |
| match subtag.as_bytes().cmp(other) { |
| Ordering::Equal => Ok(()), |
| not_equal => Err(not_equal), |
| } |
| } else { |
| Err(Ordering::Greater) |
| } |
| }); |
| match r { |
| Ok(_) => SubtagOrderingResult::Subtags(subtags), |
| Err(o) => SubtagOrderingResult::Ordering(o), |
| } |
| } |
| |
| /// Compare this `LanguageIdentifier` with a potentially unnormalized BCP-47 string. |
| /// |
| /// The return value is equivalent to what would happen if you first parsed the |
| /// BCP-47 string to a `LanguageIdentifier` and then performed a structucal comparison. |
| /// |
| /// # Examples |
| /// |
| /// ``` |
| /// use icu::locid::LanguageIdentifier; |
| /// use std::cmp::Ordering; |
| /// |
| /// let bcp47_strings: &[&str] = &[ |
| /// "pl-LaTn-pL", |
| /// "uNd", |
| /// "UnD-adlm", |
| /// "uNd-GB", |
| /// "UND-FONIPA", |
| /// "ZH", |
| /// ]; |
| /// |
| /// for a in bcp47_strings { |
| /// assert!(a.parse::<LanguageIdentifier>().unwrap().normalizing_eq(a)); |
| /// } |
| /// ``` |
| pub fn normalizing_eq(&self, other: &str) -> bool { |
| macro_rules! subtag_matches { |
| ($T:ty, $iter:ident, $expected:expr) => { |
| $iter |
| .next() |
| .map(|b| <$T>::try_from_bytes(b) == Ok($expected)) |
| .unwrap_or(false) |
| }; |
| } |
| |
| let mut iter = SubtagIterator::new(other.as_bytes()); |
| if !subtag_matches!(subtags::Language, iter, self.language) { |
| return false; |
| } |
| if let Some(ref script) = self.script { |
| if !subtag_matches!(subtags::Script, iter, *script) { |
| return false; |
| } |
| } |
| if let Some(ref region) = self.region { |
| if !subtag_matches!(subtags::Region, iter, *region) { |
| return false; |
| } |
| } |
| for variant in self.variants.iter() { |
| if !subtag_matches!(subtags::Variant, iter, *variant) { |
| return false; |
| } |
| } |
| iter.next().is_none() |
| } |
| |
| pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E> |
| where |
| F: FnMut(&str) -> Result<(), E>, |
| { |
| f(self.language.as_str())?; |
| if let Some(ref script) = self.script { |
| f(script.as_str())?; |
| } |
| if let Some(ref region) = self.region { |
| f(region.as_str())?; |
| } |
| for variant in self.variants.iter() { |
| f(variant.as_str())?; |
| } |
| Ok(()) |
| } |
| } |
| |
| impl AsRef<LanguageIdentifier> for LanguageIdentifier { |
| fn as_ref(&self) -> &Self { |
| self |
| } |
| } |
| |
| impl AsMut<LanguageIdentifier> for LanguageIdentifier { |
| fn as_mut(&mut self) -> &mut Self { |
| self |
| } |
| } |
| |
| impl core::fmt::Debug for LanguageIdentifier { |
| fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
| core::fmt::Display::fmt(&self, f) |
| } |
| } |
| |
| impl FromStr for LanguageIdentifier { |
| type Err = ParserError; |
| |
| fn from_str(source: &str) -> Result<Self, Self::Err> { |
| Self::try_from_bytes(source.as_bytes()) |
| } |
| } |
| |
| impl_writeable_for_each_subtag_str_no_test!(LanguageIdentifier, selff, selff.script.is_none() && selff.region.is_none() && selff.variants.is_empty() => selff.language.write_to_string()); |
| |
// Checks that `UND` writes as "und" and that each tag below survives a
// parse-then-write round trip unchanged.
#[test]
fn test_writeable() {
    use writeable::assert_writeable_eq;

    assert_writeable_eq!(LanguageIdentifier::UND, "und");

    // Each of these tags is expected to be written back exactly as given.
    for tag in [
        "und-001",
        "und-Mymr",
        "my-Mymr-MM",
        "my-Mymr-MM-posix",
        "zh-macos-posix",
    ] {
        let parsed: LanguageIdentifier = tag.parse().unwrap();
        assert_writeable_eq!(parsed, tag);
    }
}
| |
| /// # Examples |
| /// |
| /// ``` |
| /// use icu::locid::{ |
| /// langid, subtags_language as language, LanguageIdentifier, |
| /// }; |
| /// |
| /// assert_eq!(LanguageIdentifier::from(language!("en")), langid!("en")); |
| /// ``` |
| impl From<subtags::Language> for LanguageIdentifier { |
| fn from(language: subtags::Language) -> Self { |
| Self { |
| language, |
| ..Default::default() |
| } |
| } |
| } |
| |
| /// # Examples |
| /// |
| /// ``` |
| /// use icu::locid::{langid, subtags_script as script, LanguageIdentifier}; |
| /// |
| /// assert_eq!( |
| /// LanguageIdentifier::from(Some(script!("latn"))), |
| /// langid!("und-Latn") |
| /// ); |
| /// ``` |
| impl From<Option<subtags::Script>> for LanguageIdentifier { |
| fn from(script: Option<subtags::Script>) -> Self { |
| Self { |
| script, |
| ..Default::default() |
| } |
| } |
| } |
| |
| /// # Examples |
| /// |
| /// ``` |
| /// use icu::locid::{langid, subtags_region as region, LanguageIdentifier}; |
| /// |
| /// assert_eq!( |
| /// LanguageIdentifier::from(Some(region!("US"))), |
| /// langid!("und-US") |
| /// ); |
| /// ``` |
| impl From<Option<subtags::Region>> for LanguageIdentifier { |
| fn from(region: Option<subtags::Region>) -> Self { |
| Self { |
| region, |
| ..Default::default() |
| } |
| } |
| } |
| |
| /// Convert from an LSR tuple to a [`LanguageIdentifier`]. |
| /// |
| /// # Examples |
| /// |
| /// ``` |
| /// use icu::locid::{ |
| /// langid, subtags_language as language, subtags_region as region, |
| /// subtags_script as script, LanguageIdentifier, |
| /// }; |
| /// |
| /// let lang = language!("en"); |
| /// let script = script!("Latn"); |
| /// let region = region!("US"); |
| /// assert_eq!( |
| /// LanguageIdentifier::from((lang, Some(script), Some(region))), |
| /// langid!("en-Latn-US") |
| /// ); |
| /// ``` |
| impl |
| From<( |
| subtags::Language, |
| Option<subtags::Script>, |
| Option<subtags::Region>, |
| )> for LanguageIdentifier |
| { |
| fn from( |
| lsr: ( |
| subtags::Language, |
| Option<subtags::Script>, |
| Option<subtags::Region>, |
| ), |
| ) -> Self { |
| Self { |
| language: lsr.0, |
| script: lsr.1, |
| region: lsr.2, |
| ..Default::default() |
| } |
| } |
| } |
| |
| /// Convert from a [`LanguageIdentifier`] to an LSR tuple. |
| /// |
| /// # Examples |
| /// |
| /// ``` |
| /// use icu::locid::{ |
| /// langid, subtags_language as language, subtags_region as region, |
| /// subtags_script as script, |
| /// }; |
| /// |
| /// let lid = langid!("en-Latn-US"); |
| /// let (lang, script, region) = (&lid).into(); |
| /// |
| /// assert_eq!(lang, language!("en")); |
| /// assert_eq!(script, Some(script!("Latn"))); |
| /// assert_eq!(region, Some(region!("US"))); |
| /// ``` |
| impl From<&LanguageIdentifier> |
| for ( |
| subtags::Language, |
| Option<subtags::Script>, |
| Option<subtags::Region>, |
| ) |
| { |
| fn from(langid: &LanguageIdentifier) -> Self { |
| (langid.language, langid.script, langid.region) |
| } |
| } |