// blob: 3919719a7105e48ba7cf9f7d93f3a4b85d9fb744 [file] [log] [blame]
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! This module contains most of the actual algorithms for case mapping.
//!
//! Primarily, it implements methods on `CaseMap`, which contains the data model.
use crate::greek_to_me::{
self, GreekCombiningCharacterSequenceDiacritics, GreekDiacritics, GreekPrecomposedLetterData,
GreekVowel,
};
use crate::provider::data::{DotType, MappingKind};
use crate::provider::exception_helpers::ExceptionSlot;
use crate::provider::{CaseMap, CaseMapUnfold};
use crate::set::ClosureSink;
use crate::titlecase::TrailingCase;
use core::fmt;
use icu_locale_core::LanguageIdentifier;
use writeable::Writeable;
// U+0301 COMBINING ACUTE ACCENT; matched by the Dutch IJ titlecasing check.
const ACUTE: char = '\u{301}';
// Used to control the behavior of CaseMapper::fold.
// Currently only used to decide whether to use Turkic (T) mappings for dotted/dotless i.
#[derive(Copy, Clone, Default)]
pub(crate) struct FoldOptions {
// When true, the Turkic (T) foldings are used for U+0049 and U+0130.
// The derived `Default` leaves this false, i.e. the default mappings.
exclude_special_i: bool,
}
impl FoldOptions {
// Returns options that select the Turkic (T) mappings for dotted/dotless i.
pub fn with_turkic_mappings() -> Self {
Self {
exclude_special_i: true,
}
}
}
/// Helper type that wraps a writeable in a prefix string
pub(crate) struct StringAndWriteable<'a, W> {
/// Prefix that is written to the sink first.
pub string: &'a str,
/// Value that is written to the sink immediately after `string`.
pub writeable: W,
}
impl<Wr: Writeable> Writeable for StringAndWriteable<'_, Wr> {
    /// Emits the prefix string and then the wrapped writeable.
    ///
    /// The first error reported by the sink aborts the write and is returned.
    fn write_to<W: fmt::Write + ?Sized>(&self, sink: &mut W) -> fmt::Result {
        let Self {
            string: prefix,
            writeable: inner,
        } = self;
        sink.write_str(prefix)?;
        inner.write_to(sink)
    }

    /// The exact byte length of the prefix plus whatever hint the wrapped writeable reports.
    fn writeable_length_hint(&self) -> writeable::LengthHint {
        let prefix_hint = writeable::LengthHint::exact(self.string.len());
        prefix_hint + self.writeable.writeable_length_hint()
    }
}
// A pending full case mapping of `src`. The struct only stores its inputs; the
// mapping itself is performed when the value is written through `Writeable`.
pub(crate) struct FullCaseWriteable<'a, const IS_TITLE_CONTEXT: bool> {
// Case mapping data used to map every character.
data: &'a CaseMap<'a>,
// The text to map.
src: &'a str,
// Selects the hard-coded locale-specific behavior.
locale: CaseMapLocale,
// Mapping applied to the first character; in a title context the writer may
// switch to `MappingKind::Lower` for the remaining characters.
mapping: MappingKind,
// How the characters after the first are treated; only read when IS_TITLE_CONTEXT is true.
titlecase_tail_casing: TrailingCase,
}
impl<const IS_TITLE_CONTEXT: bool> Writeable for FullCaseWriteable<'_, IS_TITLE_CONTEXT> {
    /// Maps `src` character by character and streams the result into `sink`.
    ///
    /// Outside a title context every character is mapped with the configured kind.
    /// In a title context only the first character uses the configured kind; the
    /// remainder is either lowercased (`TrailingCase::Lower`) or copied through
    /// untouched (`TrailingCase::Unchanged`).
    fn write_to<W: fmt::Write + ?Sized>(&self, sink: &mut W) -> fmt::Result {
        let text = self.src;
        let mut kind = self.mapping;
        let mut chars = text.char_indices();
        for (offset, ch) in chars.by_ref() {
            // `offset` comes from `char_indices`, so it is always a char boundary.
            let (preceding, current_and_following) = text.split_at(offset);
            let context = ContextIterator::new(preceding, current_and_following);
            self.data
                .full_helper::<IS_TITLE_CONTEXT, W>(ch, context, self.locale, kind, sink)?;
            if !IS_TITLE_CONTEXT {
                continue;
            }
            if self.titlecase_tail_casing != TrailingCase::Lower {
                // Only the first character is mapped; the tail is handled below.
                break;
            }
            kind = MappingKind::Lower;
        }
        // Write the rest of the string
        if IS_TITLE_CONTEXT && self.titlecase_tail_casing == TrailingCase::Unchanged {
            sink.write_str(chars.as_str())
        } else {
            Ok(())
        }
    }

    /// Reports the source length as a lower bound for the output length.
    ///
    /// NOTE(review): some mappings write nothing for a character (for example
    /// `FullMappingResult::Remove`), so the output can be shorter than `src` and
    /// this lower bound is not strictly guaranteed; confirm that consumers only
    /// use it for preallocation.
    fn writeable_length_hint(&self) -> writeable::LengthHint {
        writeable::LengthHint::at_least(self.src.len())
    }
}
impl<'data> CaseMap<'data> {
/// Applies the simple (single code point) mapping of the requested `kind` to `c`.
///
/// Characters without an exception entry are shifted by the stored delta when the
/// entry is relevant to `kind`. Characters with an exception entry prefer the
/// exception's simple-case slot (when relevant to `kind`), then the slot for
/// `kind`. If nothing applies, `c` is returned unchanged.
fn simple_helper(&self, c: char, kind: MappingKind) -> char {
    let data = self.lookup_data(c);
    if data.has_exception() {
        let exception = self.exceptions.get(data.exception_index());
        let simple_slot = if data.is_relevant_to(kind) {
            exception.get_simple_case_slot_for(c)
        } else {
            None
        };
        return simple_slot
            .or_else(|| exception.slot_char_for_kind(kind))
            .unwrap_or(c);
    }
    if !data.is_relevant_to(kind) {
        return c;
    }
    let shifted = c as i32 + data.delta() as i32;
    // GIGO: delta should be valid
    char::from_u32(shifted as u32).unwrap_or(c)
}
// Returns the lowercase mapping of the given `char`.
// Delegates to `simple_helper` with `MappingKind::Lower`; the result is always a single `char`.
#[inline]
pub(crate) fn simple_lower(&self, c: char) -> char {
self.simple_helper(c, MappingKind::Lower)
}
// Returns the uppercase mapping of the given `char`.
// Delegates to `simple_helper` with `MappingKind::Upper`; the result is always a single `char`.
#[inline]
pub(crate) fn simple_upper(&self, c: char) -> char {
self.simple_helper(c, MappingKind::Upper)
}
// Returns the titlecase mapping of the given `char`.
// Delegates to `simple_helper` with `MappingKind::Title`; the result is always a single `char`.
#[inline]
pub(crate) fn simple_title(&self, c: char) -> char {
self.simple_helper(c, MappingKind::Title)
}
/// Returns the simple case folding of `c`.
///
/// `options` is only consulted for characters whose exception entry is marked as
/// having a conditional fold; for those it selects between the default and the
/// Turkic mappings. Characters with no applicable folding are returned unchanged.
#[inline]
pub(crate) fn simple_fold(&self, c: char, options: FoldOptions) -> char {
    let data = self.lookup_data(c);
    if !data.has_exception() {
        if data.is_upper_or_title() {
            let folded = c as i32 + data.delta() as i32;
            // GIGO: delta should be valid
            char::from_u32(folded as u32).unwrap_or(c)
        } else {
            c
        }
    } else {
        // TODO: if we move conditional fold and no_simple_case_folding into
        // simple_helper, this function can just call simple_helper.
        let idx = data.exception_index();
        let exception = self.exceptions.get(idx);
        if exception.bits.has_conditional_fold() {
            self.simple_fold_special_case(c, options)
        } else if exception.bits.no_simple_case_folding() {
            c
        } else if data.is_upper_or_title() && exception.has_slot(ExceptionSlot::Delta) {
            // The fallback should never be reached since the delta slot was just
            // checked. If the data is malformed anyway (GIGO), leave the character
            // unchanged, matching the delta path above, instead of emitting U+0000.
            exception.get_simple_case_slot_for(c).unwrap_or(c)
        } else {
            exception
                .slot_char_for_kind(MappingKind::Fold)
                .unwrap_or(c)
        }
    }
}
/// Returns the dot type recorded for `c`.
///
/// The value is read from the exception entry's bits when `c` has an exception
/// entry, and from the trie data otherwise.
fn dot_type(&self, c: char) -> DotType {
    let data = self.lookup_data(c);
    if data.has_exception() {
        let exception = self.exceptions.get(data.exception_index());
        return exception.bits.dot_type();
    }
    data.dot_type()
}
// Returns true if this code point is case-sensitive.
// The flag is read from the exception entry's bits when one exists, and from
// the trie data otherwise.
// This is not currently exposed.
#[allow(dead_code)]
fn is_case_sensitive(&self, c: char) -> bool {
    let data = self.lookup_data(c);
    if data.has_exception() {
        let exception = self.exceptions.get(data.exception_index());
        return exception.bits.is_sensitive();
    }
    data.is_sensitive()
}
/// Returns whether the character is cased
///
/// Decided solely by whether the looked-up data reports a case type; the
/// exception table is not consulted here.
pub(crate) fn is_cased(&self, c: char) -> bool {
self.lookup_data(c).case_type().is_some()
}
// Writes the full mapping of `c` for `kind` into `sink`. Unlike the simple
// mappings, the output may be zero, one, or several characters.
//
// The Dutch IJ titlecasing and Greek uppercasing extensions are handled first and
// return early; every other character goes through the looked-up data and, when
// present, its exception entry.
#[inline(always)]
// IS_TITLE_CONTEXT must be true if kind is MappingKind::Title
// The kind may be a different kind with IS_TITLE_CONTEXT still true because
// titlecasing a segment involves switching to lowercase later
fn full_helper<const IS_TITLE_CONTEXT: bool, W: fmt::Write + ?Sized>(
&self,
c: char,
context: ContextIterator,
locale: CaseMapLocale,
kind: MappingKind,
sink: &mut W,
) -> fmt::Result {
// If using a title mapping IS_TITLE_CONTEXT must be true
debug_assert!(kind != MappingKind::Title || IS_TITLE_CONTEXT);
// In a title context, kind MUST be Title or Lower
debug_assert!(
!IS_TITLE_CONTEXT || kind == MappingKind::Title || kind == MappingKind::Lower
);
// ICU4C's non-standard extension for Dutch IJ titlecasing
// handled here instead of in full_lower_special_case because J does not have conditional
// special casemapping.
if IS_TITLE_CONTEXT && locale == CaseMapLocale::Dutch && kind == MappingKind::Lower {
// When titlecasing, a J found immediately after an I at the beginning of the segment
// should also uppercase. They are both allowed to have an acute accent but it must
// be present on both letters or neither. They may not have any other combining marks.
if (c == 'j' || c == 'J') && context.is_dutch_ij_pair_at_beginning(self) {
return sink.write_char('J');
}
}
// ICU4C's non-standard extension for Greek uppercasing:
// https://icu.unicode.org/design/case/greek-upper.
// Effectively removes Greek accents from Greek vowels during uppercasing,
// whilst attempting to preserve additional marks like the dialytika (diæresis)
// and ypogegrammeni (combining small iota).
if !IS_TITLE_CONTEXT && locale == CaseMapLocale::Greek && kind == MappingKind::Upper {
// Remove all combining diacritics on a Greek letter.
// Ypogegrammeni is not an accent mark and is handled by regular casemapping (it turns into
// a capital iota).
// The dialytika is removed here, but it may be added again when the base letter is being processed.
if greek_to_me::is_greek_diacritic_except_ypogegrammeni(c)
&& context.preceded_by_greek_letter()
{
// Nothing is written: the diacritic is dropped from the output.
return Ok(());
}
let data = greek_to_me::get_data(c);
// Check if the character is a Greek vowel
match data {
Some(GreekPrecomposedLetterData::Vowel(vowel, mut precomposed_diacritics)) => {
// Get the diacritics on the character itself, and add any further combining diacritics
// from the context.
let mut diacritics = context.add_greek_diacritics(precomposed_diacritics);
// If the previous vowel had an accent (which would be removed) but no dialytika,
// and this is an iota or upsilon, add a dialytika since it is necessary to disambiguate
// the now-unaccented adjacent vowels from a digraph/diphthong.
// Use a precomposed dialytika if the accent was precomposed, and a combining dialytika
// if the accent was combining, so as to map NFD to NFD and NFC to NFC.
if !diacritics.dialytika && (vowel == GreekVowel::Ι || vowel == GreekVowel::Υ)
{
if let Some(preceding_vowel) = context.preceding_greek_vowel_diacritics() {
if !preceding_vowel.combining.dialytika
&& !preceding_vowel.precomposed.dialytika
{
if preceding_vowel.combining.accented {
diacritics.dialytika = true;
} else {
precomposed_diacritics.dialytika =
preceding_vowel.precomposed.accented;
}
}
}
}
// Write the base of the uppercased combining character sequence.
// In most branches this is [`upper_base`], i.e., the uppercase letter with all accents removed.
// In some branches the base has a precomposed diacritic.
// In the case of the Greek disjunctive "or", a combining tonos may also be written.
match vowel {
GreekVowel::Η => {
// The letter η (eta) is allowed to retain a tonos when it forms a single-letter word, to distinguish
// the feminine definite article ἡ (monotonic η) from the disjunctive "or" ἤ (monotonic ή).
//
// A lone η with an accent other than the oxia/tonos is not expected,
// so there is no need to special-case the oxia/tonos.
// The ancient ᾖ (exist.PRS.SUBJ.3s) has a iota subscript as well as the circumflex,
// so it would not be given an oxia/tonos under this rule, and the subjunctive is formed with a particle
// (e.g. να είναι) since Byzantine times anyway.
if diacritics.accented
&& !context.followed_by_cased_letter(self)
&& !context.preceded_by_cased_letter(self)
&& !diacritics.ypogegrammeni
{
if precomposed_diacritics.accented {
sink.write_char('Ή')?;
} else {
sink.write_char('Η')?;
sink.write_char(greek_to_me::TONOS)?;
}
} else {
sink.write_char('Η')?;
}
}
// For iota and upsilon a precomposed dialytika is emitted as part of the base letter,
// so the combining dialytika flag is cleared to avoid writing it a second time below.
GreekVowel::Ι => sink.write_char(if precomposed_diacritics.dialytika {
diacritics.dialytika = false;
'Ϊ'
} else {
vowel.into()
})?,
GreekVowel::Υ => sink.write_char(if precomposed_diacritics.dialytika {
diacritics.dialytika = false;
'Ϋ'
} else {
vowel.into()
})?,
_ => sink.write_char(vowel.into())?,
};
if diacritics.dialytika {
sink.write_char(greek_to_me::DIALYTIKA)?;
}
// A precomposed ypogegrammeni is written out as a separate capital iota.
if precomposed_diacritics.ypogegrammeni {
sink.write_char('Ι')?;
}
return Ok(());
}
// Rho might have breathing marks, we handle it specially
// to remove them
Some(GreekPrecomposedLetterData::Consonant(true)) => {
sink.write_char(greek_to_me::CAPITAL_RHO)?;
return Ok(());
}
_ => (),
}
}
// Data-driven mapping for everything that was not handled above.
let data = self.lookup_data(c);
if !data.has_exception() {
if data.is_relevant_to(kind) {
let mapped = c as i32 + data.delta() as i32;
// GIGO: delta should be valid
let mapped = char::from_u32(mapped as u32).unwrap_or(c);
sink.write_char(mapped)
} else {
sink.write_char(c)
}
} else {
let idx = data.exception_index();
let exception = self.exceptions.get(idx);
// Hard-coded conditional / locale-specific mappings take precedence over the stored ones.
if exception.bits.has_conditional_special() {
if let Some(special) = match kind {
MappingKind::Lower => {
self.full_lower_special_case::<IS_TITLE_CONTEXT>(c, context, locale)
}
MappingKind::Fold => self.full_fold_special_case(c, context, locale),
MappingKind::Upper | MappingKind::Title => self
.full_upper_or_title_special_case::<IS_TITLE_CONTEXT>(c, context, locale),
} {
return special.write_to(sink);
}
}
// Next preference: a non-empty full (string) mapping stored for this kind.
if let Some(mapped_string) = exception.get_fullmappings_slot_for_kind(kind) {
if !mapped_string.is_empty() {
return sink.write_str(mapped_string);
}
}
if kind == MappingKind::Fold && exception.bits.no_simple_case_folding() {
return sink.write_char(c);
}
// Otherwise fall back to the simple mappings, and finally to `c` itself.
if data.is_relevant_to(kind) {
if let Some(simple) = exception.get_simple_case_slot_for(c) {
return sink.write_char(simple);
}
}
if let Some(slot_char) = exception.slot_char_for_kind(kind) {
sink.write_char(slot_char)
} else {
sink.write_char(c)
}
}
}
// These constants are used for hardcoded locale-specific foldings.
// Each one is a lowercase base letter followed by U+0307 (combining dot above),
// optionally followed by one further combining accent.
// U+0069 (i) + U+0307
const I_DOT: &'static str = "\u{69}\u{307}";
// U+006A (j) + U+0307
const J_DOT: &'static str = "\u{6a}\u{307}";
// U+012F (i with ogonek) + U+0307
const I_OGONEK_DOT: &'static str = "\u{12f}\u{307}";
// U+0069 + U+0307 + U+0300 (combining grave)
const I_DOT_GRAVE: &'static str = "\u{69}\u{307}\u{300}";
// U+0069 + U+0307 + U+0301 (combining acute)
const I_DOT_ACUTE: &'static str = "\u{69}\u{307}\u{301}";
// U+0069 + U+0307 + U+0303 (combining tilde)
const I_DOT_TILDE: &'static str = "\u{69}\u{307}\u{303}";
// Hardcoded simple case foldings for uppercase I (U+0049) and dotted uppercase I (U+0130).
// With the Turkic option set (tr, az) the T mappings from CaseFolding.txt are used:
//   0049; T; 0131; # LATIN CAPITAL LETTER I
//   0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
// Otherwise the default mapping applies:
//   0049; C; 0069; # LATIN CAPITAL LETTER I
// and U+0130 is returned unchanged because it has no simple case folding.
fn simple_fold_special_case(&self, c: char, options: FoldOptions) -> char {
    debug_assert!(c == 'I' || c == '\u{130}');
    if options.exclude_special_i {
        // Turkic mappings
        match c {
            'I' => '\u{131}',
            '\u{130}' => 'i',
            other => other,
        }
    } else if c == 'I' {
        // Default mapping
        'i'
    } else {
        c
    }
}
/// Hard-coded conditional lowercase mappings.
///
/// Returns `Some` when `c` needs a Lithuanian, Turkic, dotted-I, or final-sigma
/// mapping in this `context`, and `None` when the regular data-driven mapping
/// should be used instead.
fn full_lower_special_case<const IS_TITLE_CONTEXT: bool>(
    &self,
    c: char,
    context: ContextIterator,
    locale: CaseMapLocale,
) -> Option<FullMappingResult> {
    if locale == CaseMapLocale::Lithuanian {
        // Lithuanian retains the dot in a lowercase i when followed by accents.
        // Capital I, J and I-with-ogonek get an explicit dot above whenever more
        // accents above follow (of the accents used in Lithuanian: grave, acute,
        // and tilde above).
        let explicitly_dotted = match c {
            'I' => Some(Self::I_DOT),
            'J' => Some(Self::J_DOT),
            '\u{12e}' => Some(Self::I_OGONEK_DOT),
            _ => None,
        };
        if let Some(dotted) = explicitly_dotted {
            if context.followed_by_more_above(self) {
                return Some(FullMappingResult::String(dotted));
            }
        }
        // These characters are precomposed with accents above, so the context
        // does not need to be inspected.
        match c {
            '\u{cc}' => return Some(FullMappingResult::String(Self::I_DOT_GRAVE)),
            '\u{cd}' => return Some(FullMappingResult::String(Self::I_DOT_ACUTE)),
            '\u{128}' => return Some(FullMappingResult::String(Self::I_DOT_TILDE)),
            _ => {}
        }
    }
    if locale == CaseMapLocale::Turkish {
        match c {
            // I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
            '\u{130}' => return Some(FullMappingResult::CodePoint('i')),
            // When lowercasing, remove dot_above in the sequence I + dot_above,
            // which will turn into i. This matches the behaviour of the
            // canonically equivalent I-dot_above.
            //
            // In a titlecase context this must not apply when the I was at the
            // beginning of the string, as that I and its marks are handled by the
            // uppercasing rules instead.
            '\u{307}' if context.preceded_by_capital_i::<IS_TITLE_CONTEXT>(self) => {
                return Some(FullMappingResult::Remove);
            }
            // When lowercasing, unless an I is before a dot_above, it turns
            // into a dotless i.
            'I' if !context.followed_by_dot_above(self) => {
                return Some(FullMappingResult::CodePoint('\u{131}'));
            }
            _ => {}
        }
    }
    match c {
        // Preserve canonical equivalence for I with dot. Turkic is handled above.
        '\u{130}' => Some(FullMappingResult::String(Self::I_DOT)),
        // Greek capital sigma maps to final sigma depending on the surrounding
        // cased letters.
        '\u{3a3}'
            if context.preceded_by_cased_letter(self)
                && !context.followed_by_cased_letter(self) =>
        {
            Some(FullMappingResult::CodePoint('\u{3c2}'))
        }
        // No relevant special case mapping. Use a normal mapping.
        _ => None,
    }
}
/// Hard-coded conditional uppercase/titlecase mappings.
///
/// Returns `Some` for the Turkic dotted I, the Lithuanian dot removal, and the
/// Armenian ech-yiwn ligature; `None` means the regular mapping should be used.
fn full_upper_or_title_special_case<const IS_TITLE_CONTEXT: bool>(
    &self,
    c: char,
    context: ContextIterator,
    locale: CaseMapLocale,
) -> Option<FullMappingResult> {
    match (locale, c) {
        // In Turkic languages, i turns into a dotted capital I.
        (CaseMapLocale::Turkish, 'i') => Some(FullMappingResult::CodePoint('\u{130}')),
        // Lithuanian retains the dot in a lowercase i when followed by accents.
        // Remove dot_above after i with upper or titlecase.
        (CaseMapLocale::Lithuanian, '\u{307}') if context.preceded_by_soft_dotted(self) => {
            Some(FullMappingResult::Remove)
        }
        // ICU4C's non-standard extension for Armenian ligature ech-yiwn.
        (_, '\u{587}') => {
            let expansion = match (locale, IS_TITLE_CONTEXT) {
                (CaseMapLocale::Armenian, false) => "ԵՎ",
                (CaseMapLocale::Armenian, true) => "Եվ",
                (_, false) => "ԵՒ",
                (_, true) => "Եւ",
            };
            Some(FullMappingResult::String(expansion))
        }
        _ => None,
    }
}
/// Hard-coded full case foldings for U+0049 and U+0130.
///
/// The Turkic mappings are used when `locale` is `CaseMapLocale::Turkish`, the
/// default mappings otherwise. Any other character yields `None`. The context
/// is accepted for signature parity with the other special-case helpers but is
/// not inspected.
fn full_fold_special_case(
    &self,
    c: char,
    _context: ContextIterator,
    locale: CaseMapLocale,
) -> Option<FullMappingResult> {
    let turkic = locale == CaseMapLocale::Turkish;
    let folded = match c {
        // Turkic mappings
        'I' if turkic => FullMappingResult::CodePoint('\u{131}'),
        '\u{130}' if turkic => FullMappingResult::CodePoint('i'),
        // Default mappings
        'I' => FullMappingResult::CodePoint('i'),
        '\u{130}' => FullMappingResult::String(Self::I_DOT),
        _ => return None,
    };
    Some(folded)
}
/// Bundles the inputs of a full case mapping into a [`FullCaseWriteable`]; the
/// mapping itself is performed when the returned value is written.
///
/// IS_TITLE_CONTEXT is true iff the mapping is MappingKind::Title. It primarily
/// exists to avoid perf impacts on other, more common modes of operation.
///
/// titlecase_tail_casing is only read in IS_TITLE_CONTEXT
pub(crate) fn full_helper_writeable<'a: 'data, const IS_TITLE_CONTEXT: bool>(
    &'a self,
    src: &'a str,
    locale: CaseMapLocale,
    mapping: MappingKind,
    titlecase_tail_casing: TrailingCase,
) -> FullCaseWriteable<'a, IS_TITLE_CONTEXT> {
    let wants_title = mapping == MappingKind::Title;
    // The const parameter and the requested mapping must agree: both set or both unset.
    debug_assert!(IS_TITLE_CONTEXT == wants_title);
    FullCaseWriteable {
        data: self,
        src,
        locale,
        mapping,
        titlecase_tail_casing,
    }
}
/// Adds all simple case mappings and the full case folding for `c` to `set`.
/// Also adds special case closure mappings.
/// The character itself is not added.
/// For example, the mappings
/// - for s include long s
/// - for sharp s include ss
/// - for k include the Kelvin sign
pub(crate) fn add_case_closure_to<S: ClosureSink>(&self, c: char, set: &mut S) {
    // The closure of i and its relatives is hardcoded and the data file entries
    // for these characters are ignored: the Turkic dotless i and dotted I, with
    // their case mapping conditions and case folding option, make the related
    // characters behave specially. This matches their closure behavior to their
    // case folding behavior.
    match c {
        // Regular i and I are in one equivalence class.
        'I' => {
            set.add_char('i');
            return;
        }
        'i' => {
            set.add_char('I');
            return;
        }
        // Dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>)
        '\u{130}' => {
            set.add_string(Self::I_DOT);
            return;
        }
        // Dotless i is in a class by itself
        '\u{131}' => return,
        _ => {}
    }

    let data = self.lookup_data(c);
    if data.has_exception() {
        // c has exceptions, so there may be multiple simple and/or full case mappings.
        let exception = self.exceptions.get(data.exception_index());
        // Add all simple case mappings.
        for slot in [
            ExceptionSlot::Lower,
            ExceptionSlot::Fold,
            ExceptionSlot::Upper,
            ExceptionSlot::Title,
        ] {
            if let Some(slot_char) = exception.get_char_slot(slot) {
                set.add_char(slot_char);
            }
        }
        if let Some(delta_char) = exception.get_simple_case_slot_for(c) {
            set.add_char(delta_char);
        }
        exception.add_full_and_closure_mappings(set);
        return;
    }

    // No exception entry: at most one simple mapping, expressed as a delta.
    if data.case_type().is_none() {
        return;
    }
    let delta = data.delta() as i32;
    if delta == 0 {
        return;
    }
    // Add the one simple case mapping, no matter what type it is.
    // GIGO: delta should be valid
    let mapped = char::from_u32((c as i32 + delta) as u32).unwrap_or(c);
    set.add_char(mapped);
}
/// Maps the string to single code points and adds the associated case closure
/// mappings.
///
/// Returns `true` when `s` has an entry in `unfold_data`; in that case every
/// character of the entry, plus that character's own case closure, is added to
/// `set`. Returns `false` when `s` has fewer than two characters or no entry.
///
/// (see docs on CaseMapper::add_string_case_closure_to)
pub(crate) fn add_string_case_closure_to<S: ClosureSink>(
    &self,
    s: &str,
    set: &mut S,
    unfold_data: &CaseMapUnfold,
) -> bool {
    // The string is too short to find any match unless it has a second character.
    // Probing for that character stops after at most two decodes, instead of
    // counting every character of a potentially long string.
    if s.chars().nth(1).is_none() {
        return false;
    }
    match unfold_data.get(s) {
        Some(closure_string) => {
            for c in closure_string.chars() {
                set.add_char(c);
                self.add_case_closure_to(c, set);
            }
            true
        }
        None => false,
    }
}
}
// An internal representation of locale. Non-Root values of this
// enumeration imply that hard-coded special cases exist for this
// language.
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
pub enum CaseMapLocale {
// Any language without hard-coded special cases.
Root,
// Turkic behavior; `from_langid` selects this for both `tr` and `az`.
Turkish,
// Selected by `from_langid` for `lt`.
Lithuanian,
// Selected by `from_langid` for `el`.
Greek,
// Selected by `from_langid` for `nl`.
Dutch,
// Selected by `from_langid` for `hy`.
Armenian,
}
impl CaseMapLocale {
    /// Picks the hard-coded behavior for a language identifier.
    ///
    /// Only the language subtag is inspected. Languages without hard-coded
    /// special cases map to `Root`.
    pub const fn from_langid(langid: &LanguageIdentifier) -> Self {
        use icu_locale_core::subtags::{language, Language};

        const TURKISH: Language = language!("tr");
        const AZERBAIJANI: Language = language!("az");
        const LITHUANIAN: Language = language!("lt");
        const GREEK: Language = language!("el");
        const DUTCH: Language = language!("nl");
        const ARMENIAN: Language = language!("hy");

        match langid.language {
            // Azerbaijani shares the Turkic dotted/dotless i behavior.
            TURKISH | AZERBAIJANI => Self::Turkish,
            LITHUANIAN => Self::Lithuanian,
            GREEK => Self::Greek,
            DUTCH => Self::Dutch,
            ARMENIAN => Self::Armenian,
            _ => Self::Root,
        }
    }
}
// The outcome of a hard-coded special-case mapping.
pub enum FullMappingResult<'a> {
// The character is dropped; nothing is written for it.
Remove,
// The character maps to a single code point.
CodePoint(char),
// The character maps to a string.
String(&'a str),
}
impl FullMappingResult<'_> {
    /// Adds the mapped code point or string to `set`; `Remove` adds nothing.
    #[allow(dead_code)]
    fn add_to_set<S: ClosureSink>(&self, set: &mut S) {
        match self {
            Self::Remove => {}
            Self::CodePoint(c) => set.add_char(*c),
            Self::String(s) => set.add_string(s),
        }
    }
}
impl Writeable for FullMappingResult<'_> {
    /// Writes the mapped code point or string to `sink`; `Remove` writes nothing
    /// and succeeds.
    fn write_to<W: fmt::Write + ?Sized>(&self, sink: &mut W) -> fmt::Result {
        match self {
            Self::Remove => Ok(()),
            Self::CodePoint(c) => sink.write_char(*c),
            Self::String(s) => sink.write_str(s),
        }
    }
}
// The text surrounding the character that is currently being mapped.
pub(crate) struct ContextIterator<'a> {
// Everything that precedes the current character.
before: &'a str,
// Everything that follows the current character (the character itself is excluded).
after: &'a str,
}
impl<'a> ContextIterator<'a> {
// Builds the context for one character from the text preceding it and the text
// that starts with the character itself. The leading character of
// `char_and_after` (if there is one) is skipped, so that `after` only holds
// what follows it.
pub fn new(before: &'a str, char_and_after: &'a str) -> Self {
    let mut following = char_and_after.chars();
    // Discard the current character; only its surroundings are kept.
    let _current = following.next();
    Self {
        before,
        after: following.as_str(),
    }
}
// Returns `diacritics` after letting it consume Greek diacritics from the text
// that follows the current character.
fn add_greek_diacritics(&self, mut diacritics: GreekDiacritics) -> GreekDiacritics {
diacritics.consume_greek_diacritics(self.after);
diacritics
}
// Delegates to `greek_to_me::preceded_by_greek_letter` on the text before the current character.
fn preceded_by_greek_letter(&self) -> bool {
greek_to_me::preceded_by_greek_letter(self.before)
}
// Delegates to `greek_to_me::preceding_greek_vowel_diacritics` on the text before
// the current character.
fn preceding_greek_vowel_diacritics(
&self,
) -> Option<GreekCombiningCharacterSequenceDiacritics> {
greek_to_me::preceding_greek_vowel_diacritics(self.before)
}
/// Walks backwards over the preceding text, skipping characters whose dot type
/// is `OtherAccent`, and reports whether the first remaining character is
/// soft-dotted. Returns false if no such character exists.
fn preceded_by_soft_dotted(&self, mapping: &CaseMap) -> bool {
    self.before
        .chars()
        .rev()
        .map(|c| mapping.dot_type(c))
        .find(|dot| *dot != DotType::OtherAccent)
        .map_or(false, |dot| dot == DotType::SoftDotted)
}
/// Checks if the preceding character is a capital I, allowing for non-Above combining characters in between.
///
/// If I_MUST_NOT_START_STRING is true, additionally will require that the capital I does not start the string
fn preceded_by_capital_i<const I_MUST_NOT_START_STRING: bool>(
    &self,
    mapping: &CaseMap,
) -> bool {
    let mut preceding = self.before.chars().rev();
    loop {
        match preceding.next() {
            // Found the I. When required, make sure something comes before it.
            Some('I') => return !I_MUST_NOT_START_STRING || preceding.next().is_some(),
            // Keep skipping characters whose dot type is `OtherAccent`.
            Some(other) if mapping.dot_type(other) == DotType::OtherAccent => continue,
            // Any other character, or the start of the text, ends the search.
            _ => return false,
        }
    }
}
/// Walks backwards over the preceding text, skipping ignorable characters, and
/// reports whether the first remaining character has a case type. Returns false
/// if every preceding character is ignorable or there is none.
fn preceded_by_cased_letter(&self, mapping: &CaseMap) -> bool {
    self.before
        .chars()
        .rev()
        .map(|c| mapping.lookup_data(c))
        .find(|data| !data.is_ignorable())
        .map_or(false, |data| data.case_type().is_some())
}
/// Walks forwards over the following text, skipping ignorable characters, and
/// reports whether the first remaining character has a case type. Returns false
/// if every following character is ignorable or there is none.
fn followed_by_cased_letter(&self, mapping: &CaseMap) -> bool {
    self.after
        .chars()
        .map(|c| mapping.lookup_data(c))
        .find(|data| !data.is_ignorable())
        .map_or(false, |data| data.case_type().is_some())
}
/// Walks forwards over the following text, skipping characters whose dot type
/// is `OtherAccent`, and reports whether the first remaining character has dot
/// type `Above`. Returns false if no such character exists.
fn followed_by_more_above(&self, mapping: &CaseMap) -> bool {
    self.after
        .chars()
        .map(|c| mapping.dot_type(c))
        .find(|dot| *dot != DotType::OtherAccent)
        .map_or(false, |dot| dot == DotType::Above)
}
/// Reports whether a combining dot above (U+0307) follows, with only characters
/// of dot type `OtherAccent` allowed in between.
fn followed_by_dot_above(&self, mapping: &CaseMap) -> bool {
    // The scan stops at the first character that is either the dot above itself
    // or something other than an `OtherAccent` character.
    let stop = self
        .after
        .chars()
        .find(|&c| c == '\u{307}' || mapping.dot_type(c) != DotType::OtherAccent);
    stop == Some('\u{307}')
}
/// Checks the preceding and following context of a j or J
/// and returns true if it is preceded by an i or I at the start of the string.
/// If one has an acute accent,
/// both must have the accent for this to return true. No other accents are handled.
fn is_dutch_ij_pair_at_beginning(&self, mapping: &CaseMap) -> bool {
// Walk backwards from the J: combining acutes are noted and skipped until the
// base I (plain or precomposed with acute) is found; anything else fails.
let mut before = self.before.chars().rev();
let mut i_has_acute = false;
loop {
match before.next() {
Some('i') | Some('I') => break,
Some('í') | Some('Í') => {
i_has_acute = true;
break;
}
Some(ACUTE) => i_has_acute = true,
_ => return false,
}
}
if before.next().is_some() {
// not at the beginning of a string, doesn't matter
return false;
}
// Now inspect what follows the J.
let mut j_has_acute = false;
for c in self.after.chars() {
if c == ACUTE {
j_has_acute = true;
continue;
}
// We are supposed to check that `j` has no other combining marks aside
// from potentially an acute accent. Once we hit the first non-combining mark
// we are done.
//
// ICU4C checks for `gc=Mn` to determine if something is a combining mark,
// however this requires extra data (and is the *only* point in the casemapping algorithm
// where there is a direct dependency on properties data not mediated by the casemapping data trie).
//
// Instead, we can check for ccc via dot_type, the same way the rest of the algorithm does.
//
// See https://unicode-org.atlassian.net/browse/ICU-22429
match mapping.dot_type(c) {
// Not a combining character; ccc = 0
DotType::NoDot | DotType::SoftDotted => break,
// found combining character, bail
_ => return false,
}
}
// either both should have an acute accent, or none. this is an XNOR operation
!(j_has_acute ^ i_has_acute)
}
}