blob: dc44463108ab7e00be2923e460779b0022d60596 [file] [log] [blame]
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! The primary per-codepoint casefolding data
#[cfg(feature = "datagen")]
use alloc::collections::BTreeMap;
use core::num::TryFromIntError;
use icu_collections::codepointtrie::TrieValue;
use zerovec::ule::{AsULE, RawBytesULE, UleError, ULE};
/// The case of a Unicode character
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_casemap::provider::data))]
pub enum CaseType {
/// Lowercase letter
Lower = 1,
/// Uppercase letter
Upper = 2,
/// Titlecase letter
Title = 3,
}
impl CaseType {
pub(crate) const CASE_MASK: u16 = 0x3;
// The casetype is stored in the codepoint trie as two bits.
// After masking them to get a value between 0 and 3, this
// function converts to `CaseType`.
//
// Returns `None` for uncased
#[inline]
pub(crate) fn from_masked_bits(b: u16) -> Option<Self> {
debug_assert!(b & Self::CASE_MASK == b);
match b {
0 => None,
1 => Some(CaseType::Lower),
2 => Some(CaseType::Upper),
_ => Some(CaseType::Title),
}
}
}
/// The dot type of a Unicode character. This indicates how dotted
/// letters (like `i` and `j`) combine with accents placed above the
/// letter.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_casemap::provider::data))]
#[derive(Default)]
pub enum DotType {
/// Normal characters with combining class 0
#[default]
NoDot = 0,
/// Soft-dotted characters with combining class 0
SoftDotted = 1,
/// "Above" accents with combining class 230
Above = 2,
/// Other accent characters
OtherAccent = 3,
}
impl DotType {
pub(crate) const DOT_MASK: u16 = 0x3;
// The dot type is stored in either the codepoint trie or the
// exception table as two bits. After shifting and masking them
// to get a value between 0 and 3, this function converts to
// DotType.
#[inline]
pub(crate) fn from_masked_bits(b: u16) -> Self {
debug_assert!(b & Self::DOT_MASK == b);
match b {
0 => DotType::NoDot,
1 => DotType::SoftDotted,
2 => DotType::Above,
_ => DotType::OtherAccent,
}
}
}
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub(crate) enum MappingKind {
Lower = 0,
Fold = 1,
Upper = 2,
Title = 3,
}
/// Case mapping data associated with a single code point
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_casemap::provider::data))]
pub struct CaseMapData {
/// Whether this is default-ignoreable
pub ignoreable: bool,
/// The rest of the case mapping data
pub kind: CaseMapDataKind,
}
/// A subset of case mapping data associated with a single code point
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_casemap::provider::data))]
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub enum CaseMapDataKind {
/// This code point is an exception. Provides the case type of its own case
/// and the exception index stored in [`CaseMapExceptions`]
///
/// [`CaseMapExceptions`]: crate::provider::exceptions::CaseMapExceptions
Exception(Option<CaseType>, u16),
/// This code point is uncased, and has the following extra data
Uncased(NonExceptionData),
/// This code point is cased. We store the extra data, its case type, and a *delta*
/// that can be used to get its casemapped codepoint.
Delta(NonExceptionData, CaseType, i16),
}
/// Data that is stored in CaseMapData when it is *not* an exception
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_casemap::provider::data))]
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub struct NonExceptionData {
/// Whether or not the type is case-sensitive
pub sensitive: bool,
/// The "dot type"
pub dot_type: DotType,
}
impl CaseMapData {
#[inline]
pub(crate) fn case_type(self) -> Option<CaseType> {
match self.kind {
CaseMapDataKind::Exception(case_type, ..) => case_type,
CaseMapDataKind::Delta(_, case_type, _) => Some(case_type),
CaseMapDataKind::Uncased(..) => None,
}
}
#[inline]
pub(crate) fn is_upper_or_title(self) -> bool {
match self.case_type() {
None | Some(CaseType::Lower) => false,
Some(CaseType::Upper) | Some(CaseType::Title) => true,
}
}
#[inline]
pub(crate) fn is_relevant_to(self, kind: MappingKind) -> bool {
match kind {
MappingKind::Lower | MappingKind::Fold => self.is_upper_or_title(),
MappingKind::Upper | MappingKind::Title => self.case_type() == Some(CaseType::Lower),
}
}
#[inline]
pub(crate) fn is_ignorable(self) -> bool {
self.ignoreable
}
#[inline]
pub(crate) fn has_exception(self) -> bool {
matches!(self.kind, CaseMapDataKind::Exception(..))
}
// Returns true if this code point is case-sensitive.
// only in the non-exception case
// This is not currently exposed.
#[inline]
pub(crate) fn is_sensitive(self) -> bool {
match self.kind {
CaseMapDataKind::Exception(..) => false,
CaseMapDataKind::Delta(ned, ..) => ned.sensitive,
CaseMapDataKind::Uncased(ned) => ned.sensitive,
}
}
#[inline]
pub(crate) fn dot_type(self) -> DotType {
match self.kind {
CaseMapDataKind::Exception(..) => DotType::NoDot,
CaseMapDataKind::Delta(ned, ..) => ned.dot_type,
CaseMapDataKind::Uncased(ned) => ned.dot_type,
}
}
// The delta between this code point and its upper/lowercase equivalent.
// This should only be called for codepoints without exception data.
//
// Returns 0 for uncased types
#[inline]
pub(crate) fn delta(self) -> i16 {
debug_assert!(!self.has_exception());
match self.kind {
CaseMapDataKind::Exception(..) => 0,
CaseMapDataKind::Delta(.., delta) => delta,
CaseMapDataKind::Uncased(..) => 0,
}
}
// The index of the exception data for this codepoint in the exception
// table. This should only be called for codepoints with exception data.
#[inline]
pub(crate) fn exception_index(self) -> u16 {
debug_assert!(self.has_exception());
if let CaseMapDataKind::Exception(_, i) = self.kind {
i
} else {
0
}
}
// CaseMapExceptionsBuilder moves the full mapping and closure
// strings out of the exception table itself. This means that the
// exception index for a code point in ICU4X will be different
// from the exception index for the same codepoint in ICU4C. Given
// a mapping from old to new, this function updates the exception
// index if necessary.
#[cfg(feature = "datagen")]
pub(crate) fn with_updated_exception(self, updates: &BTreeMap<u16, u16>) -> Self {
let kind = if let CaseMapDataKind::Exception(ty, index) = self.kind {
if let Some(updated_exception) = updates.get(&index) {
CaseMapDataKind::Exception(ty, *updated_exception)
} else {
self.kind
}
} else {
self.kind
};
Self { kind, ..self }
}
/// Attempt to construct from ICU-format integer
#[cfg(any(feature = "datagen", test))]
pub(crate) fn try_from_icu_integer(int: u16) -> Result<Self, UleError> {
let raw = int.to_unaligned();
CaseMapDataULE::validate_bytes(raw.as_bytes())?;
let this = Self::from_unaligned(CaseMapDataULE(raw));
Ok(this)
}
}
impl TrieValue for CaseMapData {
type TryFromU32Error = TryFromIntError;
fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> {
u16::try_from(i).map(|u| AsULE::from_unaligned(CaseMapDataULE(u.to_unaligned())))
}
fn to_u32(self) -> u32 {
u32::from(self.to_unaligned().0.as_unsigned_int())
}
}
/// Packed casemappingdata type
///
/// Data format, copied from ICU4C casepropsbuilder.cpp:
///
/// ```text
/// Trie data word:
/// Bits
/// if(exception) {
/// 15..4 unsigned exception index
/// } else {
/// if(not uncased) {
/// 15..7 signed delta to simple case mapping code point
/// (add delta to input code point)
/// } else {
/// 15..7 reserved, 0
/// }
/// 6..5 0 normal character with cc=0
/// 1 soft-dotted character
/// 2 cc=230
/// 3 other cc
/// The runtime code relies on these two bits to be adjacent with this encoding.
/// }
/// 4 case-sensitive
/// 3 exception
/// 2 case-ignorable
/// 1..0 0 uncased
/// 1 lowercase
/// 2 uppercase
/// 3 titlecase
/// The runtime code relies on the case-ignorable and case type bits 2..0
/// to be the lowest bits with this encoding.
/// ```
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
#[repr(transparent)]
pub struct CaseMapDataULE(RawBytesULE<2>);
impl CaseMapDataULE {
// 1..0 case type
const CASE_TYPE_BITS: u16 = 0x3;
// 2 case-ignorable
const CASE_IGNOREABLE_BIT: u16 = 0x4;
// 3 exception
const EXCEPTION_BIT: u16 = 0x8;
// 4 case-sensitive
const CASE_SENSITIVE_BIT: u16 = 0x10;
// 15..4 unsigned exception index
const EXCEPTION_SHIFT: u16 = 4;
// 15..7 signed-delta to simple case mapping code point (or reserved)
const DELTA_SHIFT: u16 = 7;
// 6..5 dot type
const DOT_TYPE_BITS: u16 = 0x60;
const DOT_SHIFT: u16 = 5;
}
/// # Safety
///
/// Safety checklist for `ULE`:
///
/// 1. The type *must not* include any uninitialized or padding bytes: repr(transparent)
/// wrapper around ULE type
/// 2. The type must have an alignment of 1 byte: repr(transparent) wrapper around ULE type
/// 3. The impl of [`ULE::validate_bytes()`] *must* return an error if the given byte slice
/// would not represent a valid slice of this type: It does
/// 4. The impl of [`ULE::validate_bytes()`] *must* return an error if the given byte slice
/// cannot be used in its entirety (if its length is not a multiple of `size_of::<Self>()`):
/// it does, due to the RawBytesULE parse call
/// 5. All other methods *must* be left with their default impl, or else implemented according to
/// their respective safety guidelines: They have been
/// 6. The equality invariant is satisfied
unsafe impl ULE for CaseMapDataULE {
fn validate_bytes(bytes: &[u8]) -> Result<(), UleError> {
let sixteens = RawBytesULE::<2>::parse_bytes_to_slice(bytes)?;
for sixteen in sixteens {
let sixteen = sixteen.as_unsigned_int();
// The type has reserved bits in the
// uncased + not exception case
if sixteen & Self::EXCEPTION_BIT == 0 {
// not an exception
if sixteen & Self::CASE_TYPE_BITS == 0 {
// uncased
if sixteen >> Self::DELTA_SHIFT != 0 {
// We have some used bits in the reserved zone!
return Err(UleError::parse::<Self>());
}
}
}
}
Ok(())
}
}
impl AsULE for CaseMapData {
type ULE = CaseMapDataULE;
fn from_unaligned(ule: Self::ULE) -> Self {
let sixteen = ule.0.as_unsigned_int();
let ignoreable = (sixteen & CaseMapDataULE::CASE_IGNOREABLE_BIT) != 0;
let exception = (sixteen & CaseMapDataULE::EXCEPTION_BIT) != 0;
let case_type = sixteen & CaseMapDataULE::CASE_TYPE_BITS;
let case_type = CaseType::from_masked_bits(case_type);
let kind = if exception {
// No need to mask first since the exception bits start at 15
let exception = sixteen >> CaseMapDataULE::EXCEPTION_SHIFT;
CaseMapDataKind::Exception(case_type, exception)
} else {
let dot_type = (sixteen & CaseMapDataULE::DOT_TYPE_BITS) >> CaseMapDataULE::DOT_SHIFT;
let dot_type = DotType::from_masked_bits(dot_type);
let sensitive = (sixteen & CaseMapDataULE::CASE_SENSITIVE_BIT) != 0;
let ned = NonExceptionData {
dot_type,
sensitive,
};
if let Some(case_type) = case_type {
// no need to mask first since the delta bits start at 15
// We can also cast as i16 first so we do not have to
// sign-extend later
let delta = (sixteen as i16) >> CaseMapDataULE::DELTA_SHIFT;
CaseMapDataKind::Delta(ned, case_type, delta)
} else {
CaseMapDataKind::Uncased(ned)
}
};
CaseMapData { ignoreable, kind }
}
fn to_unaligned(self) -> Self::ULE {
let mut sixteen = 0;
if self.ignoreable {
sixteen |= CaseMapDataULE::CASE_IGNOREABLE_BIT;
}
match self.kind {
CaseMapDataKind::Exception(case_type, e) => {
sixteen |= CaseMapDataULE::EXCEPTION_BIT;
sixteen |= e << CaseMapDataULE::EXCEPTION_SHIFT;
sixteen |= case_type.map(|c| c as u16).unwrap_or(0);
}
CaseMapDataKind::Uncased(ned) => {
sixteen |= (ned.dot_type as u16) << CaseMapDataULE::DOT_SHIFT;
if ned.sensitive {
sixteen |= CaseMapDataULE::CASE_SENSITIVE_BIT;
}
// Remaining bytes are left at zero
// case_type is Uncased (0)
}
CaseMapDataKind::Delta(ned, case_type, delta) => {
// First shift (which keeps the signedness), then cast to the
// right type
sixteen |= (delta << CaseMapDataULE::DELTA_SHIFT) as u16;
sixteen |= (ned.dot_type as u16) << CaseMapDataULE::DOT_SHIFT;
if ned.sensitive {
sixteen |= CaseMapDataULE::CASE_SENSITIVE_BIT;
}
sixteen |= case_type as u16;
}
}
CaseMapDataULE(sixteen.to_unaligned())
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_roundtrip() {
const TESTCASES: &[CaseMapData] = &[
CaseMapData {
ignoreable: true,
kind: CaseMapDataKind::Exception(Some(CaseType::Title), 923),
},
CaseMapData {
ignoreable: false,
kind: CaseMapDataKind::Exception(None, 923),
},
CaseMapData {
ignoreable: true,
kind: CaseMapDataKind::Delta(
NonExceptionData {
sensitive: true,
dot_type: DotType::SoftDotted,
},
CaseType::Upper,
50,
),
},
CaseMapData {
ignoreable: false,
kind: CaseMapDataKind::Delta(
NonExceptionData {
sensitive: true,
dot_type: DotType::SoftDotted,
},
CaseType::Upper,
-50,
),
},
CaseMapData {
ignoreable: false,
kind: CaseMapDataKind::Uncased(NonExceptionData {
sensitive: false,
dot_type: DotType::SoftDotted,
}),
},
];
for case in TESTCASES {
let ule = case.to_unaligned();
let roundtrip = CaseMapData::from_unaligned(ule);
assert_eq!(*case, roundtrip);
let integer = ule.0.as_unsigned_int();
let roundtrip2 = CaseMapData::try_from_icu_integer(integer).unwrap();
assert_eq!(*case, roundtrip2);
}
}
#[test]
fn test_integer_roundtrip() {
// Buggy roundtrip cases go here
fn test_single_integer(int: u16) {
let cmd = CaseMapData::try_from_icu_integer(int).unwrap();
assert_eq!(int, cmd.to_unaligned().0.as_unsigned_int())
}
test_single_integer(84);
test_single_integer(2503);
}
}