| /*! |
| Crate `utf8-ranges` converts ranges of Unicode scalar values to equivalent |
| ranges of UTF-8 bytes. This is useful for constructing byte based automatons |
| that need to embed UTF-8 decoding. |
| |
| See the documentation on the `Utf8Sequences` iterator for more details and |
| an example. |
| |
| # Wait, what is this? |
| |
| This is simplest to explain with an example. Let's say you wanted to test |
| whether a particular byte sequence was a Cyrillic character. One possible |
| scalar value range is `[0400-04FF]`. The set of allowed bytes for this |
| range can be expressed as a sequence of byte ranges: |
| |
| ```ignore |
| [D0-D3][80-BF] |
| ``` |
| |
| This is simple enough: simply encode the boundaries, `0400` encodes to |
| `D0 80` and `04FF` encodes to `D3 BF`, and create ranges from each |
| corresponding pair of bytes: `D0` to `D3` and `80` to `BF`. |
| |
| However, what if you wanted to add the Cyrillic Supplementary characters to |
| your range? Your range might then become `[0400-052F]`. The same procedure |
| as above doesn't quite work because `052F` encodes to `D4 AF`. The byte ranges |
| you'd get from the previous transformation would be `[D0-D4][80-AF]`. However, |
| this isn't quite correct because this range doesn't capture many characters, |
| for example, `04FF` (because its last byte, `BF` isn't in the range `80-AF`). |
| |
| Instead, you need multiple sequences of byte ranges: |
| |
| ```ignore |
| [D0-D3][80-BF] # matches codepoints 0400-04FF |
| [D4][80-AF] # matches codepoints 0500-052F |
| ``` |
| |
| This gets even more complicated if you want bigger ranges, particularly if |
| they naively contain surrogate codepoints. For example, the sequence of byte |
| ranges for the basic multilingual plane (`[0000-FFFF]`) look like this: |
| |
| ```ignore |
| [0-7F] |
| [C2-DF][80-BF] |
| [E0][A0-BF][80-BF] |
| [E1-EC][80-BF][80-BF] |
| [ED][80-9F][80-BF] |
| [EE-EF][80-BF][80-BF] |
| ``` |
| |
| Note that the byte ranges above will *not* match any erroneous encoding of |
| UTF-8, including encodings of surrogate codepoints. |
| |
| And, of course, for all of Unicode (`[000000-10FFFF]`): |
| |
| ```ignore |
| [0-7F] |
| [C2-DF][80-BF] |
| [E0][A0-BF][80-BF] |
| [E1-EC][80-BF][80-BF] |
| [ED][80-9F][80-BF] |
| [EE-EF][80-BF][80-BF] |
| [F0][90-BF][80-BF][80-BF] |
| [F1-F3][80-BF][80-BF][80-BF] |
| [F4][80-8F][80-BF][80-BF] |
| ``` |
| |
| This crate automates the process of creating these byte ranges from ranges of |
| Unicode scalar values. |
| |
| # Why would I ever use this? |
| |
| You probably won't ever need this. In 99% of cases, you just decode the byte |
| sequence into a Unicode scalar value and compare scalar values directly. |
| However, this explicit decoding step isn't always possible. For example, the |
| construction of some finite state machines may benefit from converting ranges |
| of scalar values into UTF-8 decoder automata (e.g., for character classes in |
| regular expressions). |
| |
| # Lineage |
| |
| I got the idea and general implementation strategy from Russ Cox in his |
| [article on regexps](https://web.archive.org/web/20160404141123/https://swtch.com/~rsc/regexp/regexp3.html) and RE2. |
| Russ Cox got it from Ken Thompson's `grep` (no source, folk lore?). |
| I also got the idea from |
| [Lucene](https://github.com/apache/lucene-solr/blob/ae93f4e7ac6a3908046391de35d4f50a0d3c59ca/lucene/core/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.java), |
| which uses it for executing automata on their term index. |
| */ |
| |
| #![deny(missing_docs)] |
| |
| #[cfg(test)] |
| extern crate quickcheck; |
| #[cfg(test)] |
| #[macro_use] |
| extern crate doc_comment; |
| |
| #[cfg(test)] |
| doctest!("../README.md"); |
| |
| use std::char; |
| use std::fmt; |
| use std::slice; |
| |
| use char_utf8::encode_utf8; |
| |
| const MAX_UTF8_BYTES: usize = 4; |
| |
| mod char_utf8; |
| |
| /// Utf8Sequence represents a sequence of byte ranges. |
| /// |
| /// To match a Utf8Sequence, a candidate byte sequence must match each |
| /// successive range. |
| /// |
| /// For example, if there are two ranges, `[C2-DF][80-BF]`, then the byte |
| /// sequence `\xDD\x61` would not match because `0x61 < 0x80`. |
| #[derive(Copy, Clone, Eq, PartialEq)] |
| pub enum Utf8Sequence { |
| /// One byte range. |
| One(Utf8Range), |
| /// Two successive byte ranges. |
| Two([Utf8Range; 2]), |
| /// Three successive byte ranges. |
| Three([Utf8Range; 3]), |
| /// Four successive byte ranges. |
| Four([Utf8Range; 4]), |
| } |
| |
| impl Utf8Sequence { |
| /// Creates a new UTF-8 sequence from the encoded bytes of a scalar value |
| /// range. |
| /// |
| /// This assumes that `start` and `end` have the same length. |
| fn from_encoded_range(start: &[u8], end: &[u8]) -> Self { |
| assert_eq!(start.len(), end.len()); |
| match start.len() { |
| 2 => Utf8Sequence::Two([ |
| Utf8Range::new(start[0], end[0]), |
| Utf8Range::new(start[1], end[1]), |
| ]), |
| 3 => Utf8Sequence::Three([ |
| Utf8Range::new(start[0], end[0]), |
| Utf8Range::new(start[1], end[1]), |
| Utf8Range::new(start[2], end[2]), |
| ]), |
| 4 => Utf8Sequence::Four([ |
| Utf8Range::new(start[0], end[0]), |
| Utf8Range::new(start[1], end[1]), |
| Utf8Range::new(start[2], end[2]), |
| Utf8Range::new(start[3], end[3]), |
| ]), |
| n => unreachable!("invalid encoded length: {}", n), |
| } |
| } |
| |
| /// Returns the underlying sequence of byte ranges as a slice. |
| pub fn as_slice(&self) -> &[Utf8Range] { |
| use self::Utf8Sequence::*; |
| match *self { |
| One(ref r) => unsafe { slice::from_raw_parts(r, 1) }, |
| Two(ref r) => &r[..], |
| Three(ref r) => &r[..], |
| Four(ref r) => &r[..], |
| } |
| } |
| |
| /// Returns the number of byte ranges in this sequence. |
| /// |
| /// The length is guaranteed to be in the closed interval `[1, 4]`. |
| pub fn len(&self) -> usize { |
| self.as_slice().len() |
| } |
| |
| /// Returns true if and only if a prefix of `bytes` matches this sequence |
| /// of byte ranges. |
| pub fn matches(&self, bytes: &[u8]) -> bool { |
| if bytes.len() < self.len() { |
| return false; |
| } |
| for (&b, r) in bytes.iter().zip(self) { |
| if !r.matches(b) { |
| return false; |
| } |
| } |
| true |
| } |
| } |
| |
| impl<'a> IntoIterator for &'a Utf8Sequence { |
| type IntoIter = slice::Iter<'a, Utf8Range>; |
| type Item = &'a Utf8Range; |
| |
| fn into_iter(self) -> Self::IntoIter { |
| self.as_slice().into_iter() |
| } |
| } |
| |
| impl fmt::Debug for Utf8Sequence { |
| fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
| use self::Utf8Sequence::*; |
| match *self { |
| One(ref r) => write!(f, "{:?}", r), |
| Two(ref r) => write!(f, "{:?}{:?}", r[0], r[1]), |
| Three(ref r) => write!(f, "{:?}{:?}{:?}", r[0], r[1], r[2]), |
| Four(ref r) => write!(f, "{:?}{:?}{:?}{:?}", |
| r[0], r[1], r[2], r[3]), |
| } |
| } |
| } |
| |
| /// A single inclusive range of UTF-8 bytes. |
| #[derive(Clone, Copy, PartialEq, Eq)] |
| pub struct Utf8Range { |
| /// Start of byte range (inclusive). |
| pub start: u8, |
| /// End of byte range (inclusive). |
| pub end: u8, |
| } |
| |
| impl Utf8Range { |
| fn new(start: u8, end: u8) -> Self { |
| Utf8Range { start: start, end: end } |
| } |
| |
| /// Returns true if and only if the given byte is in this range. |
| pub fn matches(&self, b: u8) -> bool { |
| self.start <= b && b <= self.end |
| } |
| } |
| |
| impl fmt::Debug for Utf8Range { |
| fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
| if self.start == self.end { |
| write!(f, "[{:X}]", self.start) |
| } else { |
| write!(f, "[{:X}-{:X}]", self.start, self.end) |
| } |
| } |
| } |
| |
| /// An iterator over ranges of matching UTF-8 byte sequences. |
| /// |
| /// The iteration represents an alternation of comprehensive byte sequences |
| /// that match precisely the set of UTF-8 encoded scalar values. |
| /// |
| /// A byte sequence corresponds to one of the scalar values in the range given |
| /// if and only if it completely matches exactly one of the sequences of byte |
| /// ranges produced by this iterator. |
| /// |
| /// Each sequence of byte ranges matches a unique set of bytes. That is, no two |
| /// sequences will match the same bytes. |
| /// |
| /// # Example |
| /// |
| /// This shows how to match an arbitrary byte sequence against a range of |
| /// scalar values. |
| /// |
| /// ```rust |
| /// use utf8_ranges::{Utf8Sequences, Utf8Sequence}; |
| /// |
| /// fn matches(seqs: &[Utf8Sequence], bytes: &[u8]) -> bool { |
| /// for range in seqs { |
| /// if range.matches(bytes) { |
| /// return true; |
| /// } |
| /// } |
| /// false |
| /// } |
| /// |
| /// // Test the basic multilingual plane. |
| /// let seqs: Vec<_> = Utf8Sequences::new('\u{0}', '\u{FFFF}').collect(); |
| /// |
| /// // UTF-8 encoding of 'a'. |
| /// assert!(matches(&seqs, &[0x61])); |
| /// // UTF-8 encoding of '☃' (`\u{2603}`). |
| /// assert!(matches(&seqs, &[0xE2, 0x98, 0x83])); |
| /// // UTF-8 encoding of `\u{10348}` (outside the BMP). |
| /// assert!(!matches(&seqs, &[0xF0, 0x90, 0x8D, 0x88])); |
| /// // Tries to match against a UTF-8 encoding of a surrogate codepoint, |
| /// // which is invalid UTF-8, and therefore fails, despite the fact that |
| /// // the corresponding codepoint (0xD800) falls in the range given. |
| /// assert!(!matches(&seqs, &[0xED, 0xA0, 0x80])); |
| /// // And fails against plain old invalid UTF-8. |
| /// assert!(!matches(&seqs, &[0xFF, 0xFF])); |
| /// ``` |
| /// |
| /// If this example seems circuitous, that's because it is! It's meant to be |
| /// illustrative. In practice, you could just try to decode your byte sequence |
| /// and compare it with the scalar value range directly. However, this is not |
| /// always possible (for example, in a byte based automaton). |
| pub struct Utf8Sequences { |
| range_stack: Vec<ScalarRange>, |
| } |
| |
| impl Utf8Sequences { |
| /// Create a new iterator over UTF-8 byte ranges for the scalar value range |
| /// given. |
| pub fn new(start: char, end: char) -> Self { |
| let mut it = Utf8Sequences { range_stack: vec![] }; |
| it.push(start as u32, end as u32); |
| it |
| } |
| |
| /// reset resets the scalar value range. |
| /// Any existing state is cleared, but resources may be reused. |
| /// |
| /// N.B. Benchmarks say that this method is dubious. |
| #[doc(hidden)] |
| pub fn reset(&mut self, start: char, end: char) { |
| self.range_stack.clear(); |
| self.push(start as u32, end as u32); |
| } |
| |
| fn push(&mut self, start: u32, end: u32) { |
| self.range_stack.push(ScalarRange { start: start, end: end }); |
| } |
| } |
| |
| struct ScalarRange { |
| start: u32, |
| end: u32, |
| } |
| |
| impl fmt::Debug for ScalarRange { |
| fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
| write!(f, "ScalarRange({:X}, {:X})", self.start, self.end) |
| } |
| } |
| |
| impl Iterator for Utf8Sequences { |
| type Item = Utf8Sequence; |
| |
| fn next(&mut self) -> Option<Self::Item> { |
| 'TOP: |
| while let Some(mut r) = self.range_stack.pop() { |
| 'INNER: |
| loop { |
| if let Some((r1, r2)) = r.split() { |
| self.push(r2.start, r2.end); |
| r.start = r1.start; |
| r.end = r1.end; |
| continue 'INNER; |
| } |
| if !r.is_valid() { |
| continue 'TOP; |
| } |
| for i in 1..MAX_UTF8_BYTES { |
| let max = max_scalar_value(i); |
| if r.start <= max && max < r.end { |
| self.push(max + 1, r.end); |
| r.end = max; |
| continue 'INNER; |
| } |
| } |
| if let Some(ascii_range) = r.as_ascii() { |
| return Some(Utf8Sequence::One(ascii_range)); |
| } |
| for i in 1..MAX_UTF8_BYTES { |
| let m = (1 << (6 * i)) - 1; |
| if (r.start & !m) != (r.end & !m) { |
| if (r.start & m) != 0 { |
| self.push((r.start | m) + 1, r.end); |
| r.end = r.start | m; |
| continue 'INNER; |
| } |
| if (r.end & m) != m { |
| self.push(r.end & !m, r.end); |
| r.end = (r.end & !m) - 1; |
| continue 'INNER; |
| } |
| } |
| } |
| let mut start = [0; MAX_UTF8_BYTES]; |
| let mut end = [0; MAX_UTF8_BYTES]; |
| let n = r.encode(&mut start, &mut end); |
| return Some(Utf8Sequence::from_encoded_range( |
| &start[0..n], &end[0..n])); |
| } |
| } |
| None |
| } |
| } |
| |
| impl ScalarRange { |
| /// split splits this range if it overlaps with a surrogate codepoint. |
| /// |
| /// Either or both ranges may be invalid. |
| fn split(&self) -> Option<(ScalarRange, ScalarRange)> { |
| if self.start < 0xE000 && self.end > 0xD7FF { |
| Some((ScalarRange { |
| start: self.start, |
| end: 0xD7FF, |
| }, ScalarRange { |
| start: 0xE000, |
| end: self.end, |
| })) |
| } else { |
| None |
| } |
| } |
| |
| /// is_valid returns true if and only if start <= end. |
| fn is_valid(&self) -> bool { |
| self.start <= self.end |
| } |
| |
| /// as_ascii returns this range as a Utf8Range if and only if all scalar |
| /// values in this range can be encoded as a single byte. |
| fn as_ascii(&self) -> Option<Utf8Range> { |
| if self.is_ascii() { |
| Some(Utf8Range::new(self.start as u8, self.end as u8)) |
| } else { |
| None |
| } |
| } |
| |
| /// is_ascii returns true if the range is ASCII only (i.e., takes a single |
| /// byte to encode any scalar value). |
| fn is_ascii(&self) -> bool { |
| self.is_valid() && self.end <= 0x7f |
| } |
| |
| /// encode writes the UTF-8 encoding of the start and end of this range |
| /// to the corresponding destination slices. |
| /// |
| /// The slices should have room for at least `MAX_UTF8_BYTES`. |
| fn encode(&self, start: &mut [u8], end: &mut [u8]) -> usize { |
| let cs = char::from_u32(self.start).unwrap(); |
| let ce = char::from_u32(self.end).unwrap(); |
| let n = encode_utf8(cs, start).unwrap(); |
| let m = encode_utf8(ce, end).unwrap(); |
| assert_eq!(n, m); |
| n |
| } |
| } |
| |
| fn max_scalar_value(nbytes: usize) -> u32 { |
| match nbytes { |
| 1 => 0x007F, |
| 2 => 0x07FF, |
| 3 => 0xFFFF, |
| 4 => 0x10FFFF, |
| _ => unreachable!("invalid UTF-8 byte sequence size"), |
| } |
| } |
| |
| #[cfg(test)] |
| mod tests { |
| use std::char; |
| |
| use quickcheck::{TestResult, quickcheck}; |
| |
| use char_utf8::encode_utf8; |
| use {MAX_UTF8_BYTES, Utf8Range, Utf8Sequences}; |
| |
| fn rutf8(s: u8, e: u8) -> Utf8Range { |
| Utf8Range::new(s, e) |
| } |
| |
| fn never_accepts_surrogate_codepoints(start: char, end: char) { |
| let mut buf = [0; MAX_UTF8_BYTES]; |
| for cp in 0xD800..0xE000 { |
| let c = unsafe { ::std::mem::transmute(cp) }; |
| let n = encode_utf8(c, &mut buf).unwrap(); |
| for r in Utf8Sequences::new(start, end) { |
| if r.matches(&buf[0..n]) { |
| panic!("Sequence ({:X}, {:X}) contains range {:?}, \ |
| which matches surrogate code point {:X} \ |
| with encoded bytes {:?}", |
| start as u32, end as u32, r, cp, &buf[0..n]); |
| } |
| } |
| } |
| } |
| |
| #[test] |
| fn codepoints_no_surrogates() { |
| never_accepts_surrogate_codepoints('\u{0}', '\u{FFFF}'); |
| never_accepts_surrogate_codepoints('\u{0}', '\u{10FFFF}'); |
| never_accepts_surrogate_codepoints('\u{0}', '\u{10FFFE}'); |
| never_accepts_surrogate_codepoints('\u{80}', '\u{10FFFF}'); |
| never_accepts_surrogate_codepoints('\u{D7FF}', '\u{E000}'); |
| } |
| |
| #[test] |
| fn single_codepoint_one_sequence() { |
| // Tests that every range of scalar values that contains a single |
| // scalar value is recognized by one sequence of byte ranges. |
| for i in 0x0..(0x10FFFF + 1) { |
| let c = match char::from_u32(i) { |
| None => continue, |
| Some(c) => c, |
| }; |
| let seqs: Vec<_> = Utf8Sequences::new(c, c).collect(); |
| assert_eq!(seqs.len(), 1); |
| } |
| } |
| |
| #[test] |
| fn qc_codepoints_no_surrogate() { |
| fn p(s: char, e: char) -> TestResult { |
| if s > e { |
| return TestResult::discard(); |
| } |
| never_accepts_surrogate_codepoints(s, e); |
| TestResult::passed() |
| } |
| quickcheck(p as fn(char, char) -> TestResult); |
| } |
| |
| #[test] |
| fn bmp() { |
| use Utf8Sequence::*; |
| |
| let seqs = Utf8Sequences::new('\u{0}', '\u{FFFF}') |
| .collect::<Vec<_>>(); |
| assert_eq!(seqs, vec![ |
| One(rutf8(0x0, 0x7F)), |
| Two([rutf8(0xC2, 0xDF), rutf8(0x80, 0xBF)]), |
| Three([rutf8(0xE0, 0xE0), rutf8(0xA0, 0xBF), rutf8(0x80, 0xBF)]), |
| Three([rutf8(0xE1, 0xEC), rutf8(0x80, 0xBF), rutf8(0x80, 0xBF)]), |
| Three([rutf8(0xED, 0xED), rutf8(0x80, 0x9F), rutf8(0x80, 0xBF)]), |
| Three([rutf8(0xEE, 0xEF), rutf8(0x80, 0xBF), rutf8(0x80, 0xBF)]), |
| ]); |
| } |
| |
| #[test] |
| fn scratch() { |
| for range in Utf8Sequences::new('\u{0}', '\u{FFFF}') { |
| println!("{:?}", range); |
| } |
| } |
| } |