| use core::{char, cmp, fmt, str}; |
| |
| use crate::{ascii, bstr::BStr, ext_slice::ByteSlice}; |
| |
| // The UTF-8 decoder provided here is based on the one presented here: |
| // https://bjoern.hoehrmann.de/utf-8/decoder/dfa/ |
| // |
| // We *could* have done UTF-8 decoding by using a DFA generated by `\p{any}` |
| // using regex-automata that is roughly the same size. The real benefit of |
| // Hoehrmann's formulation is that the byte class mapping below is manually |
| // tailored such that each byte's class doubles as a shift to mask out the |
| // bits necessary for constructing the leading bits of each codepoint value |
| // from the initial byte. |
| // |
| // There are some minor differences between this implementation and Hoehrmann's |
| // formulation. |
| // |
| // Firstly, we make REJECT have state ID 0, since it makes the state table |
| // itself a little easier to read and is consistent with the notion that 0 |
| // means "false" or "bad." |
| // |
| // Secondly, when doing bulk decoding, we add a SIMD accelerated ASCII fast |
| // path. |
| // |
| // Thirdly, we pre-multiply the state IDs to avoid a multiplication instruction |
| // in the core decoding loop. (Which is what regex-automata would do by |
| // default.) |
| // |
| // Fourthly, we split the byte class mapping and transition table into two |
| // arrays because it's clearer. |
| // |
| // It is unlikely that this is the fastest way to do UTF-8 decoding, however, |
| // it is fairly simple. |
| |
| const ACCEPT: usize = 12; |
| const REJECT: usize = 0; |
| |
| /// SAFETY: The decode below function relies on the correctness of these |
| /// equivalence classes. |
| #[cfg_attr(rustfmt, rustfmt::skip)] |
| const CLASSES: [u8; 256] = [ |
| 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
| 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
| 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
| 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
| 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, |
| 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, |
| 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, |
| 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, |
| ]; |
| |
| /// SAFETY: The decode below function relies on the correctness of this state |
| /// machine. |
| #[cfg_attr(rustfmt, rustfmt::skip)] |
| const STATES_FORWARD: &'static [u8] = &[ |
| 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 12, 0, 24, 36, 60, 96, 84, 0, 0, 0, 48, 72, |
| 0, 12, 0, 0, 0, 0, 0, 12, 0, 12, 0, 0, |
| 0, 24, 0, 0, 0, 0, 0, 24, 0, 24, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 24, 0, 0, 0, 0, |
| 0, 24, 0, 0, 0, 0, 0, 0, 0, 24, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 36, 0, 36, 0, 0, |
| 0, 36, 0, 0, 0, 0, 0, 36, 0, 36, 0, 0, |
| 0, 36, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| ]; |
| |
| /// An iterator over Unicode scalar values in a byte string. |
| /// |
| /// When invalid UTF-8 byte sequences are found, they are substituted with the |
| /// Unicode replacement codepoint (`U+FFFD`) using the |
| /// ["maximal subpart" strategy](https://www.unicode.org/review/pr-121.html). |
| /// |
| /// This iterator is created by the |
| /// [`chars`](trait.ByteSlice.html#method.chars) method provided by the |
| /// [`ByteSlice`](trait.ByteSlice.html) extension trait for `&[u8]`. |
| #[derive(Clone, Debug)] |
| pub struct Chars<'a> { |
| bs: &'a [u8], |
| } |
| |
| impl<'a> Chars<'a> { |
| pub(crate) fn new(bs: &'a [u8]) -> Chars<'a> { |
| Chars { bs } |
| } |
| |
| /// View the underlying data as a subslice of the original data. |
| /// |
| /// The slice returned has the same lifetime as the original slice, and so |
| /// the iterator can continue to be used while this exists. |
| /// |
| /// # Examples |
| /// |
| /// ``` |
| /// use bstr::ByteSlice; |
| /// |
| /// let mut chars = b"abc".chars(); |
| /// |
| /// assert_eq!(b"abc", chars.as_bytes()); |
| /// chars.next(); |
| /// assert_eq!(b"bc", chars.as_bytes()); |
| /// chars.next(); |
| /// chars.next(); |
| /// assert_eq!(b"", chars.as_bytes()); |
| /// ``` |
| #[inline] |
| pub fn as_bytes(&self) -> &'a [u8] { |
| self.bs |
| } |
| } |
| |
| impl<'a> Iterator for Chars<'a> { |
| type Item = char; |
| |
| #[inline] |
| fn next(&mut self) -> Option<char> { |
| let (ch, size) = decode_lossy(self.bs); |
| if size == 0 { |
| return None; |
| } |
| self.bs = &self.bs[size..]; |
| Some(ch) |
| } |
| } |
| |
| impl<'a> DoubleEndedIterator for Chars<'a> { |
| #[inline] |
| fn next_back(&mut self) -> Option<char> { |
| let (ch, size) = decode_last_lossy(self.bs); |
| if size == 0 { |
| return None; |
| } |
| self.bs = &self.bs[..self.bs.len() - size]; |
| Some(ch) |
| } |
| } |
| |
| /// An iterator over Unicode scalar values in a byte string and their |
| /// byte index positions. |
| /// |
| /// When invalid UTF-8 byte sequences are found, they are substituted with the |
| /// Unicode replacement codepoint (`U+FFFD`) using the |
| /// ["maximal subpart" strategy](https://www.unicode.org/review/pr-121.html). |
| /// |
| /// Note that this is slightly different from the `CharIndices` iterator |
| /// provided by the standard library. Aside from working on possibly invalid |
| /// UTF-8, this iterator provides both the corresponding starting and ending |
| /// byte indices of each codepoint yielded. The ending position is necessary to |
| /// slice the original byte string when invalid UTF-8 bytes are converted into |
| /// a Unicode replacement codepoint, since a single replacement codepoint can |
| /// substitute anywhere from 1 to 3 invalid bytes (inclusive). |
| /// |
| /// This iterator is created by the |
| /// [`char_indices`](trait.ByteSlice.html#method.char_indices) method provided |
| /// by the [`ByteSlice`](trait.ByteSlice.html) extension trait for `&[u8]`. |
| #[derive(Clone, Debug)] |
| pub struct CharIndices<'a> { |
| bs: &'a [u8], |
| forward_index: usize, |
| reverse_index: usize, |
| } |
| |
| impl<'a> CharIndices<'a> { |
| pub(crate) fn new(bs: &'a [u8]) -> CharIndices<'a> { |
| CharIndices { bs, forward_index: 0, reverse_index: bs.len() } |
| } |
| |
| /// View the underlying data as a subslice of the original data. |
| /// |
| /// The slice returned has the same lifetime as the original slice, and so |
| /// the iterator can continue to be used while this exists. |
| /// |
| /// # Examples |
| /// |
| /// ``` |
| /// use bstr::ByteSlice; |
| /// |
| /// let mut it = b"abc".char_indices(); |
| /// |
| /// assert_eq!(b"abc", it.as_bytes()); |
| /// it.next(); |
| /// assert_eq!(b"bc", it.as_bytes()); |
| /// it.next(); |
| /// it.next(); |
| /// assert_eq!(b"", it.as_bytes()); |
| /// ``` |
| #[inline] |
| pub fn as_bytes(&self) -> &'a [u8] { |
| self.bs |
| } |
| } |
| |
| impl<'a> Iterator for CharIndices<'a> { |
| type Item = (usize, usize, char); |
| |
| #[inline] |
| fn next(&mut self) -> Option<(usize, usize, char)> { |
| let index = self.forward_index; |
| let (ch, size) = decode_lossy(self.bs); |
| if size == 0 { |
| return None; |
| } |
| self.bs = &self.bs[size..]; |
| self.forward_index += size; |
| Some((index, index + size, ch)) |
| } |
| } |
| |
| impl<'a> DoubleEndedIterator for CharIndices<'a> { |
| #[inline] |
| fn next_back(&mut self) -> Option<(usize, usize, char)> { |
| let (ch, size) = decode_last_lossy(self.bs); |
| if size == 0 { |
| return None; |
| } |
| self.bs = &self.bs[..self.bs.len() - size]; |
| self.reverse_index -= size; |
| Some((self.reverse_index, self.reverse_index + size, ch)) |
| } |
| } |
| |
| impl<'a> ::core::iter::FusedIterator for CharIndices<'a> {} |
| |
| /// An iterator over chunks of valid UTF-8 in a byte slice. |
| /// |
| /// See [`utf8_chunks`](trait.ByteSlice.html#method.utf8_chunks). |
| #[derive(Clone, Debug)] |
| pub struct Utf8Chunks<'a> { |
| pub(super) bytes: &'a [u8], |
| } |
| |
| /// A chunk of valid UTF-8, possibly followed by invalid UTF-8 bytes. |
| /// |
| /// This is yielded by the |
| /// [`Utf8Chunks`](struct.Utf8Chunks.html) |
| /// iterator, which can be created via the |
| /// [`ByteSlice::utf8_chunks`](trait.ByteSlice.html#method.utf8_chunks) |
| /// method. |
| /// |
| /// The `'a` lifetime parameter corresponds to the lifetime of the bytes that |
| /// are being iterated over. |
| #[cfg_attr(test, derive(Debug, PartialEq))] |
| pub struct Utf8Chunk<'a> { |
| /// A valid UTF-8 piece, at the start, end, or between invalid UTF-8 bytes. |
| /// |
| /// This is empty between adjacent invalid UTF-8 byte sequences. |
| valid: &'a str, |
| /// A sequence of invalid UTF-8 bytes. |
| /// |
| /// Can only be empty in the last chunk. |
| /// |
| /// Should be replaced by a single unicode replacement character, if not |
| /// empty. |
| invalid: &'a BStr, |
| /// Indicates whether the invalid sequence could've been valid if there |
| /// were more bytes. |
| /// |
| /// Can only be true in the last chunk. |
| incomplete: bool, |
| } |
| |
| impl<'a> Utf8Chunk<'a> { |
| /// Returns the (possibly empty) valid UTF-8 bytes in this chunk. |
| /// |
| /// This may be empty if there are consecutive sequences of invalid UTF-8 |
| /// bytes. |
| #[inline] |
| pub fn valid(&self) -> &'a str { |
| self.valid |
| } |
| |
| /// Returns the (possibly empty) invalid UTF-8 bytes in this chunk that |
| /// immediately follow the valid UTF-8 bytes in this chunk. |
| /// |
| /// This is only empty when this chunk corresponds to the last chunk in |
| /// the original bytes. |
| /// |
| /// The maximum length of this slice is 3. That is, invalid UTF-8 byte |
| /// sequences greater than 1 always correspond to a valid _prefix_ of |
| /// a valid UTF-8 encoded codepoint. This corresponds to the "substitution |
| /// of maximal subparts" strategy that is described in more detail in the |
| /// docs for the |
| /// [`ByteSlice::to_str_lossy`](trait.ByteSlice.html#method.to_str_lossy) |
| /// method. |
| #[inline] |
| pub fn invalid(&self) -> &'a [u8] { |
| self.invalid.as_bytes() |
| } |
| |
| /// Returns whether the invalid sequence might still become valid if more |
| /// bytes are added. |
| /// |
| /// Returns true if the end of the input was reached unexpectedly, |
| /// without encountering an unexpected byte. |
| /// |
| /// This can only be the case for the last chunk. |
| #[inline] |
| pub fn incomplete(&self) -> bool { |
| self.incomplete |
| } |
| } |
| |
| impl<'a> Iterator for Utf8Chunks<'a> { |
| type Item = Utf8Chunk<'a>; |
| |
| #[inline] |
| fn next(&mut self) -> Option<Utf8Chunk<'a>> { |
| if self.bytes.is_empty() { |
| return None; |
| } |
| match validate(self.bytes) { |
| Ok(()) => { |
| let valid = self.bytes; |
| self.bytes = &[]; |
| Some(Utf8Chunk { |
| // SAFETY: This is safe because of the guarantees provided |
| // by utf8::validate. |
| valid: unsafe { str::from_utf8_unchecked(valid) }, |
| invalid: [].as_bstr(), |
| incomplete: false, |
| }) |
| } |
| Err(e) => { |
| let (valid, rest) = self.bytes.split_at(e.valid_up_to()); |
| // SAFETY: This is safe because of the guarantees provided by |
| // utf8::validate. |
| let valid = unsafe { str::from_utf8_unchecked(valid) }; |
| let (invalid_len, incomplete) = match e.error_len() { |
| Some(n) => (n, false), |
| None => (rest.len(), true), |
| }; |
| let (invalid, rest) = rest.split_at(invalid_len); |
| self.bytes = rest; |
| Some(Utf8Chunk { |
| valid, |
| invalid: invalid.as_bstr(), |
| incomplete, |
| }) |
| } |
| } |
| } |
| |
| #[inline] |
| fn size_hint(&self) -> (usize, Option<usize>) { |
| if self.bytes.is_empty() { |
| (0, Some(0)) |
| } else { |
| (1, Some(self.bytes.len())) |
| } |
| } |
| } |
| |
| impl<'a> ::core::iter::FusedIterator for Utf8Chunks<'a> {} |
| |
| /// An error that occurs when UTF-8 decoding fails. |
| /// |
| /// This error occurs when attempting to convert a non-UTF-8 byte |
| /// string to a Rust string that must be valid UTF-8. For example, |
| /// [`to_str`](trait.ByteSlice.html#method.to_str) is one such method. |
| /// |
| /// # Example |
| /// |
| /// This example shows what happens when a given byte sequence is invalid, |
| /// but ends with a sequence that is a possible prefix of valid UTF-8. |
| /// |
| /// ``` |
| /// use bstr::{B, ByteSlice}; |
| /// |
| /// let s = B(b"foobar\xF1\x80\x80"); |
| /// let err = s.to_str().unwrap_err(); |
| /// assert_eq!(err.valid_up_to(), 6); |
| /// assert_eq!(err.error_len(), None); |
| /// ``` |
| /// |
| /// This example shows what happens when a given byte sequence contains |
| /// invalid UTF-8. |
| /// |
| /// ``` |
| /// use bstr::ByteSlice; |
| /// |
| /// let s = b"foobar\xF1\x80\x80quux"; |
| /// let err = s.to_str().unwrap_err(); |
| /// assert_eq!(err.valid_up_to(), 6); |
| /// // The error length reports the maximum number of bytes that correspond to |
| /// // a valid prefix of a UTF-8 encoded codepoint. |
| /// assert_eq!(err.error_len(), Some(3)); |
| /// |
| /// // In contrast to the above which contains a single invalid prefix, |
| /// // consider the case of multiple individual bytes that are never valid |
| /// // prefixes. Note how the value of error_len changes! |
| /// let s = b"foobar\xFF\xFFquux"; |
| /// let err = s.to_str().unwrap_err(); |
| /// assert_eq!(err.valid_up_to(), 6); |
| /// assert_eq!(err.error_len(), Some(1)); |
| /// |
| /// // The fact that it's an invalid prefix does not change error_len even |
| /// // when it immediately precedes the end of the string. |
| /// let s = b"foobar\xFF"; |
| /// let err = s.to_str().unwrap_err(); |
| /// assert_eq!(err.valid_up_to(), 6); |
| /// assert_eq!(err.error_len(), Some(1)); |
| /// ``` |
| #[derive(Clone, Debug, Eq, PartialEq)] |
| pub struct Utf8Error { |
| valid_up_to: usize, |
| error_len: Option<usize>, |
| } |
| |
| impl Utf8Error { |
| /// Returns the byte index of the position immediately following the last |
| /// valid UTF-8 byte. |
| /// |
| /// # Example |
| /// |
| /// This examples shows how `valid_up_to` can be used to retrieve a |
| /// possibly empty prefix that is guaranteed to be valid UTF-8: |
| /// |
| /// ``` |
| /// use bstr::ByteSlice; |
| /// |
| /// let s = b"foobar\xF1\x80\x80quux"; |
| /// let err = s.to_str().unwrap_err(); |
| /// |
| /// // This is guaranteed to never panic. |
| /// let string = s[..err.valid_up_to()].to_str().unwrap(); |
| /// assert_eq!(string, "foobar"); |
| /// ``` |
| #[inline] |
| pub fn valid_up_to(&self) -> usize { |
| self.valid_up_to |
| } |
| |
| /// Returns the total number of invalid UTF-8 bytes immediately following |
| /// the position returned by `valid_up_to`. This value is always at least |
| /// `1`, but can be up to `3` if bytes form a valid prefix of some UTF-8 |
| /// encoded codepoint. |
| /// |
| /// If the end of the original input was found before a valid UTF-8 encoded |
| /// codepoint could be completed, then this returns `None`. This is useful |
| /// when processing streams, where a `None` value signals that more input |
| /// might be needed. |
| #[inline] |
| pub fn error_len(&self) -> Option<usize> { |
| self.error_len |
| } |
| } |
| |
| #[cfg(feature = "std")] |
| impl std::error::Error for Utf8Error { |
| fn description(&self) -> &str { |
| "invalid UTF-8" |
| } |
| } |
| |
| impl fmt::Display for Utf8Error { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "invalid UTF-8 found at byte offset {}", self.valid_up_to) |
| } |
| } |
| |
| /// Returns OK if and only if the given slice is completely valid UTF-8. |
| /// |
| /// If the slice isn't valid UTF-8, then an error is returned that explains |
| /// the first location at which invalid UTF-8 was detected. |
| pub fn validate(slice: &[u8]) -> Result<(), Utf8Error> { |
| // The fast path for validating UTF-8. It steps through a UTF-8 automaton |
| // and uses a SIMD accelerated ASCII fast path on x86_64. If an error is |
| // detected, it backs up and runs the slower version of the UTF-8 automaton |
| // to determine correct error information. |
| fn fast(slice: &[u8]) -> Result<(), Utf8Error> { |
| let mut state = ACCEPT; |
| let mut i = 0; |
| |
| while i < slice.len() { |
| let b = slice[i]; |
| |
| // ASCII fast path. If we see two consecutive ASCII bytes, then try |
| // to validate as much ASCII as possible very quickly. |
| if state == ACCEPT |
| && b <= 0x7F |
| && slice.get(i + 1).map_or(false, |&b| b <= 0x7F) |
| { |
| i += ascii::first_non_ascii_byte(&slice[i..]); |
| continue; |
| } |
| |
| state = step(state, b); |
| if state == REJECT { |
| return Err(find_valid_up_to(slice, i)); |
| } |
| i += 1; |
| } |
| if state != ACCEPT { |
| Err(find_valid_up_to(slice, slice.len())) |
| } else { |
| Ok(()) |
| } |
| } |
| |
| // Given the first position at which a UTF-8 sequence was determined to be |
| // invalid, return an error that correctly reports the position at which |
| // the last complete UTF-8 sequence ends. |
| #[inline(never)] |
| fn find_valid_up_to(slice: &[u8], rejected_at: usize) -> Utf8Error { |
| // In order to find the last valid byte, we need to back up an amount |
| // that guarantees every preceding byte is part of a valid UTF-8 |
| // code unit sequence. To do this, we simply locate the last leading |
| // byte that occurs before rejected_at. |
| let mut backup = rejected_at.saturating_sub(1); |
| while backup > 0 && !is_leading_or_invalid_utf8_byte(slice[backup]) { |
| backup -= 1; |
| } |
| let upto = cmp::min(slice.len(), rejected_at.saturating_add(1)); |
| let mut err = slow(&slice[backup..upto]).unwrap_err(); |
| err.valid_up_to += backup; |
| err |
| } |
| |
| // Like top-level UTF-8 decoding, except it correctly reports a UTF-8 error |
| // when an invalid sequence is found. This is split out from validate so |
| // that the fast path doesn't need to keep track of the position of the |
| // last valid UTF-8 byte. In particular, tracking this requires checking |
| // for an ACCEPT state on each byte, which degrades throughput pretty |
| // badly. |
| fn slow(slice: &[u8]) -> Result<(), Utf8Error> { |
| let mut state = ACCEPT; |
| let mut valid_up_to = 0; |
| for (i, &b) in slice.iter().enumerate() { |
| state = step(state, b); |
| if state == ACCEPT { |
| valid_up_to = i + 1; |
| } else if state == REJECT { |
| // Our error length must always be at least 1. |
| let error_len = Some(cmp::max(1, i - valid_up_to)); |
| return Err(Utf8Error { valid_up_to, error_len }); |
| } |
| } |
| if state != ACCEPT { |
| Err(Utf8Error { valid_up_to, error_len: None }) |
| } else { |
| Ok(()) |
| } |
| } |
| |
| // Advance to the next state given the current state and current byte. |
| fn step(state: usize, b: u8) -> usize { |
| let class = CLASSES[b as usize]; |
| // SAFETY: This is safe because 'class' is always <=11 and 'state' is |
| // always <=96. Therefore, the maximal index is 96+11 = 107, where |
| // STATES_FORWARD.len() = 108 such that every index is guaranteed to be |
| // valid by construction of the state machine and the byte equivalence |
| // classes. |
| unsafe { |
| *STATES_FORWARD.get_unchecked(state + class as usize) as usize |
| } |
| } |
| |
| fast(slice) |
| } |
| |
| /// UTF-8 decode a single Unicode scalar value from the beginning of a slice. |
| /// |
| /// When successful, the corresponding Unicode scalar value is returned along |
| /// with the number of bytes it was encoded with. The number of bytes consumed |
| /// for a successful decode is always between 1 and 4, inclusive. |
| /// |
| /// When unsuccessful, `None` is returned along with the number of bytes that |
| /// make up a maximal prefix of a valid UTF-8 code unit sequence. In this case, |
| /// the number of bytes consumed is always between 0 and 3, inclusive, where |
| /// 0 is only returned when `slice` is empty. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::decode_utf8; |
| /// |
| /// // Decoding a valid codepoint. |
| /// let (ch, size) = decode_utf8(b"\xE2\x98\x83"); |
| /// assert_eq!(Some('☃'), ch); |
| /// assert_eq!(3, size); |
| /// |
| /// // Decoding an incomplete codepoint. |
| /// let (ch, size) = decode_utf8(b"\xE2\x98"); |
| /// assert_eq!(None, ch); |
| /// assert_eq!(2, size); |
| /// ``` |
| /// |
| /// This example shows how to iterate over all codepoints in UTF-8 encoded |
| /// bytes, while replacing invalid UTF-8 sequences with the replacement |
| /// codepoint: |
| /// |
| /// ``` |
| /// use bstr::{B, decode_utf8}; |
| /// |
| /// let mut bytes = B(b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61"); |
| /// let mut chars = vec![]; |
| /// while !bytes.is_empty() { |
| /// let (ch, size) = decode_utf8(bytes); |
| /// bytes = &bytes[size..]; |
| /// chars.push(ch.unwrap_or('\u{FFFD}')); |
| /// } |
| /// assert_eq!(vec!['☃', '\u{FFFD}', '𝞃', '\u{FFFD}', 'a'], chars); |
| /// ``` |
| #[inline] |
| pub fn decode<B: AsRef<[u8]>>(slice: B) -> (Option<char>, usize) { |
| let slice = slice.as_ref(); |
| match slice.get(0) { |
| None => return (None, 0), |
| Some(&b) if b <= 0x7F => return (Some(b as char), 1), |
| _ => {} |
| } |
| |
| let (mut state, mut cp, mut i) = (ACCEPT, 0, 0); |
| while i < slice.len() { |
| decode_step(&mut state, &mut cp, slice[i]); |
| i += 1; |
| |
| if state == ACCEPT { |
| // SAFETY: This is safe because `decode_step` guarantees that |
| // `cp` is a valid Unicode scalar value in an ACCEPT state. |
| let ch = unsafe { char::from_u32_unchecked(cp) }; |
| return (Some(ch), i); |
| } else if state == REJECT { |
| // At this point, we always want to advance at least one byte. |
| return (None, cmp::max(1, i.saturating_sub(1))); |
| } |
| } |
| (None, i) |
| } |
| |
| /// Lossily UTF-8 decode a single Unicode scalar value from the beginning of a |
| /// slice. |
| /// |
| /// When successful, the corresponding Unicode scalar value is returned along |
| /// with the number of bytes it was encoded with. The number of bytes consumed |
| /// for a successful decode is always between 1 and 4, inclusive. |
| /// |
| /// When unsuccessful, the Unicode replacement codepoint (`U+FFFD`) is returned |
| /// along with the number of bytes that make up a maximal prefix of a valid |
| /// UTF-8 code unit sequence. In this case, the number of bytes consumed is |
| /// always between 0 and 3, inclusive, where 0 is only returned when `slice` is |
| /// empty. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ```ignore |
| /// use bstr::decode_utf8_lossy; |
| /// |
| /// // Decoding a valid codepoint. |
| /// let (ch, size) = decode_utf8_lossy(b"\xE2\x98\x83"); |
| /// assert_eq!('☃', ch); |
| /// assert_eq!(3, size); |
| /// |
| /// // Decoding an incomplete codepoint. |
| /// let (ch, size) = decode_utf8_lossy(b"\xE2\x98"); |
| /// assert_eq!('\u{FFFD}', ch); |
| /// assert_eq!(2, size); |
| /// ``` |
| /// |
| /// This example shows how to iterate over all codepoints in UTF-8 encoded |
| /// bytes, while replacing invalid UTF-8 sequences with the replacement |
| /// codepoint: |
| /// |
| /// ```ignore |
| /// use bstr::{B, decode_utf8_lossy}; |
| /// |
| /// let mut bytes = B(b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61"); |
| /// let mut chars = vec![]; |
| /// while !bytes.is_empty() { |
| /// let (ch, size) = decode_utf8_lossy(bytes); |
| /// bytes = &bytes[size..]; |
| /// chars.push(ch); |
| /// } |
| /// assert_eq!(vec!['☃', '\u{FFFD}', '𝞃', '\u{FFFD}', 'a'], chars); |
| /// ``` |
| #[inline] |
| pub fn decode_lossy<B: AsRef<[u8]>>(slice: B) -> (char, usize) { |
| match decode(slice) { |
| (Some(ch), size) => (ch, size), |
| (None, size) => ('\u{FFFD}', size), |
| } |
| } |
| |
| /// UTF-8 decode a single Unicode scalar value from the end of a slice. |
| /// |
| /// When successful, the corresponding Unicode scalar value is returned along |
| /// with the number of bytes it was encoded with. The number of bytes consumed |
| /// for a successful decode is always between 1 and 4, inclusive. |
| /// |
| /// When unsuccessful, `None` is returned along with the number of bytes that |
| /// make up a maximal prefix of a valid UTF-8 code unit sequence. In this case, |
| /// the number of bytes consumed is always between 0 and 3, inclusive, where |
| /// 0 is only returned when `slice` is empty. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::decode_last_utf8; |
| /// |
| /// // Decoding a valid codepoint. |
| /// let (ch, size) = decode_last_utf8(b"\xE2\x98\x83"); |
| /// assert_eq!(Some('☃'), ch); |
| /// assert_eq!(3, size); |
| /// |
| /// // Decoding an incomplete codepoint. |
| /// let (ch, size) = decode_last_utf8(b"\xE2\x98"); |
| /// assert_eq!(None, ch); |
| /// assert_eq!(2, size); |
| /// ``` |
| /// |
| /// This example shows how to iterate over all codepoints in UTF-8 encoded |
| /// bytes in reverse, while replacing invalid UTF-8 sequences with the |
| /// replacement codepoint: |
| /// |
| /// ``` |
| /// use bstr::{B, decode_last_utf8}; |
| /// |
| /// let mut bytes = B(b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61"); |
| /// let mut chars = vec![]; |
| /// while !bytes.is_empty() { |
| /// let (ch, size) = decode_last_utf8(bytes); |
| /// bytes = &bytes[..bytes.len()-size]; |
| /// chars.push(ch.unwrap_or('\u{FFFD}')); |
| /// } |
| /// assert_eq!(vec!['a', '\u{FFFD}', '𝞃', '\u{FFFD}', '☃'], chars); |
| /// ``` |
| #[inline] |
| pub fn decode_last<B: AsRef<[u8]>>(slice: B) -> (Option<char>, usize) { |
| // TODO: We could implement this by reversing the UTF-8 automaton, but for |
| // now, we do it the slow way by using the forward automaton. |
| |
| let slice = slice.as_ref(); |
| if slice.is_empty() { |
| return (None, 0); |
| } |
| let mut start = slice.len() - 1; |
| let limit = slice.len().saturating_sub(4); |
| while start > limit && !is_leading_or_invalid_utf8_byte(slice[start]) { |
| start -= 1; |
| } |
| let (ch, size) = decode(&slice[start..]); |
| // If we didn't consume all of the bytes, then that means there's at least |
| // one stray byte that never occurs in a valid code unit prefix, so we can |
| // advance by one byte. |
| if start + size != slice.len() { |
| (None, 1) |
| } else { |
| (ch, size) |
| } |
| } |
| |
| /// Lossily UTF-8 decode a single Unicode scalar value from the end of a slice. |
| /// |
| /// When successful, the corresponding Unicode scalar value is returned along |
| /// with the number of bytes it was encoded with. The number of bytes consumed |
| /// for a successful decode is always between 1 and 4, inclusive. |
| /// |
| /// When unsuccessful, the Unicode replacement codepoint (`U+FFFD`) is returned |
| /// along with the number of bytes that make up a maximal prefix of a valid |
| /// UTF-8 code unit sequence. In this case, the number of bytes consumed is |
| /// always between 0 and 3, inclusive, where 0 is only returned when `slice` is |
| /// empty. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ```ignore |
| /// use bstr::decode_last_utf8_lossy; |
| /// |
| /// // Decoding a valid codepoint. |
| /// let (ch, size) = decode_last_utf8_lossy(b"\xE2\x98\x83"); |
| /// assert_eq!('☃', ch); |
| /// assert_eq!(3, size); |
| /// |
| /// // Decoding an incomplete codepoint. |
| /// let (ch, size) = decode_last_utf8_lossy(b"\xE2\x98"); |
| /// assert_eq!('\u{FFFD}', ch); |
| /// assert_eq!(2, size); |
| /// ``` |
| /// |
| /// This example shows how to iterate over all codepoints in UTF-8 encoded |
| /// bytes in reverse, while replacing invalid UTF-8 sequences with the |
| /// replacement codepoint: |
| /// |
| /// ```ignore |
| /// use bstr::decode_last_utf8_lossy; |
| /// |
| /// let mut bytes = B(b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61"); |
| /// let mut chars = vec![]; |
| /// while !bytes.is_empty() { |
| /// let (ch, size) = decode_last_utf8_lossy(bytes); |
| /// bytes = &bytes[..bytes.len()-size]; |
| /// chars.push(ch); |
| /// } |
| /// assert_eq!(vec!['a', '\u{FFFD}', '𝞃', '\u{FFFD}', '☃'], chars); |
| /// ``` |
| #[inline] |
| pub fn decode_last_lossy<B: AsRef<[u8]>>(slice: B) -> (char, usize) { |
| match decode_last(slice) { |
| (Some(ch), size) => (ch, size), |
| (None, size) => ('\u{FFFD}', size), |
| } |
| } |
| |
| /// SAFETY: The decode function relies on state being equal to ACCEPT only if |
| /// cp is a valid Unicode scalar value. |
| #[inline] |
| pub fn decode_step(state: &mut usize, cp: &mut u32, b: u8) { |
| let class = CLASSES[b as usize]; |
| if *state == ACCEPT { |
| *cp = (0xFF >> class) & (b as u32); |
| } else { |
| *cp = (b as u32 & 0b111111) | (*cp << 6); |
| } |
| *state = STATES_FORWARD[*state + class as usize] as usize; |
| } |
| |
| /// Returns true if and only if the given byte is either a valid leading UTF-8 |
| /// byte, or is otherwise an invalid byte that can never appear anywhere in a |
| /// valid UTF-8 sequence. |
| fn is_leading_or_invalid_utf8_byte(b: u8) -> bool { |
| // In the ASCII case, the most significant bit is never set. The leading |
| // byte of a 2/3/4-byte sequence always has the top two most significant |
| // bits set. For bytes that can never appear anywhere in valid UTF-8, this |
| // also returns true, since every such byte has its two most significant |
| // bits set: |
| // |
| // \xC0 :: 11000000 |
| // \xC1 :: 11000001 |
| // \xF5 :: 11110101 |
| // \xF6 :: 11110110 |
| // \xF7 :: 11110111 |
| // \xF8 :: 11111000 |
| // \xF9 :: 11111001 |
| // \xFA :: 11111010 |
| // \xFB :: 11111011 |
| // \xFC :: 11111100 |
| // \xFD :: 11111101 |
| // \xFE :: 11111110 |
| // \xFF :: 11111111 |
| (b & 0b1100_0000) != 0b1000_0000 |
| } |
| |
| #[cfg(all(test, feature = "std"))] |
| mod tests { |
| use core::char; |
| |
| use alloc::{string::String, vec, vec::Vec}; |
| |
| use crate::{ |
| ext_slice::{ByteSlice, B}, |
| tests::LOSSY_TESTS, |
| utf8::{self, Utf8Error}, |
| }; |
| |
| fn utf8e(valid_up_to: usize) -> Utf8Error { |
| Utf8Error { valid_up_to, error_len: None } |
| } |
| |
| fn utf8e2(valid_up_to: usize, error_len: usize) -> Utf8Error { |
| Utf8Error { valid_up_to, error_len: Some(error_len) } |
| } |
| |
| #[test] |
| #[cfg(not(miri))] |
| fn validate_all_codepoints() { |
| for i in 0..(0x10FFFF + 1) { |
| let cp = match char::from_u32(i) { |
| None => continue, |
| Some(cp) => cp, |
| }; |
| let mut buf = [0; 4]; |
| let s = cp.encode_utf8(&mut buf); |
| assert_eq!(Ok(()), utf8::validate(s.as_bytes())); |
| } |
| } |
| |
| #[test] |
| fn validate_multiple_codepoints() { |
| assert_eq!(Ok(()), utf8::validate(b"abc")); |
| assert_eq!(Ok(()), utf8::validate(b"a\xE2\x98\x83a")); |
| assert_eq!(Ok(()), utf8::validate(b"a\xF0\x9D\x9C\xB7a")); |
| assert_eq!(Ok(()), utf8::validate(b"\xE2\x98\x83\xF0\x9D\x9C\xB7",)); |
| assert_eq!( |
| Ok(()), |
| utf8::validate(b"a\xE2\x98\x83a\xF0\x9D\x9C\xB7a",) |
| ); |
| assert_eq!( |
| Ok(()), |
| utf8::validate(b"\xEF\xBF\xBD\xE2\x98\x83\xEF\xBF\xBD",) |
| ); |
| } |
| |
| #[test] |
| fn validate_errors() { |
| // single invalid byte |
| assert_eq!(Err(utf8e2(0, 1)), utf8::validate(b"\xFF")); |
| // single invalid byte after ASCII |
| assert_eq!(Err(utf8e2(1, 1)), utf8::validate(b"a\xFF")); |
| // single invalid byte after 2 byte sequence |
| assert_eq!(Err(utf8e2(2, 1)), utf8::validate(b"\xCE\xB2\xFF")); |
| // single invalid byte after 3 byte sequence |
| assert_eq!(Err(utf8e2(3, 1)), utf8::validate(b"\xE2\x98\x83\xFF")); |
| // single invalid byte after 4 byte sequence |
| assert_eq!(Err(utf8e2(4, 1)), utf8::validate(b"\xF0\x9D\x9D\xB1\xFF")); |
| |
| // An invalid 2-byte sequence with a valid 1-byte prefix. |
| assert_eq!(Err(utf8e2(0, 1)), utf8::validate(b"\xCE\xF0")); |
| // An invalid 3-byte sequence with a valid 2-byte prefix. |
| assert_eq!(Err(utf8e2(0, 2)), utf8::validate(b"\xE2\x98\xF0")); |
| // An invalid 4-byte sequence with a valid 3-byte prefix. |
| assert_eq!(Err(utf8e2(0, 3)), utf8::validate(b"\xF0\x9D\x9D\xF0")); |
| |
| // An overlong sequence. Should be \xE2\x82\xAC, but we encode the |
| // same codepoint value in 4 bytes. This not only tests that we reject |
| // overlong sequences, but that we get valid_up_to correct. |
| assert_eq!(Err(utf8e2(0, 1)), utf8::validate(b"\xF0\x82\x82\xAC")); |
| assert_eq!(Err(utf8e2(1, 1)), utf8::validate(b"a\xF0\x82\x82\xAC")); |
| assert_eq!( |
| Err(utf8e2(3, 1)), |
| utf8::validate(b"\xE2\x98\x83\xF0\x82\x82\xAC",) |
| ); |
| |
| // Check that encoding a surrogate codepoint using the UTF-8 scheme |
| // fails validation. |
| assert_eq!(Err(utf8e2(0, 1)), utf8::validate(b"\xED\xA0\x80")); |
| assert_eq!(Err(utf8e2(1, 1)), utf8::validate(b"a\xED\xA0\x80")); |
| assert_eq!( |
| Err(utf8e2(3, 1)), |
| utf8::validate(b"\xE2\x98\x83\xED\xA0\x80",) |
| ); |
| |
| // Check that an incomplete 2-byte sequence fails. |
| assert_eq!(Err(utf8e2(0, 1)), utf8::validate(b"\xCEa")); |
| assert_eq!(Err(utf8e2(1, 1)), utf8::validate(b"a\xCEa")); |
| assert_eq!( |
| Err(utf8e2(3, 1)), |
| utf8::validate(b"\xE2\x98\x83\xCE\xE2\x98\x83",) |
| ); |
| // Check that an incomplete 3-byte sequence fails. |
| assert_eq!(Err(utf8e2(0, 2)), utf8::validate(b"\xE2\x98a")); |
| assert_eq!(Err(utf8e2(1, 2)), utf8::validate(b"a\xE2\x98a")); |
| assert_eq!( |
| Err(utf8e2(3, 2)), |
| utf8::validate(b"\xE2\x98\x83\xE2\x98\xE2\x98\x83",) |
| ); |
| // Check that an incomplete 4-byte sequence fails. |
| assert_eq!(Err(utf8e2(0, 3)), utf8::validate(b"\xF0\x9D\x9Ca")); |
| assert_eq!(Err(utf8e2(1, 3)), utf8::validate(b"a\xF0\x9D\x9Ca")); |
| assert_eq!( |
| Err(utf8e2(4, 3)), |
| utf8::validate(b"\xF0\x9D\x9C\xB1\xF0\x9D\x9C\xE2\x98\x83",) |
| ); |
| assert_eq!( |
| Err(utf8e2(6, 3)), |
| utf8::validate(b"foobar\xF1\x80\x80quux",) |
| ); |
| |
| // Check that an incomplete (EOF) 2-byte sequence fails. |
| assert_eq!(Err(utf8e(0)), utf8::validate(b"\xCE")); |
| assert_eq!(Err(utf8e(1)), utf8::validate(b"a\xCE")); |
| assert_eq!(Err(utf8e(3)), utf8::validate(b"\xE2\x98\x83\xCE")); |
| // Check that an incomplete (EOF) 3-byte sequence fails. |
| assert_eq!(Err(utf8e(0)), utf8::validate(b"\xE2\x98")); |
| assert_eq!(Err(utf8e(1)), utf8::validate(b"a\xE2\x98")); |
| assert_eq!(Err(utf8e(3)), utf8::validate(b"\xE2\x98\x83\xE2\x98")); |
| // Check that an incomplete (EOF) 4-byte sequence fails. |
| assert_eq!(Err(utf8e(0)), utf8::validate(b"\xF0\x9D\x9C")); |
| assert_eq!(Err(utf8e(1)), utf8::validate(b"a\xF0\x9D\x9C")); |
| assert_eq!( |
| Err(utf8e(4)), |
| utf8::validate(b"\xF0\x9D\x9C\xB1\xF0\x9D\x9C",) |
| ); |
| |
| // Test that we errors correct even after long valid sequences. This |
| // checks that our "backup" logic for detecting errors is correct. |
| assert_eq!( |
| Err(utf8e2(8, 1)), |
| utf8::validate(b"\xe2\x98\x83\xce\xb2\xe3\x83\x84\xFF",) |
| ); |
| } |
| |
| #[test] |
| fn decode_valid() { |
| fn d(mut s: &str) -> Vec<char> { |
| let mut chars = vec![]; |
| while !s.is_empty() { |
| let (ch, size) = utf8::decode(s.as_bytes()); |
| s = &s[size..]; |
| chars.push(ch.unwrap()); |
| } |
| chars |
| } |
| |
| assert_eq!(vec!['☃'], d("☃")); |
| assert_eq!(vec!['☃', '☃'], d("☃☃")); |
| assert_eq!(vec!['α', 'β', 'γ', 'δ', 'ε'], d("αβγδε")); |
| assert_eq!(vec!['☃', '⛄', '⛇'], d("☃⛄⛇")); |
| assert_eq!(vec!['𝗮', '𝗯', '𝗰', '𝗱', '𝗲'], d("𝗮𝗯𝗰𝗱𝗲")); |
| } |
| |
| #[test] |
| fn decode_invalid() { |
| let (ch, size) = utf8::decode(b""); |
| assert_eq!(None, ch); |
| assert_eq!(0, size); |
| |
| let (ch, size) = utf8::decode(b"\xFF"); |
| assert_eq!(None, ch); |
| assert_eq!(1, size); |
| |
| let (ch, size) = utf8::decode(b"\xCE\xF0"); |
| assert_eq!(None, ch); |
| assert_eq!(1, size); |
| |
| let (ch, size) = utf8::decode(b"\xE2\x98\xF0"); |
| assert_eq!(None, ch); |
| assert_eq!(2, size); |
| |
| let (ch, size) = utf8::decode(b"\xF0\x9D\x9D"); |
| assert_eq!(None, ch); |
| assert_eq!(3, size); |
| |
| let (ch, size) = utf8::decode(b"\xF0\x9D\x9D\xF0"); |
| assert_eq!(None, ch); |
| assert_eq!(3, size); |
| |
| let (ch, size) = utf8::decode(b"\xF0\x82\x82\xAC"); |
| assert_eq!(None, ch); |
| assert_eq!(1, size); |
| |
| let (ch, size) = utf8::decode(b"\xED\xA0\x80"); |
| assert_eq!(None, ch); |
| assert_eq!(1, size); |
| |
| let (ch, size) = utf8::decode(b"\xCEa"); |
| assert_eq!(None, ch); |
| assert_eq!(1, size); |
| |
| let (ch, size) = utf8::decode(b"\xE2\x98a"); |
| assert_eq!(None, ch); |
| assert_eq!(2, size); |
| |
| let (ch, size) = utf8::decode(b"\xF0\x9D\x9Ca"); |
| assert_eq!(None, ch); |
| assert_eq!(3, size); |
| } |
| |
| #[test] |
| fn decode_lossy() { |
| let (ch, size) = utf8::decode_lossy(b""); |
| assert_eq!('\u{FFFD}', ch); |
| assert_eq!(0, size); |
| |
| let (ch, size) = utf8::decode_lossy(b"\xFF"); |
| assert_eq!('\u{FFFD}', ch); |
| assert_eq!(1, size); |
| |
| let (ch, size) = utf8::decode_lossy(b"\xCE\xF0"); |
| assert_eq!('\u{FFFD}', ch); |
| assert_eq!(1, size); |
| |
| let (ch, size) = utf8::decode_lossy(b"\xE2\x98\xF0"); |
| assert_eq!('\u{FFFD}', ch); |
| assert_eq!(2, size); |
| |
| let (ch, size) = utf8::decode_lossy(b"\xF0\x9D\x9D\xF0"); |
| assert_eq!('\u{FFFD}', ch); |
| assert_eq!(3, size); |
| |
| let (ch, size) = utf8::decode_lossy(b"\xF0\x82\x82\xAC"); |
| assert_eq!('\u{FFFD}', ch); |
| assert_eq!(1, size); |
| |
| let (ch, size) = utf8::decode_lossy(b"\xED\xA0\x80"); |
| assert_eq!('\u{FFFD}', ch); |
| assert_eq!(1, size); |
| |
| let (ch, size) = utf8::decode_lossy(b"\xCEa"); |
| assert_eq!('\u{FFFD}', ch); |
| assert_eq!(1, size); |
| |
| let (ch, size) = utf8::decode_lossy(b"\xE2\x98a"); |
| assert_eq!('\u{FFFD}', ch); |
| assert_eq!(2, size); |
| |
| let (ch, size) = utf8::decode_lossy(b"\xF0\x9D\x9Ca"); |
| assert_eq!('\u{FFFD}', ch); |
| assert_eq!(3, size); |
| } |
| |
| #[test] |
| fn decode_last_valid() { |
| fn d(mut s: &str) -> Vec<char> { |
| let mut chars = vec![]; |
| while !s.is_empty() { |
| let (ch, size) = utf8::decode_last(s.as_bytes()); |
| s = &s[..s.len() - size]; |
| chars.push(ch.unwrap()); |
| } |
| chars |
| } |
| |
| assert_eq!(vec!['☃'], d("☃")); |
| assert_eq!(vec!['☃', '☃'], d("☃☃")); |
| assert_eq!(vec!['ε', 'δ', 'γ', 'β', 'α'], d("αβγδε")); |
| assert_eq!(vec!['⛇', '⛄', '☃'], d("☃⛄⛇")); |
| assert_eq!(vec!['𝗲', '𝗱', '𝗰', '𝗯', '𝗮'], d("𝗮𝗯𝗰𝗱𝗲")); |
| } |
| |
| #[test] |
| fn decode_last_invalid() { |
| let (ch, size) = utf8::decode_last(b""); |
| assert_eq!(None, ch); |
| assert_eq!(0, size); |
| |
| let (ch, size) = utf8::decode_last(b"\xFF"); |
| assert_eq!(None, ch); |
| assert_eq!(1, size); |
| |
| let (ch, size) = utf8::decode_last(b"\xCE\xF0"); |
| assert_eq!(None, ch); |
| assert_eq!(1, size); |
| |
| let (ch, size) = utf8::decode_last(b"\xCE"); |
| assert_eq!(None, ch); |
| assert_eq!(1, size); |
| |
| let (ch, size) = utf8::decode_last(b"\xE2\x98\xF0"); |
| assert_eq!(None, ch); |
| assert_eq!(1, size); |
| |
| let (ch, size) = utf8::decode_last(b"\xE2\x98"); |
| assert_eq!(None, ch); |
| assert_eq!(2, size); |
| |
| let (ch, size) = utf8::decode_last(b"\xF0\x9D\x9D\xF0"); |
| assert_eq!(None, ch); |
| assert_eq!(1, size); |
| |
| let (ch, size) = utf8::decode_last(b"\xF0\x9D\x9D"); |
| assert_eq!(None, ch); |
| assert_eq!(3, size); |
| |
| let (ch, size) = utf8::decode_last(b"\xF0\x82\x82\xAC"); |
| assert_eq!(None, ch); |
| assert_eq!(1, size); |
| |
| let (ch, size) = utf8::decode_last(b"\xED\xA0\x80"); |
| assert_eq!(None, ch); |
| assert_eq!(1, size); |
| |
| let (ch, size) = utf8::decode_last(b"\xED\xA0"); |
| assert_eq!(None, ch); |
| assert_eq!(1, size); |
| |
| let (ch, size) = utf8::decode_last(b"\xED"); |
| assert_eq!(None, ch); |
| assert_eq!(1, size); |
| |
| let (ch, size) = utf8::decode_last(b"a\xCE"); |
| assert_eq!(None, ch); |
| assert_eq!(1, size); |
| |
| let (ch, size) = utf8::decode_last(b"a\xE2\x98"); |
| assert_eq!(None, ch); |
| assert_eq!(2, size); |
| |
| let (ch, size) = utf8::decode_last(b"a\xF0\x9D\x9C"); |
| assert_eq!(None, ch); |
| assert_eq!(3, size); |
| } |
| |
| #[test] |
| fn decode_last_lossy() { |
| let (ch, size) = utf8::decode_last_lossy(b""); |
| assert_eq!('\u{FFFD}', ch); |
| assert_eq!(0, size); |
| |
| let (ch, size) = utf8::decode_last_lossy(b"\xFF"); |
| assert_eq!('\u{FFFD}', ch); |
| assert_eq!(1, size); |
| |
| let (ch, size) = utf8::decode_last_lossy(b"\xCE\xF0"); |
| assert_eq!('\u{FFFD}', ch); |
| assert_eq!(1, size); |
| |
| let (ch, size) = utf8::decode_last_lossy(b"\xCE"); |
| assert_eq!('\u{FFFD}', ch); |
| assert_eq!(1, size); |
| |
| let (ch, size) = utf8::decode_last_lossy(b"\xE2\x98\xF0"); |
| assert_eq!('\u{FFFD}', ch); |
| assert_eq!(1, size); |
| |
| let (ch, size) = utf8::decode_last_lossy(b"\xE2\x98"); |
| assert_eq!('\u{FFFD}', ch); |
| assert_eq!(2, size); |
| |
| let (ch, size) = utf8::decode_last_lossy(b"\xF0\x9D\x9D\xF0"); |
| assert_eq!('\u{FFFD}', ch); |
| assert_eq!(1, size); |
| |
| let (ch, size) = utf8::decode_last_lossy(b"\xF0\x9D\x9D"); |
| assert_eq!('\u{FFFD}', ch); |
| assert_eq!(3, size); |
| |
| let (ch, size) = utf8::decode_last_lossy(b"\xF0\x82\x82\xAC"); |
| assert_eq!('\u{FFFD}', ch); |
| assert_eq!(1, size); |
| |
| let (ch, size) = utf8::decode_last_lossy(b"\xED\xA0\x80"); |
| assert_eq!('\u{FFFD}', ch); |
| assert_eq!(1, size); |
| |
| let (ch, size) = utf8::decode_last_lossy(b"\xED\xA0"); |
| assert_eq!('\u{FFFD}', ch); |
| assert_eq!(1, size); |
| |
| let (ch, size) = utf8::decode_last_lossy(b"\xED"); |
| assert_eq!('\u{FFFD}', ch); |
| assert_eq!(1, size); |
| |
| let (ch, size) = utf8::decode_last_lossy(b"a\xCE"); |
| assert_eq!('\u{FFFD}', ch); |
| assert_eq!(1, size); |
| |
| let (ch, size) = utf8::decode_last_lossy(b"a\xE2\x98"); |
| assert_eq!('\u{FFFD}', ch); |
| assert_eq!(2, size); |
| |
| let (ch, size) = utf8::decode_last_lossy(b"a\xF0\x9D\x9C"); |
| assert_eq!('\u{FFFD}', ch); |
| assert_eq!(3, size); |
| } |
| |
| #[test] |
| fn chars() { |
| for (i, &(expected, input)) in LOSSY_TESTS.iter().enumerate() { |
| let got: String = B(input).chars().collect(); |
| assert_eq!( |
| expected, got, |
| "chars(ith: {:?}, given: {:?})", |
| i, input, |
| ); |
| let got: String = |
| B(input).char_indices().map(|(_, _, ch)| ch).collect(); |
| assert_eq!( |
| expected, got, |
| "char_indices(ith: {:?}, given: {:?})", |
| i, input, |
| ); |
| |
| let expected: String = expected.chars().rev().collect(); |
| |
| let got: String = B(input).chars().rev().collect(); |
| assert_eq!( |
| expected, got, |
| "chars.rev(ith: {:?}, given: {:?})", |
| i, input, |
| ); |
| let got: String = |
| B(input).char_indices().rev().map(|(_, _, ch)| ch).collect(); |
| assert_eq!( |
| expected, got, |
| "char_indices.rev(ith: {:?}, given: {:?})", |
| i, input, |
| ); |
| } |
| } |
| |
| #[test] |
| fn utf8_chunks() { |
| let mut c = utf8::Utf8Chunks { bytes: b"123\xC0" }; |
| assert_eq!( |
| (c.next(), c.next()), |
| ( |
| Some(utf8::Utf8Chunk { |
| valid: "123", |
| invalid: b"\xC0".as_bstr(), |
| incomplete: false, |
| }), |
| None, |
| ) |
| ); |
| |
| let mut c = utf8::Utf8Chunks { bytes: b"123\xFF\xFF" }; |
| assert_eq!( |
| (c.next(), c.next(), c.next()), |
| ( |
| Some(utf8::Utf8Chunk { |
| valid: "123", |
| invalid: b"\xFF".as_bstr(), |
| incomplete: false, |
| }), |
| Some(utf8::Utf8Chunk { |
| valid: "", |
| invalid: b"\xFF".as_bstr(), |
| incomplete: false, |
| }), |
| None, |
| ) |
| ); |
| |
| let mut c = utf8::Utf8Chunks { bytes: b"123\xD0" }; |
| assert_eq!( |
| (c.next(), c.next()), |
| ( |
| Some(utf8::Utf8Chunk { |
| valid: "123", |
| invalid: b"\xD0".as_bstr(), |
| incomplete: true, |
| }), |
| None, |
| ) |
| ); |
| |
| let mut c = utf8::Utf8Chunks { bytes: b"123\xD0456" }; |
| assert_eq!( |
| (c.next(), c.next(), c.next()), |
| ( |
| Some(utf8::Utf8Chunk { |
| valid: "123", |
| invalid: b"\xD0".as_bstr(), |
| incomplete: false, |
| }), |
| Some(utf8::Utf8Chunk { |
| valid: "456", |
| invalid: b"".as_bstr(), |
| incomplete: false, |
| }), |
| None, |
| ) |
| ); |
| |
| let mut c = utf8::Utf8Chunks { bytes: b"123\xE2\x98" }; |
| assert_eq!( |
| (c.next(), c.next()), |
| ( |
| Some(utf8::Utf8Chunk { |
| valid: "123", |
| invalid: b"\xE2\x98".as_bstr(), |
| incomplete: true, |
| }), |
| None, |
| ) |
| ); |
| |
| let mut c = utf8::Utf8Chunks { bytes: b"123\xF4\x8F\xBF" }; |
| assert_eq!( |
| (c.next(), c.next()), |
| ( |
| Some(utf8::Utf8Chunk { |
| valid: "123", |
| invalid: b"\xF4\x8F\xBF".as_bstr(), |
| incomplete: true, |
| }), |
| None, |
| ) |
| ); |
| } |
| } |