vendor/futf-0.1.5/src/lib.rs - toolchain/rustc - Git at Google

 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
 // option. This file may not be copied, modified, or distributed
 // except according to those terms.

 #![cfg_attr(test, feature(test))]

 #[macro_use]
 extern crate debug_unreachable;

 #[macro_use]
 extern crate mac;

 #[cfg(test)]
 extern crate test as std_test;

 use std::{slice, char};

 /// Meaning of a complete or partial UTF-8 codepoint.
 ///
 /// Not all checking is performed eagerly. That is, a codepoint `Prefix` or
 /// `Suffix` may in reality have no valid completion.
 ///
 /// The comparison traits are derived, so values order by variant (in the
 /// order declared below) before they order by payload.
 #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Hash)]
 pub enum Meaning {
     /// We found a whole codepoint.
     ///
     /// This covers both single ASCII bytes and multi-byte sequences that
     /// decode to a valid `char`.
     Whole(char),

     /// We found something that isn't a valid Unicode codepoint, but
     /// it *would* correspond to a UTF-16 leading surrogate code unit,
     /// i.e. a value in the range `U+D800` - `U+DBFF`.
     ///
     /// The argument is the code unit's 10-bit index within that range,
     /// i.e. the decoded value minus `0xD800`.
     ///
     /// These are found in UTF-8 variants such as CESU-8 and WTF-8.
     LeadSurrogate(u16),

     /// We found something that isn't a valid Unicode codepoint, but
     /// it *would* correspond to a UTF-16 trailing surrogate code unit,
     /// i.e. a value in the range `U+DC00` - `U+DFFF`.
     ///
     /// The argument is the code unit's 10-bit index within that range,
     /// i.e. the decoded value minus `0xDC00`.
     ///
     /// These are found in UTF-8 variants such as CESU-8 and WTF-8.
     TrailSurrogate(u16),

     /// We found only a prefix of a codepoint before the buffer ended.
     ///
     /// Includes the number of additional bytes needed: the length announced
     /// by the lead byte minus the bytes available from the lead byte to the
     /// end of the buffer.
     Prefix(usize),

     /// We found only a suffix of a codepoint before running off the
     /// start of the buffer.
     ///
     /// Up to 3 more bytes may be needed.
     Suffix,
 }

 /// Represents a complete or partial UTF-8 codepoint.
 ///
 /// This is the value produced by `classify`; it borrows from the buffer that
 /// was classified.
 #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Hash)]
 pub struct Codepoint<'a> {
     /// The bytes that make up the partial or full codepoint.
     ///
     /// For a `Prefix` this runs from the lead byte to the end of the buffer.
     ///
     /// For a `Suffix` this depends on `idx`: it is the buffer from its start
     /// up to and including `idx`. We don't scan forward for additional
     /// continuation bytes after the reverse scan failed to locate a
     /// multibyte sequence start.
     pub bytes: &'a [u8],

     /// Start of the codepoint in the buffer, expressed as an offset
     /// back from `idx`.
     ///
     /// Zero when `idx` already points at the first byte of the codepoint.
     pub rewind: usize,

     /// Meaning of the partial or full codepoint.
     pub meaning: Meaning,
 }

 /// Structural role of a single byte inside a UTF-8 stream.
 #[derive(Debug, PartialEq, Eq)]
 enum Byte {
     /// A one-byte codepoint (`0x00` - `0x7F`).
     Ascii,
     /// Lead byte of a multi-byte sequence; the payload is the total length
     /// of the sequence in bytes (2, 3 or 4).
     Start(usize),
     /// A continuation byte (`0x80` - `0xBF`).
     Cont,
 }

 impl Byte {
     /// Classifies `x` purely by its high bits.
     ///
     /// Returns `None` for `0xF8` - `0xFF`, which fit none of the recognised
     /// bit patterns. No overlong or range checking happens here, so e.g.
     /// `0xC0` is still reported as `Start(2)`.
     #[inline(always)]
     fn classify(x: u8) -> Option<Byte> {
         // The recognised classes occupy consecutive ranges of byte values,
         // so walking the upper bounds in ascending order is equivalent to
         // testing the individual bit patterns.
         if x < 0x80 {
             Some(Byte::Ascii) // 0xxx_xxxx
         } else if x < 0xC0 {
             Some(Byte::Cont) // 10xx_xxxx
         } else if x < 0xE0 {
             Some(Byte::Start(2)) // 110x_xxxx
         } else if x < 0xF0 {
             Some(Byte::Start(3)) // 1110_xxxx
         } else if x < 0xF8 {
             Some(Byte::Start(4)) // 1111_0xxx
         } else {
             None // 1111_1xxx
         }
     }
 }

 /// Reports whether every byte of `buf` is a UTF-8 continuation byte.
 ///
 /// An empty slice yields `true`.
 #[inline(always)]
 fn all_cont(buf: &[u8]) -> bool {
     for &byte in buf {
         if !matches!(Byte::classify(byte), Some(Byte::Cont)) {
             // Stop at the first byte that is not a continuation byte.
             return false;
         }
     }
     true
 }

 /// Decodes one multi-byte UTF-8 sequence into its `Meaning`.
 ///
 /// Returns `None` for overlong encodings and for values that
 /// `char::from_u32` rejects. Three-byte encodings of surrogate code units
 /// are not rejected; they are reported as `LeadSurrogate` / `TrailSurrogate`.
 ///
 /// NOTE: Assumes the buffer is a syntactically valid multi-byte UTF-8 sequence:
 /// a starting byte followed by the correct number of continuation bytes.
 /// The marker bits are only masked off here, never re-validated.
 ///
 /// # Safety
 ///
 /// `buf.len()` must be 2, 3 or 4. This is checked only by debug assertions;
 /// any other length reaches `debug_unreachable!()`, which must never be
 /// executed.
 #[inline(always)]
 unsafe fn decode(buf: &[u8]) -> Option<Meaning> {
     debug_assert!(buf.len() >= 2);
     debug_assert!(buf.len() <= 4);
     let n;
     // Every `get_unchecked` index below is smaller than the length matched
     // by the enclosing arm, so the reads stay in bounds.
     match buf.len() {
         2 => {
             n = ((*buf.get_unchecked(0) & 0b11111) as u32) << 6
                 | ((*buf.get_unchecked(1) & 0x3F) as u32);
             if n < 0x80 { return None }  // Overlong
         }
         3 => {
             n = ((*buf.get_unchecked(0) & 0b1111) as u32) << 12
                 | ((*buf.get_unchecked(1) & 0x3F) as u32) << 6
                 | ((*buf.get_unchecked(2) & 0x3F) as u32);
             // `..=` replaces the deprecated `...` range-pattern syntax.
             match n {
                 0x0000 ..= 0x07FF => return None,  // Overlong
                 0xD800 ..= 0xDBFF => return Some(Meaning::LeadSurrogate(n as u16 - 0xD800)),
                 0xDC00 ..= 0xDFFF => return Some(Meaning::TrailSurrogate(n as u16 - 0xDC00)),
                 _ => {}
             }
         }
         4 => {
             n = ((*buf.get_unchecked(0) & 0b111) as u32) << 18
                 | ((*buf.get_unchecked(1) & 0x3F) as u32) << 12
                 | ((*buf.get_unchecked(2) & 0x3F) as u32) << 6
                 | ((*buf.get_unchecked(3) & 0x3F) as u32);
             if n < 0x1_0000 { return None }  // Overlong
         }
         _ => debug_unreachable!(),
     }

     // `from_u32` rejects values above U+10FFFF (possible with lead bytes
     // 0xF4 - 0xF7), turning them into `None`.
     char::from_u32(n).map(Meaning::Whole)
 }

 /// Returns the `new_len` bytes of `buf` that begin at offset `start`,
 /// skipping the bounds checks of ordinary slicing in release builds.
 ///
 /// # Safety
 ///
 /// The caller must guarantee `start <= buf.len()` and
 /// `new_len <= buf.len() - start`; both conditions are verified only by
 /// debug assertions.
 #[inline(always)]
 unsafe fn unsafe_slice<'a>(buf: &'a [u8], start: usize, new_len: usize) -> &'a [u8] {
     debug_assert!(start <= buf.len());
     debug_assert!(new_len <= (buf.len() - start));
     // Under the contract above the resulting range lies inside `buf`, so
     // the new slice may share `buf`'s lifetime.
     let first = buf.as_ptr().add(start);
     slice::from_raw_parts(first, new_len)
 }

 // `otry!(expr)` evaluates to the value inside an `Option`; it is used in this
 // file where a `None` has to abort the enclosing function.
 //
 // It simply forwards to `unwrap_or_return!` with `None` as the second
 // argument. That macro is not defined in this file (presumably it comes from
 // the `mac` crate imported with `#[macro_use]` -- confirm), so the early
 // return on `None` is inferred from its name and from how `classify` uses it.
 macro_rules! otry {
     ($x:expr) => { unwrap_or_return!($x, None) }
 }

 /// Describes the UTF-8 codepoint containing the byte at index `idx` within
 /// `buf`.
 ///
 /// Returns `None` if `idx` is out of range, or if `buf` contains invalid UTF-8
 /// in the vicinity of `idx`.
 ///
 /// On success the result is one of:
 ///
 /// * a whole codepoint or surrogate, with `bytes` covering the full
 ///   sequence and `rewind` giving the distance from `idx` back to its
 ///   first byte;
 /// * `Meaning::Prefix(k)` when the sequence is cut off by the end of `buf`
 ///   and `k` more bytes are needed;
 /// * `Meaning::Suffix` when `idx` and everything before it (at most two
 ///   bytes) are continuation bytes, so the lead byte lies before the buffer.
 ///
 /// A continuation byte that is not covered by the nearest preceding lead
 /// byte (for example the last byte of `b"\xC3\x80\x80"`) is invalid UTF-8
 /// and yields `None`.
 #[inline]
 pub fn classify<'a>(buf: &'a [u8], idx: usize) -> Option<Codepoint<'a>> {
     if idx >= buf.len() {
         return None;
     }

     // SAFETY: `idx < buf.len()` was checked above. `start` begins at `idx`
     // and only decreases, guarded against underflow by the `start == 0`
     // test. Every `unsafe_slice` call either requests at most `avail` bytes
     // from its offset, or sub-slices `bytes` (length `n`) within `0..n`.
     // `decode` only ever receives exactly `n` bytes with `n` in 2..=4.
     unsafe {
         let x = *buf.get_unchecked(idx);
         match otry!(Byte::classify(x)) {
             Byte::Ascii => Some(Codepoint {
                 bytes: unsafe_slice(buf, idx, 1),
                 rewind: 0,
                 meaning: Meaning::Whole(x as char),
             }),
             Byte::Start(n) => {
                 let avail = buf.len() - idx;
                 if avail >= n {
                     let bytes = unsafe_slice(buf, idx, n);
                     if !all_cont(unsafe_slice(bytes, 1, n-1)) {
                         return None;
                     }
                     let meaning = otry!(decode(bytes));
                     Some(Codepoint {
                         bytes: bytes,
                         rewind: 0,
                         meaning: meaning,
                     })
                 } else {
                     // The buffer ends before the sequence does.
                     Some(Codepoint {
                         bytes: unsafe_slice(buf, idx, avail),
                         rewind: 0,
                         meaning: Meaning::Prefix(n - avail),
                     })
                 }
             },
             Byte::Cont => {
                 // Walk backwards looking for the lead byte. `checked` counts
                 // the steps taken, so it always equals `idx - start`.
                 let mut start = idx;
                 let mut checked = 0;
                 loop {
                     if start == 0 {
                         // Whoops, fell off the beginning.
                         return Some(Codepoint {
                             bytes: unsafe_slice(buf, 0, idx + 1),
                             rewind: idx,
                             meaning: Meaning::Suffix,
                         });
                     }

                     start -= 1;
                     checked += 1;
                     match otry!(Byte::classify(*buf.get_unchecked(start))) {
                         Byte::Cont => (),
                         Byte::Start(n) => {
                             if checked >= n {
                                 // The sequence announced by this lead byte
                                 // ends before `idx`, so the byte at `idx` is
                                 // a stray continuation byte that belongs to
                                 // no codepoint.
                                 return None;
                             }
                             let avail = buf.len() - start;
                             if avail >= n {
                                 let bytes = unsafe_slice(buf, start, n);
                                 // Bytes strictly between `start` and `idx`
                                 // were already seen to be continuation bytes
                                 // during the scan; verify the remainder.
                                 if !all_cont(unsafe_slice(bytes, checked, n-checked)) {
                                     return None;
                                 }
                                 let meaning = otry!(decode(bytes));
                                 return Some(Codepoint {
                                     bytes: bytes,
                                     rewind: idx - start,
                                     meaning: meaning,
                                 });
                             } else {
                                 return Some(Codepoint {
                                     bytes: unsafe_slice(buf, start, avail),
                                     rewind: idx - start,
                                     meaning: Meaning::Prefix(n - avail),
                                 });
                             }
                         }
                         // An ASCII byte cannot start a multi-byte sequence.
                         Byte::Ascii => return None,
                     }

                     if idx - start >= 3 {
                         // We looked at 3 bytes before a continuation byte
                         // and didn't find a start byte.
                         return None;
                     }
                 }
             }
         }
     }
 }

 #[cfg(test)]
 mod test;
	// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
	// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
	// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
	// option. This file may not be copied, modified, or distributed
	// except according to those terms.

	#![cfg_attr(test, feature(test))]

	#[macro_use]
	extern crate debug_unreachable;

	#[macro_use]
	extern crate mac;

	#[cfg(test)]
	extern crate test as std_test;

	use std::{slice, char};

	/// Meaning of a complete or partial UTF-8 codepoint.
	///
	/// Not all checking is performed eagerly. That is, a codepoint `Prefix` or
	/// `Suffix` may in reality have no valid completion.
	///
	/// The comparison traits are derived, so values order by variant (in the
	/// order declared below) before they order by payload.
	#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Hash)]
	pub enum Meaning {
	/// We found a whole codepoint.
	///
	/// This covers both single ASCII bytes and multi-byte sequences that
	/// decode to a valid `char`.
	Whole(char),

	/// We found something that isn't a valid Unicode codepoint, but
	/// it *would* correspond to a UTF-16 leading surrogate code unit,
	/// i.e. a value in the range `U+D800` - `U+DBFF`.
	///
	/// The argument is the code unit's 10-bit index within that range,
	/// i.e. the decoded value minus `0xD800`.
	///
	/// These are found in UTF-8 variants such as CESU-8 and WTF-8.
	LeadSurrogate(u16),

	/// We found something that isn't a valid Unicode codepoint, but
	/// it *would* correspond to a UTF-16 trailing surrogate code unit,
	/// i.e. a value in the range `U+DC00` - `U+DFFF`.
	///
	/// The argument is the code unit's 10-bit index within that range,
	/// i.e. the decoded value minus `0xDC00`.
	///
	/// These are found in UTF-8 variants such as CESU-8 and WTF-8.
	TrailSurrogate(u16),

	/// We found only a prefix of a codepoint before the buffer ended.
	///
	/// Includes the number of additional bytes needed: the length announced
	/// by the lead byte minus the bytes available from the lead byte to the
	/// end of the buffer.
	Prefix(usize),

	/// We found only a suffix of a codepoint before running off the
	/// start of the buffer.
	///
	/// Up to 3 more bytes may be needed.
	Suffix,
	}

	/// Represents a complete or partial UTF-8 codepoint.
	///
	/// This is the value produced by `classify`; it borrows from the buffer that
	/// was classified.
	#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Hash)]
	pub struct Codepoint<'a> {
	/// The bytes that make up the partial or full codepoint.
	///
	/// For a `Prefix` this runs from the lead byte to the end of the buffer.
	///
	/// For a `Suffix` this depends on `idx`: it is the buffer from its start
	/// up to and including `idx`. We don't scan forward for additional
	/// continuation bytes after the reverse scan failed to locate a
	/// multibyte sequence start.
	pub bytes: &'a [u8],

	/// Start of the codepoint in the buffer, expressed as an offset
	/// back from `idx`.
	///
	/// Zero when `idx` already points at the first byte of the codepoint.
	pub rewind: usize,

	/// Meaning of the partial or full codepoint.
	pub meaning: Meaning,
	}

	/// Structural role of a single byte inside a UTF-8 stream.
	#[derive(Debug, PartialEq, Eq)]
	enum Byte {
	    /// A one-byte codepoint (`0x00` - `0x7F`).
	    Ascii,
	    /// Lead byte of a multi-byte sequence; the payload is the total length
	    /// of the sequence in bytes (2, 3 or 4).
	    Start(usize),
	    /// A continuation byte (`0x80` - `0xBF`).
	    Cont,
	}

	impl Byte {
	    /// Classifies `x` purely by its high bits.
	    ///
	    /// Returns `None` for `0xF8` - `0xFF`, which fit none of the recognised
	    /// bit patterns. No overlong or range checking happens here, so e.g.
	    /// `0xC0` is still reported as `Start(2)`.
	    #[inline(always)]
	    fn classify(x: u8) -> Option<Byte> {
	        // The recognised classes occupy consecutive ranges of byte values,
	        // so walking the upper bounds in ascending order is equivalent to
	        // testing the individual bit patterns.
	        if x < 0x80 {
	            Some(Byte::Ascii) // 0xxx_xxxx
	        } else if x < 0xC0 {
	            Some(Byte::Cont) // 10xx_xxxx
	        } else if x < 0xE0 {
	            Some(Byte::Start(2)) // 110x_xxxx
	        } else if x < 0xF0 {
	            Some(Byte::Start(3)) // 1110_xxxx
	        } else if x < 0xF8 {
	            Some(Byte::Start(4)) // 1111_0xxx
	        } else {
	            None // 1111_1xxx
	        }
	    }
	}

	/// Reports whether every byte of `buf` is a UTF-8 continuation byte.
	///
	/// An empty slice yields `true`.
	#[inline(always)]
	fn all_cont(buf: &[u8]) -> bool {
	    // The closure delimiters were garbled as `\|`; plain `|` restores
	    // valid closure syntax.
	    buf.iter().all(|&b| matches!(Byte::classify(b), Some(Byte::Cont)))
	}

	/// Decodes one multi-byte UTF-8 sequence into its `Meaning`.
	///
	/// Returns `None` for overlong encodings and for values that
	/// `char::from_u32` rejects. Three-byte encodings of surrogate code units
	/// are not rejected; they are reported as `LeadSurrogate` / `TrailSurrogate`.
	///
	/// NOTE: Assumes the buffer is a syntactically valid multi-byte UTF-8 sequence:
	/// a starting byte followed by the correct number of continuation bytes.
	/// The marker bits are only masked off here, never re-validated.
	///
	/// # Safety
	///
	/// `buf.len()` must be 2, 3 or 4. This is checked only by debug assertions;
	/// any other length reaches `debug_unreachable!()`, which must never be
	/// executed.
	#[inline(always)]
	unsafe fn decode(buf: &[u8]) -> Option<Meaning> {
	    debug_assert!(buf.len() >= 2);
	    debug_assert!(buf.len() <= 4);
	    let n;
	    // Every `get_unchecked` index below is smaller than the length matched
	    // by the enclosing arm, so the reads stay in bounds.
	    match buf.len() {
	        2 => {
	            n = ((*buf.get_unchecked(0) & 0b11111) as u32) << 6
	                | ((*buf.get_unchecked(1) & 0x3F) as u32);
	            if n < 0x80 { return None } // Overlong
	        }
	        3 => {
	            n = ((*buf.get_unchecked(0) & 0b1111) as u32) << 12
	                | ((*buf.get_unchecked(1) & 0x3F) as u32) << 6
	                | ((*buf.get_unchecked(2) & 0x3F) as u32);
	            // `..=` replaces the deprecated `...` range-pattern syntax.
	            match n {
	                0x0000 ..= 0x07FF => return None, // Overlong
	                0xD800 ..= 0xDBFF => return Some(Meaning::LeadSurrogate(n as u16 - 0xD800)),
	                0xDC00 ..= 0xDFFF => return Some(Meaning::TrailSurrogate(n as u16 - 0xDC00)),
	                _ => {}
	            }
	        }
	        4 => {
	            n = ((*buf.get_unchecked(0) & 0b111) as u32) << 18
	                | ((*buf.get_unchecked(1) & 0x3F) as u32) << 12
	                | ((*buf.get_unchecked(2) & 0x3F) as u32) << 6
	                | ((*buf.get_unchecked(3) & 0x3F) as u32);
	            if n < 0x1_0000 { return None } // Overlong
	        }
	        _ => debug_unreachable!(),
	    }

	    // `from_u32` rejects values above U+10FFFF (possible with lead bytes
	    // 0xF4 - 0xF7), turning them into `None`.
	    char::from_u32(n).map(Meaning::Whole)
	}

	/// Returns the `new_len` bytes of `buf` that begin at offset `start`,
	/// skipping the bounds checks of ordinary slicing in release builds.
	///
	/// # Safety
	///
	/// The caller must guarantee `start <= buf.len()` and
	/// `new_len <= buf.len() - start`; both conditions are verified only by
	/// debug assertions.
	#[inline(always)]
	unsafe fn unsafe_slice<'a>(buf: &'a [u8], start: usize, new_len: usize) -> &'a [u8] {
	    debug_assert!(start <= buf.len());
	    debug_assert!(new_len <= (buf.len() - start));
	    // Under the contract above the resulting range lies inside `buf`, so
	    // the new slice may share `buf`'s lifetime.
	    let first = buf.as_ptr().add(start);
	    slice::from_raw_parts(first, new_len)
	}

	// `otry!(expr)` evaluates to the value inside an `Option`; it is used in this
	// file where a `None` has to abort the enclosing function.
	//
	// It simply forwards to `unwrap_or_return!` with `None` as the second
	// argument. That macro is not defined in this file (presumably it comes from
	// the `mac` crate imported with `#[macro_use]` -- confirm), so the early
	// return on `None` is inferred from its name and from how `classify` uses it.
	macro_rules! otry {
	($x:expr) => { unwrap_or_return!($x, None) }
	}

	/// Describes the UTF-8 codepoint containing the byte at index `idx` within
	/// `buf`.
	///
	/// Returns `None` if `idx` is out of range, or if `buf` contains invalid UTF-8
	/// in the vicinity of `idx`.
	///
	/// On success the result is one of:
	///
	/// * a whole codepoint or surrogate, with `bytes` covering the full
	///   sequence and `rewind` giving the distance from `idx` back to its
	///   first byte;
	/// * `Meaning::Prefix(k)` when the sequence is cut off by the end of `buf`
	///   and `k` more bytes are needed;
	/// * `Meaning::Suffix` when `idx` and everything before it (at most two
	///   bytes) are continuation bytes, so the lead byte lies before the buffer.
	///
	/// A continuation byte that is not covered by the nearest preceding lead
	/// byte (for example the last byte of `b"\xC3\x80\x80"`) is invalid UTF-8
	/// and yields `None`.
	#[inline]
	pub fn classify<'a>(buf: &'a [u8], idx: usize) -> Option<Codepoint<'a>> {
	    if idx >= buf.len() {
	        return None;
	    }

	    // SAFETY: `idx < buf.len()` was checked above. `start` begins at `idx`
	    // and only decreases, guarded against underflow by the `start == 0`
	    // test. Every `unsafe_slice` call either requests at most `avail` bytes
	    // from its offset, or sub-slices `bytes` (length `n`) within `0..n`.
	    // `decode` only ever receives exactly `n` bytes with `n` in 2..=4.
	    unsafe {
	        let x = *buf.get_unchecked(idx);
	        match otry!(Byte::classify(x)) {
	            Byte::Ascii => Some(Codepoint {
	                bytes: unsafe_slice(buf, idx, 1),
	                rewind: 0,
	                meaning: Meaning::Whole(x as char),
	            }),
	            Byte::Start(n) => {
	                let avail = buf.len() - idx;
	                if avail >= n {
	                    let bytes = unsafe_slice(buf, idx, n);
	                    if !all_cont(unsafe_slice(bytes, 1, n-1)) {
	                        return None;
	                    }
	                    let meaning = otry!(decode(bytes));
	                    Some(Codepoint {
	                        bytes: bytes,
	                        rewind: 0,
	                        meaning: meaning,
	                    })
	                } else {
	                    // The buffer ends before the sequence does.
	                    Some(Codepoint {
	                        bytes: unsafe_slice(buf, idx, avail),
	                        rewind: 0,
	                        meaning: Meaning::Prefix(n - avail),
	                    })
	                }
	            },
	            Byte::Cont => {
	                // Walk backwards looking for the lead byte. `checked` counts
	                // the steps taken, so it always equals `idx - start`.
	                let mut start = idx;
	                let mut checked = 0;
	                loop {
	                    if start == 0 {
	                        // Whoops, fell off the beginning.
	                        return Some(Codepoint {
	                            bytes: unsafe_slice(buf, 0, idx + 1),
	                            rewind: idx,
	                            meaning: Meaning::Suffix,
	                        });
	                    }

	                    start -= 1;
	                    checked += 1;
	                    match otry!(Byte::classify(*buf.get_unchecked(start))) {
	                        Byte::Cont => (),
	                        Byte::Start(n) => {
	                            if checked >= n {
	                                // The sequence announced by this lead byte
	                                // ends before `idx`, so the byte at `idx` is
	                                // a stray continuation byte that belongs to
	                                // no codepoint.
	                                return None;
	                            }
	                            let avail = buf.len() - start;
	                            if avail >= n {
	                                let bytes = unsafe_slice(buf, start, n);
	                                // Bytes strictly between `start` and `idx`
	                                // were already seen to be continuation bytes
	                                // during the scan; verify the remainder.
	                                if !all_cont(unsafe_slice(bytes, checked, n-checked)) {
	                                    return None;
	                                }
	                                let meaning = otry!(decode(bytes));
	                                return Some(Codepoint {
	                                    bytes: bytes,
	                                    rewind: idx - start,
	                                    meaning: meaning,
	                                });
	                            } else {
	                                return Some(Codepoint {
	                                    bytes: unsafe_slice(buf, start, avail),
	                                    rewind: idx - start,
	                                    meaning: Meaning::Prefix(n - avail),
	                                });
	                            }
	                        }
	                        // An ASCII byte cannot start a multi-byte sequence.
	                        Byte::Ascii => return None,
	                    }

	                    if idx - start >= 3 {
	                        // We looked at 3 bytes before a continuation byte
	                        // and didn't find a start byte.
	                        return None;
	                    }
	                }
	            }
	        }
	    }
	}

	#[cfg(test)]
	mod test;