blob: 2ff04bbca4ffa4c97e5e14229ab4625fbe691077 [file] [log] [blame] [edit]
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
//! Marker types for formats.
//!
//! This module defines the types and traits used to mark a `Tendril`
//! with the format of data it contains. It includes those formats
//! for which `Tendril` supports at least some operations without
//! conversion.
//!
//! To convert a string tendril to/from a byte tendril in an arbitrary
//! character encoding, see the `encode` and `decode` methods on
//! `Tendril`.
//!
//! `Tendril` operations may become memory-unsafe if data invalid for
//! the format sneaks in. For that reason, these traits require
//! `unsafe impl`.
use std::default::Default;
use std::{char, mem, str};
use futf::{self, Codepoint, Meaning};
/// Implementation details.
///
/// You don't need these unless you are implementing
/// a new format.
pub mod imp {
    use std::default::Default;
    use std::{iter, slice};

    /// Describes how to fix up encodings when concatenating.
    ///
    /// We can drop characters on either side of the splice,
    /// and insert up to 4 bytes in the middle.
    pub struct Fixup {
        /// Amount to drop from the left-hand side of the splice.
        pub drop_left: u32,
        /// Amount to drop from the right-hand side of the splice.
        pub drop_right: u32,
        /// Number of leading bytes of `insert_bytes` that are to be inserted.
        pub insert_len: u32,
        /// Storage for the bytes to insert in the middle of the splice.
        pub insert_bytes: [u8; 4],
    }

    impl Default for Fixup {
        /// A fixup that drops nothing and inserts nothing.
        #[inline(always)]
        fn default() -> Fixup {
            Fixup {
                drop_left: 0,
                drop_right: 0,
                insert_len: 0,
                insert_bytes: [0; 4],
            }
        }
    }

    /// Iterator over a byte buffer that yields each byte's index together
    /// with the `char` whose code point equals the byte's value.
    pub struct SingleByteCharIndices<'a> {
        inner: iter::Enumerate<slice::Iter<'a, u8>>,
    }

    impl<'a> Iterator for SingleByteCharIndices<'a> {
        type Item = (usize, char);

        #[inline]
        fn next(&mut self) -> Option<(usize, char)> {
            // Every `u8` value is a valid Unicode scalar value
            // (U+0000..=U+00FF), so the safe `From<u8>` conversion yields
            // exactly the same `char` as reinterpreting the widened integer,
            // without needing any `unsafe`.
            self.inner.next().map(|(i, &b)| (i, char::from(b)))
        }
    }

    impl<'a> SingleByteCharIndices<'a> {
        /// Create an iterator over the bytes of `buf`.
        #[inline]
        pub fn new(buf: &'a [u8]) -> SingleByteCharIndices<'a> {
            SingleByteCharIndices {
                inner: buf.iter().enumerate(),
            }
        }
    }
}
/// Trait for format marker types.
///
/// The type implementing this trait is usually not instantiated.
/// It's used with a phantom type parameter of `Tendril`.
pub unsafe trait Format {
/// Check whether the buffer is valid for this format.
fn validate(buf: &[u8]) -> bool;
/// Check whether the buffer is valid for this format.
///
/// You may assume the buffer is a prefix of a valid buffer.
#[inline]
fn validate_prefix(buf: &[u8]) -> bool {
<Self as Format>::validate(buf)
}
/// Check whether the buffer is valid for this format.
///
/// You may assume the buffer is a suffix of a valid buffer.
#[inline]
fn validate_suffix(buf: &[u8]) -> bool {
<Self as Format>::validate(buf)
}
/// Check whether the buffer is valid for this format.
///
/// You may assume the buffer is a contiguous subsequence
/// of a valid buffer, but not necessarily a prefix or
/// a suffix.
#[inline]
fn validate_subseq(buf: &[u8]) -> bool {
<Self as Format>::validate(buf)
}
/// Compute any fixup needed when concatenating buffers.
///
/// The default is to do nothing.
///
/// The function is `unsafe` because it may assume the input
/// buffers are already valid for the format. Also, no
/// bounds-checking is performed on the return value!
#[inline(always)]
unsafe fn fixup(_lhs: &[u8], _rhs: &[u8]) -> imp::Fixup {
Default::default()
}
}
/// Indicates that one format is a subset of another.
///
/// Data in the subset format can be converted to the superset format
/// for free.
pub unsafe trait SubsetOf<Super: Format>: Format {
    /// Validate the *other* direction of conversion: check whether a
    /// buffer from the superset format also conforms to the subset
    /// format.
    ///
    /// Unless overridden this simply runs the subset format's own
    /// `validate`; implementations may substitute a check that is
    /// cheaper than validating from scratch.
    fn revalidate_subset(x: &[u8]) -> bool {
        <Self as Format>::validate(x)
    }
}
/// Indicates a format which corresponds to a Rust slice type,
/// representing exactly the same invariants.
///
/// Implementors name that slice type through the associated `Slice` type.
pub unsafe trait SliceFormat: Format + Sized {
/// The slice type corresponding to this format. It may be unsized and
/// must implement the `Slice` trait.
type Slice: ?Sized + Slice;
}
/// Indicates a format which contains characters from Unicode
/// (all of it, or some proper subset).
///
/// The lifetime `'a` is the lifetime of the buffer borrowed by
/// `char_indices`.
pub unsafe trait CharFormat<'a>: Format {
/// Iterator for characters and their byte indices.
type Iter: Iterator<Item = (usize, char)>;
/// Iterate over the characters of the string and their byte
/// indices.
///
/// You may assume the buffer is *already validated* for `Format`.
///
/// # Safety
///
/// Implementations are allowed to rely on `buf` having been validated
/// for this format, so callers must only pass validated buffers.
unsafe fn char_indices(buf: &'a [u8]) -> Self::Iter;
/// Encode the character as bytes and pass them to a continuation.
///
/// `cont` is an `FnOnce`, so it is invoked at most once; it receives
/// the encoded bytes as a borrowed slice.
///
/// Returns `Err(())` iff the character cannot be represented.
fn encode_char<F>(ch: char, cont: F) -> Result<(), ()>
where
F: FnOnce(&[u8]);
}
/// Indicates a Rust slice type that is represented in memory as bytes.
pub unsafe trait Slice {
/// Access the raw bytes of the slice.
fn as_bytes(&self) -> &[u8];
/// Convert a byte slice to this kind of slice.
///
/// You may assume the buffer is *already validated*
/// for `Format`.
///
/// # Safety
///
/// Implementations may skip validation, so callers must only pass
/// bytes that have already been validated for the corresponding format.
unsafe fn from_bytes(x: &[u8]) -> &Self;
/// Convert a mutable byte slice to a mutable slice of this kind.
///
/// You may assume the buffer is *already validated*
/// for `Format`.
///
/// # Safety
///
/// Implementations may skip validation, so callers must only pass
/// bytes that have already been validated for the corresponding format.
unsafe fn from_mut_bytes(x: &mut [u8]) -> &mut Self;
}
/// Marker type for uninterpreted bytes.
///
/// Any byte sequence is acceptable, so validation never fails for
/// this format.
#[derive(Copy, Clone, Default, Debug)]
pub struct Bytes;

unsafe impl Format for Bytes {
    /// Accepts every buffer without inspecting it.
    #[inline(always)]
    fn validate(_buf: &[u8]) -> bool {
        true
    }
}
// `Bytes` accepts every buffer (its `validate` always returns `true`),
// so it is paired with the plain byte slice type.
unsafe impl SliceFormat for Bytes {
type Slice = [u8];
}
// A byte slice is its own byte representation, so every conversion
// below hands back its argument unchanged.
unsafe impl Slice for [u8] {
    /// Returns the slice itself.
    #[inline(always)]
    fn as_bytes(&self) -> &[u8] {
        self
    }

    /// Returns `x` unchanged.
    #[inline(always)]
    unsafe fn from_bytes(x: &[u8]) -> &Self {
        x
    }

    /// Returns `x` unchanged.
    #[inline(always)]
    unsafe fn from_mut_bytes(x: &mut [u8]) -> &mut Self {
        x
    }
}
/// Marker type for ASCII text.
#[derive(Copy, Clone, Default, Debug)]
pub struct ASCII;

unsafe impl Format for ASCII {
    /// A buffer is valid ASCII when no byte exceeds 127.
    #[inline]
    fn validate(buf: &[u8]) -> bool {
        buf.is_ascii()
    }

    // The three checks below may assume the buffer was cut out of an
    // already-valid buffer, so they accept unconditionally.

    /// Always `true`.
    #[inline(always)]
    fn validate_prefix(_buf: &[u8]) -> bool {
        true
    }

    /// Always `true`.
    #[inline(always)]
    fn validate_suffix(_buf: &[u8]) -> bool {
        true
    }

    /// Always `true`.
    #[inline(always)]
    fn validate_subseq(_buf: &[u8]) -> bool {
        true
    }
}
// ASCII is declared a subset of both `UTF8` and `Latin1`. Neither impl
// overrides `revalidate_subset`, so checking the reverse direction falls
// back to `ASCII::validate`.
unsafe impl SubsetOf<UTF8> for ASCII {}
unsafe impl SubsetOf<Latin1> for ASCII {}
unsafe impl<'a> CharFormat<'a> for ASCII {
    type Iter = imp::SingleByteCharIndices<'a>;

    /// Yields each byte's index together with the `char` of the same
    /// code point.
    #[inline]
    unsafe fn char_indices(buf: &'a [u8]) -> Self::Iter {
        imp::SingleByteCharIndices::new(buf)
    }

    /// Passes the single byte encoding `ch` to `cont`.
    ///
    /// Returns `Err(())` without calling `cont` when `ch` is above
    /// U+007F.
    #[inline]
    fn encode_char<F>(ch: char, cont: F) -> Result<(), ()>
    where
        F: FnOnce(&[u8]),
    {
        if ch.is_ascii() {
            cont(&[ch as u8]);
            Ok(())
        } else {
            Err(())
        }
    }
}
/// Marker type for UTF-8 text.
#[derive(Copy, Clone, Default, Debug)]
pub struct UTF8;

unsafe impl Format for UTF8 {
    /// A buffer is valid when `str::from_utf8` accepts it.
    #[inline]
    fn validate(buf: &[u8]) -> bool {
        str::from_utf8(buf).is_ok()
    }

    /// Given a prefix of a valid buffer, only the final character can be
    /// cut short, so only the code point containing the last byte is
    /// classified. An empty buffer is accepted.
    #[inline]
    fn validate_prefix(buf: &[u8]) -> bool {
        match buf.len() {
            0 => true,
            len => matches!(
                futf::classify(buf, len - 1),
                Some(Codepoint {
                    meaning: Meaning::Whole(_),
                    ..
                })
            ),
        }
    }

    /// Given a suffix of a valid buffer, only the first character can be
    /// cut short, so only the code point containing the first byte is
    /// classified. An empty buffer is accepted.
    #[inline]
    fn validate_suffix(buf: &[u8]) -> bool {
        buf.is_empty()
            || matches!(
                futf::classify(buf, 0),
                Some(Codepoint {
                    meaning: Meaning::Whole(_),
                    ..
                })
            )
    }

    /// A subsequence may be cut at either end, so both edge checks must
    /// pass.
    #[inline]
    fn validate_subseq(buf: &[u8]) -> bool {
        Self::validate_prefix(buf) && Self::validate_suffix(buf)
    }
}
// `UTF8` is declared a subset of `WTF8`. `revalidate_subset` is not
// overridden, so the reverse check falls back to `UTF8::validate`.
unsafe impl SubsetOf<WTF8> for UTF8 {}
// The slice type paired with `UTF8` is `str`.
unsafe impl SliceFormat for UTF8 {
type Slice = str;
}
unsafe impl Slice for str {
#[inline(always)]
fn as_bytes(&self) -> &[u8] {
str::as_bytes(self)
}
#[inline(always)]
unsafe fn from_bytes(x: &[u8]) -> &str {
str::from_utf8_unchecked(x)
}
#[inline(always)]
unsafe fn from_mut_bytes(x: &mut [u8]) -> &mut str {
mem::transmute(x)
}
}
unsafe impl<'a> CharFormat<'a> for UTF8 {
    type Iter = str::CharIndices<'a>;

    /// Views `buf` as a `str` without re-validating it and iterates over
    /// its characters and their byte offsets.
    #[inline]
    unsafe fn char_indices(buf: &'a [u8]) -> str::CharIndices<'a> {
        let text: &'a str = str::from_utf8_unchecked(buf);
        text.char_indices()
    }

    /// Encodes `ch` as UTF-8 and passes the bytes to `cont`.
    ///
    /// This implementation always returns `Ok(())`.
    #[inline]
    fn encode_char<F>(ch: char, cont: F) -> Result<(), ()>
    where
        F: FnOnce(&[u8]),
    {
        // Four bytes is the longest UTF-8 encoding of a `char`.
        let mut scratch = [0_u8; 4];
        let encoded: &str = ch.encode_utf8(&mut scratch);
        cont(encoded.as_bytes());
        Ok(())
    }
}
/// Marker type for WTF-8 text.
///
/// See the [WTF-8 spec](https://simonsapin.github.io/wtf-8/).
///
/// In addition to whole code points, validation for this format accepts
/// lead and trail surrogates, except that a trail surrogate may not
/// directly follow a lead surrogate.
#[derive(Copy, Clone, Default, Debug)]
pub struct WTF8;
/// Returns `true` when `m` is a classification WTF-8 accepts on its own:
/// a whole code point, a lead surrogate, or a trail surrogate.
#[inline]
fn wtf8_meaningful(m: Meaning) -> bool {
    matches!(
        m,
        Meaning::Whole(_) | Meaning::LeadSurrogate(_) | Meaning::TrailSurrogate(_)
    )
}
unsafe impl Format for WTF8 {
    /// Walks the buffer one classified code point at a time.
    ///
    /// The buffer is rejected if any position fails to classify, if a
    /// code point is not `wtf8_meaningful`, or if a trail surrogate
    /// directly follows a lead surrogate.
    #[inline]
    fn validate(buf: &[u8]) -> bool {
        let mut pos = 0;
        let mut after_lead = false;
        while pos < buf.len() {
            let cp = unwrap_or_return!(futf::classify(buf, pos), false);
            if !wtf8_meaningful(cp.meaning) {
                return false;
            }
            match cp.meaning {
                // A lead/trail pair must not be stored as two surrogates.
                Meaning::TrailSurrogate(_) if after_lead => return false,
                Meaning::LeadSurrogate(_) => after_lead = true,
                _ => after_lead = false,
            }
            pos += cp.bytes.len();
        }
        true
    }

    /// Given a prefix of a valid buffer, only the code point containing
    /// the last byte is classified. An empty buffer is accepted.
    #[inline]
    fn validate_prefix(buf: &[u8]) -> bool {
        buf.is_empty()
            || futf::classify(buf, buf.len() - 1).map_or(false, |c| wtf8_meaningful(c.meaning))
    }

    /// Given a suffix of a valid buffer, only the code point containing
    /// the first byte is classified. An empty buffer is accepted.
    #[inline]
    fn validate_suffix(buf: &[u8]) -> bool {
        buf.is_empty() || futf::classify(buf, 0).map_or(false, |c| wtf8_meaningful(c.meaning))
    }

    /// A subsequence may be cut at either end, so both edge checks must
    /// pass.
    #[inline]
    fn validate_subseq(buf: &[u8]) -> bool {
        Self::validate_prefix(buf) && Self::validate_suffix(buf)
    }

    /// Computes the fixup for concatenating `lhs` and `rhs`.
    ///
    /// When `lhs` ends with a lead surrogate and `rhs` begins with a
    /// trail surrogate, the result drops 3 bytes from each side and
    /// inserts the UTF-8 encoding of the combined code point. In every
    /// other case the default (no-op) fixup is returned.
    ///
    /// # Panics
    ///
    /// Panics with "WTF8: internal error" if the combined value is not a
    /// valid `char`.
    #[inline]
    unsafe fn fixup(lhs: &[u8], rhs: &[u8]) -> imp::Fixup {
        const ERR: &'static str = "WTF8: internal error";

        // Anything shorter than 3 bytes is never examined for surrogates.
        if lhs.len() < 3 || rhs.len() < 3 {
            return Default::default();
        }

        let left_edge = futf::classify(lhs, lhs.len() - 1);
        let right_edge = futf::classify(rhs, 0);
        match (left_edge, right_edge) {
            (
                Some(Codepoint {
                    meaning: Meaning::LeadSurrogate(hi),
                    ..
                }),
                Some(Codepoint {
                    meaning: Meaning::TrailSurrogate(lo),
                    ..
                }),
            ) => {
                // NOTE(review): this arithmetic presumably relies on
                // `futf` reporting surrogate payloads as 10-bit offsets;
                // confirm against the `futf` documentation.
                let scalar = 0x10000 + ((hi as u32) << 10) + (lo as u32);
                let ch = char::from_u32(scalar).expect(ERR);
                let mut insert_bytes = [0_u8; 4];
                let insert_len = ch.encode_utf8(&mut insert_bytes).len() as u32;
                imp::Fixup {
                    drop_left: 3,
                    drop_right: 3,
                    insert_len,
                    insert_bytes,
                }
            }
            _ => Default::default(),
        }
    }
}
/// Marker type for the single-byte encoding of the first 256 Unicode codepoints.
///
/// This is IANA's "ISO-8859-1". It's ISO's "ISO 8859-1" with the addition of the
/// C0 and C1 control characters from ECMA-48 / ISO 6429.
///
/// Not to be confused with WHATWG's "latin1" or "iso8859-1" labels (or the
/// many other aliases), which actually stand for Windows-1252.
#[derive(Copy, Clone, Default, Debug)]
pub struct Latin1;

// Every byte value is accepted, so all four checks succeed without
// looking at the buffer.
unsafe impl Format for Latin1 {
    /// Always `true`.
    #[inline(always)]
    fn validate(_buf: &[u8]) -> bool {
        true
    }

    /// Always `true`.
    #[inline(always)]
    fn validate_prefix(_buf: &[u8]) -> bool {
        true
    }

    /// Always `true`.
    #[inline(always)]
    fn validate_suffix(_buf: &[u8]) -> bool {
        true
    }

    /// Always `true`.
    #[inline(always)]
    fn validate_subseq(_buf: &[u8]) -> bool {
        true
    }
}
unsafe impl<'a> CharFormat<'a> for Latin1 {
    type Iter = imp::SingleByteCharIndices<'a>;

    /// Yields each byte's index together with the `char` of the same
    /// code point.
    #[inline]
    unsafe fn char_indices(buf: &'a [u8]) -> Self::Iter {
        imp::SingleByteCharIndices::new(buf)
    }

    /// Passes the single byte encoding `ch` to `cont`.
    ///
    /// Returns `Err(())` without calling `cont` when `ch` is above
    /// U+00FF.
    #[inline]
    fn encode_char<F>(ch: char, cont: F) -> Result<(), ()>
    where
        F: FnOnce(&[u8]),
    {
        match ch as u32 {
            code @ 0..=0xFF => {
                cont(&[code as u8]);
                Ok(())
            }
            _ => Err(()),
        }
    }
}