blob: 8eac6927a9a3c67a855f83c1fad9dd94cc491b48 [file] [log] [blame] [edit]
// Copyright © 2018 Phil Booth
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may
// not use this file except in compliance with the License. You may obtain
// a copy of the License at:
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
//! Detects and classifies
//! [Unicode byte-order marks](https://en.wikipedia.org/wiki/Byte_order_mark).
//!
//! ## Usage
//!
//! ```
//! use unicode_bom::Bom;
//!
//! // Detect the UTF-32 (little-endian) BOM in a file on disk
//! let bom: Bom = "fixtures/utf32-le.txt".parse().unwrap();
//! assert_eq!(bom, Bom::Utf32Le);
//! assert_eq!(bom.len(), 4);
//!
//! // Detect the UTF-16 (little-endian) BOM in a file on disk
//! let bom: Bom = "fixtures/utf16-le.txt".parse().unwrap();
//! assert_eq!(bom, Bom::Utf16Le);
//! assert_eq!(bom.len(), 2);
//!
//! // Detect no BOM in a file on disk
//! let bom: Bom = "fixtures/ascii.txt".parse().unwrap();
//! assert_eq!(bom, Bom::Null);
//! assert_eq!(bom.len(), 0);
//!
//! // Detect the BOM in a byte array
//! let bytes = [0u8, 0u8, 0xfeu8, 0xffu8];
//! assert_eq!(Bom::from(&bytes[0..]), Bom::Utf32Be);
//! ```
use std::fmt::{self, Display, Formatter};
use std::fs::File;
use std::io::{Error, ErrorKind, Read};
use std::str::FromStr;
#[cfg(test)]
mod test;
/// Unicode byte-order mark (BOM) abstraction.
#[derive(Clone, Copy, Debug, PartialEq)]
pub enum Bom {
/// Indicates no BOM was detected.
Null,
/// Indicates [BOCU-1](https://www.unicode.org/notes/tn6/) BOM was detected.
Bocu1,
/// Indicates [GB 18030](https://en.wikipedia.org/wiki/GB_18030) BOM was detected.
Gb18030,
/// Indicates [SCSU](https://www.unicode.org/reports/tr6/) BOM was detected.
Scsu,
/// Indicates [UTF-EBCIDC](https://www.unicode.org/reports/tr16/) BOM was detected.
UtfEbcdic,
/// Indicates [UTF-1](https://en.wikipedia.org/wiki/UTF-1) BOM was detected.
Utf1,
/// Indicates [UTF-7](https://tools.ietf.org/html/rfc2152) BOM was detected.
Utf7,
/// Indicates [UTF-8](https://tools.ietf.org/html/rfc3629) BOM was detected.
Utf8,
/// Indicates [UTF-16](https://tools.ietf.org/html/rfc2781) (big-endian) BOM was detected.
Utf16Be,
/// Indicates [UTF-16](https://tools.ietf.org/html/rfc2781) (little-endian) BOM was detected.
Utf16Le,
/// Indicates [UTF-32](https://www.unicode.org/reports/tr19/) (big-endian) BOM was detected.
Utf32Be,
/// Indicates [UTF-32](https://www.unicode.org/reports/tr19/) (little-endian) BOM was detected.
Utf32Le,
}
impl Bom {
/// Returns the size in bytes of the BOM.
pub fn len(&self) -> usize {
match *self {
Bom::Null => 0,
Bom::Bocu1 => 3,
Bom::Gb18030 => 4,
Bom::Scsu => 3,
Bom::UtfEbcdic => 4,
Bom::Utf1 => 3,
Bom::Utf7 => 4,
Bom::Utf8 => 3,
Bom::Utf16Be => 2,
Bom::Utf16Le => 2,
Bom::Utf32Be => 4,
Bom::Utf32Le => 4,
}
}
}
impl AsRef<str> for Bom {
/// Returns a `&str` representation of the BOM type.
fn as_ref(&self) -> &str {
match *self {
Bom::Null => "[not set]",
Bom::Bocu1 => "BOCU-1",
Bom::Gb18030 => "GB 18030",
Bom::Scsu => "SCSU",
Bom::UtfEbcdic => "UTF-EBCDIC",
Bom::Utf1 => "UTF-1",
Bom::Utf7 => "UTF-7",
Bom::Utf8 => "UTF-8",
Bom::Utf16Be => "UTF-16 (big-endian)",
Bom::Utf16Le => "UTF-16 (little-endian)",
Bom::Utf32Be => "UTF-32 (big-endian)",
Bom::Utf32Le => "UTF-32 (little-endian)",
}
}
}
impl AsRef<[u8]> for Bom {
/// Returns the BOM byte-array literal.
///
/// Note that for UTF-7,
/// only the first three bytes of the BOM are returned.
/// That's because the last two bits of the fourth byte
/// belong to the following character,
/// so it's impossible to return the fourth byte
/// without further context.
/// Possible values for the missing fourth byte
/// are `0x38`, `0x39`, `0x2a` and `0x2b`.
fn as_ref(&self) -> &[u8] {
match *self {
Bom::Null => &[],
Bom::Bocu1 => &[0xfb, 0xee, 0x28],
Bom::Gb18030 => &[0x84, 0x31, 0x95, 0x33],
Bom::Scsu => &[0x0e, 0xfe, 0xff],
Bom::UtfEbcdic => &[0xdd, 0x73, 0x66, 0x73],
Bom::Utf1 => &[0xf7, 0x64, 0x4c],
Bom::Utf7 => &[0x2b, 0x2f, 0x76],
Bom::Utf8 => &[0xef, 0xbb, 0xbf],
Bom::Utf16Be => &[0xfe, 0xff],
Bom::Utf16Le => &[0xff, 0xfe],
Bom::Utf32Be => &[0, 0, 0xfe, 0xff],
Bom::Utf32Le => &[0xff, 0xfe, 0, 0],
}
}
}
impl Default for Bom {
/// Returns the default/empty BOM type, `Bom::Null`.
fn default() -> Self {
Bom::Null
}
}
impl Display for Bom {
/// Formats the BOM type as a `String`.
fn fmt(&self, formatter: &mut Formatter) -> fmt::Result {
write!(formatter, "{}", AsRef::<str>::as_ref(self))
}
}
impl Eq for Bom {}
macro_rules! compare_tail {
($slice:ident, $bytes:expr) => {
compare_tail!($slice, $bytes, 1)
};
($slice:ident, $bytes:expr, $from:expr) => {
compare_tail!($slice, $bytes.len() + $from, $bytes, $from)
};
($slice:ident, $len:expr, $bytes:expr, $from:expr) => {
$slice.len() >= $len && $slice[$from..$from + $bytes.len()] == $bytes
};
}
impl From<&[u8]> for Bom {
/// Detect the BOM type from a byte array.
fn from(slice: &[u8]) -> Self {
if slice.len() >= 2 {
match slice[0] {
0 => {
if compare_tail!(slice, [0, 0xfe, 0xff]) {
return Bom::Utf32Be;
}
}
0x0e => {
if compare_tail!(slice, [0xfe, 0xff]) {
return Bom::Scsu;
}
}
0x2b => {
if compare_tail!(slice, 4, [0x2f, 0x76], 1)
&& (slice[3] == 0x38
|| slice[3] == 0x39
|| slice[3] == 0x2b
|| slice[3] == 0x2f)
{
return Bom::Utf7;
}
}
0x84 => {
if compare_tail!(slice, [0x31, 0x95, 0x33]) {
return Bom::Gb18030;
}
}
0xdd => {
if compare_tail!(slice, [0x73, 0x66, 0x73]) {
return Bom::UtfEbcdic;
}
}
0xef => {
if compare_tail!(slice, [0xbb, 0xbf]) {
return Bom::Utf8;
}
}
0xf7 => {
if compare_tail!(slice, [0x64, 0x4c]) {
return Bom::Utf1;
}
}
0xfb => {
if compare_tail!(slice, [0xee, 0x28]) {
return Bom::Bocu1;
}
}
0xfe => {
if slice[1] == 0xff {
return Bom::Utf16Be;
}
}
0xff => {
if slice[1] == 0xfe {
if compare_tail!(slice, [0, 0], 2) {
return Bom::Utf32Le;
}
return Bom::Utf16Le;
}
}
_ => {}
}
}
Bom::Null
}
}
impl From<&mut File> for Bom {
/// Detect the BOM type from a `File` instance.
///
/// Note that I/O errors are swallowed by this method.
/// Instead the default type, `Bom::Null`,
/// will be returned.
fn from(file: &mut File) -> Self {
let mut data = [0u8; 4];
let mut result = file.read_exact(&mut data);
if let Err(ref error) = result {
if error.kind() == ErrorKind::UnexpectedEof {
let short_data = [0u8; 3];
result = file.read_exact(&mut data);
if let Err(ref error) = result {
if error.kind() == ErrorKind::UnexpectedEof {
let short_data = [0u8; 2];
result = file.read_exact(&mut data);
data[0] = short_data[0];
data[1] = short_data[1];
}
} else {
data[0] = short_data[0];
data[1] = short_data[1];
data[2] = short_data[2];
}
}
}
if result.is_ok() {
Bom::from(&data[0..])
} else {
Bom::Null
}
}
}
impl FromStr for Bom {
/// A `std::io::Error` instance returned by `std::fs::File::open`.
type Err = Error;
/// Parse the BOM type from the file located at `path`.
fn from_str(path: &str) -> Result<Self, Self::Err> {
let mut file = File::open(path)?;
Ok(Bom::from(&mut file))
}
}