| // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
| // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
| // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your |
| // option. This file may not be copied, modified, or distributed |
| // except according to those terms. |
| |
| //! Streams of tendrils. |
| |
| use fmt; |
| use tendril::{Atomicity, NonAtomic, Tendril}; |
| |
| use std::borrow::Cow; |
| use std::fs::File; |
| use std::io; |
| use std::marker::PhantomData; |
| use std::path::Path; |
| |
| #[cfg(feature = "encoding")] |
| use encoding; |
| #[cfg(feature = "encoding_rs")] |
| use encoding_rs::{self, DecoderResult}; |
| use utf8; |
| |
| /// Trait for types that can process a tendril. |
| /// |
| /// This is a "push" interface, unlike the "pull" interface of |
| /// `Iterator<Item=Tendril<F>>`. The push interface matches |
| /// [html5ever][] and other incremental parsers with a similar |
| /// architecture. |
| /// |
| /// [html5ever]: https://github.com/servo/html5ever |
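///
/// For illustration, a minimal sketch of implementing and driving a sink;
/// the `Concat` type below is hypothetical and not part of this crate:
///
/// ```ignore
/// use std::borrow::Cow;
/// use tendril::{fmt, StrTendril, TendrilSink};
///
/// // Hypothetical sink that concatenates every chunk pushed into it.
/// struct Concat(StrTendril);
///
/// impl TendrilSink<fmt::UTF8> for Concat {
///     fn process(&mut self, t: StrTendril) {
///         self.0.push_tendril(&t);
///     }
///
///     fn error(&mut self, desc: Cow<'static, str>) {
///         eprintln!("error: {}", desc);
///     }
///
///     type Output = StrTendril;
///
///     fn finish(self) -> StrTendril {
///         self.0
///     }
/// }
///
/// let out = Concat(StrTendril::new()).from_iter(vec!["Hello, ", "world!"]);
/// assert_eq!(&*out, "Hello, world!");
/// ```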
| pub trait TendrilSink<F, A = NonAtomic> |
| where |
| F: fmt::Format, |
| A: Atomicity, |
| { |
| /// Process this tendril. |
| fn process(&mut self, t: Tendril<F, A>); |
| |
| /// Indicates that an error has occurred. |
| fn error(&mut self, desc: Cow<'static, str>); |
| |
| /// What the overall result of processing is. |
| type Output; |
| |
| /// Indicates the end of the stream. |
| fn finish(self) -> Self::Output; |
| |
| /// Process one tendril and finish. |
| fn one<T>(mut self, t: T) -> Self::Output |
| where |
| Self: Sized, |
| T: Into<Tendril<F, A>>, |
| { |
| self.process(t.into()); |
| self.finish() |
| } |
| |
| /// Consume an iterator of tendrils, processing each item, then finish. |
| fn from_iter<I>(mut self, i: I) -> Self::Output |
| where |
| Self: Sized, |
| I: IntoIterator, |
| I::Item: Into<Tendril<F, A>>, |
| { |
| for t in i { |
| self.process(t.into()) |
| } |
| self.finish() |
| } |
| |
| /// Read from the given stream of bytes until exhaustion and process incrementally, |
| /// then finish. Return `Err` at the first I/O error. |
| fn read_from<R>(mut self, r: &mut R) -> io::Result<Self::Output> |
| where |
| Self: Sized, |
| R: io::Read, |
| F: fmt::SliceFormat<Slice = [u8]>, |
| { |
| const BUFFER_SIZE: u32 = 4 * 1024; |
| loop { |
| let mut tendril = Tendril::<F, A>::new(); |
            // FIXME: this exposes uninitialized bytes to a generic R type.
            // That is fine for R = File, which never reads these bytes,
            // but user-defined readers might.
            // The standard library pushes zeros to `Vec<u8>` for this reason.
| unsafe { |
| tendril.push_uninitialized(BUFFER_SIZE); |
| } |
| loop { |
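                // Retry reads interrupted by a signal; propagate any other error to the caller.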
| match r.read(&mut tendril) { |
| Ok(0) => return Ok(self.finish()), |
| Ok(n) => { |
| tendril.pop_back(BUFFER_SIZE - n as u32); |
| self.process(tendril); |
| break; |
| } |
| Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {} |
| Err(e) => return Err(e), |
| } |
| } |
| } |
| } |
| |
| /// Read from the file at the given path and process incrementally, |
| /// then finish. Return `Err` at the first I/O error. |
| fn from_file<P>(self, path: P) -> io::Result<Self::Output> |
| where |
| Self: Sized, |
| P: AsRef<Path>, |
| F: fmt::SliceFormat<Slice = [u8]>, |
| { |
| self.read_from(&mut File::open(path)?) |
| } |
| } |
| |
/// A `TendrilSink` adaptor that takes bytes, decodes them as UTF-8,
/// lossily replacing ill-formed byte sequences with U+FFFD replacement characters,
/// and emits Unicode (`StrTendril`).
///
/// This does not allocate memory: the output consists of either subtendrils of the input
/// or inline tendrils holding a single code point.
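///
/// For illustration, a rough sketch of decoding byte chunks where a multi-byte
/// sequence is split across chunk boundaries; the `Concat` sink below is
/// hypothetical and not part of this crate:
///
/// ```ignore
/// use std::borrow::Cow;
/// use tendril::{fmt, StrTendril, TendrilSink};
/// use tendril::stream::Utf8LossyDecoder;
///
/// // Hypothetical sink that concatenates every chunk pushed into it.
/// struct Concat(StrTendril);
///
/// impl TendrilSink<fmt::UTF8> for Concat {
///     fn process(&mut self, t: StrTendril) { self.0.push_tendril(&t); }
///     fn error(&mut self, _: Cow<'static, str>) {}
///     type Output = StrTendril;
///     fn finish(self) -> StrTendril { self.0 }
/// }
///
/// // The three-byte sequence for U+A66E is split across the two chunks;
/// // the decoder buffers the incomplete prefix until the rest arrives.
/// let chunks: &[&[u8]] = &[b"xy\xEA", b"\x99\xAEzw"];
/// let out = Utf8LossyDecoder::new(Concat(StrTendril::new()))
///     .from_iter(chunks.iter().cloned());
/// assert_eq!(&*out, "xy\u{A66E}zw");
/// ```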
| pub struct Utf8LossyDecoder<Sink, A = NonAtomic> |
| where |
| Sink: TendrilSink<fmt::UTF8, A>, |
| A: Atomicity, |
| { |
| pub inner_sink: Sink, |
| incomplete: Option<utf8::Incomplete>, |
| marker: PhantomData<A>, |
| } |
| |
| impl<Sink, A> Utf8LossyDecoder<Sink, A> |
| where |
| Sink: TendrilSink<fmt::UTF8, A>, |
| A: Atomicity, |
| { |
| /// Create a new incremental UTF-8 decoder. |
| #[inline] |
| pub fn new(inner_sink: Sink) -> Self { |
| Utf8LossyDecoder { |
            inner_sink,
| incomplete: None, |
| marker: PhantomData, |
| } |
| } |
| } |
| |
| impl<Sink, A> TendrilSink<fmt::Bytes, A> for Utf8LossyDecoder<Sink, A> |
| where |
| Sink: TendrilSink<fmt::UTF8, A>, |
| A: Atomicity, |
| { |
| #[inline] |
| fn process(&mut self, mut t: Tendril<fmt::Bytes, A>) { |
| // FIXME: remove take() and map() when non-lexical borrows are stable. |
| if let Some(mut incomplete) = self.incomplete.take() { |
| let resume_at = incomplete.try_complete(&t).map(|(result, rest)| { |
| match result { |
| Ok(s) => self.inner_sink.process(Tendril::from_slice(s)), |
| Err(_) => { |
| self.inner_sink.error("invalid byte sequence".into()); |
| self.inner_sink |
| .process(Tendril::from_slice(utf8::REPLACEMENT_CHARACTER)); |
| } |
| } |
| t.len() - rest.len() |
| }); |
| match resume_at { |
| None => { |
| self.incomplete = Some(incomplete); |
| return; |
| } |
| Some(resume_at) => t.pop_front(resume_at as u32), |
| } |
| } |
| while !t.is_empty() { |
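            // Decode, but reduce the borrowed results to plain lengths and offsets
            // so that the borrow of `t` ends before `t` is split, popped, or moved below.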
| let unborrowed_result = match utf8::decode(&t) { |
| Ok(s) => { |
| debug_assert!(s.as_ptr() == t.as_ptr()); |
| debug_assert!(s.len() == t.len()); |
| Ok(()) |
| } |
| Err(utf8::DecodeError::Invalid { |
| valid_prefix, |
| invalid_sequence, |
| .. |
| }) => { |
| debug_assert!(valid_prefix.as_ptr() == t.as_ptr()); |
| debug_assert!(valid_prefix.len() <= t.len()); |
| Err(( |
| valid_prefix.len(), |
| Err(valid_prefix.len() + invalid_sequence.len()), |
| )) |
| } |
| Err(utf8::DecodeError::Incomplete { |
| valid_prefix, |
| incomplete_suffix, |
| }) => { |
| debug_assert!(valid_prefix.as_ptr() == t.as_ptr()); |
| debug_assert!(valid_prefix.len() <= t.len()); |
| Err((valid_prefix.len(), Ok(incomplete_suffix))) |
| } |
| }; |
| match unborrowed_result { |
| Ok(()) => { |
| unsafe { self.inner_sink.process(t.reinterpret_without_validating()) } |
| return; |
| } |
| Err((valid_len, and_then)) => { |
| if valid_len > 0 { |
| let subtendril = t.subtendril(0, valid_len as u32); |
| unsafe { |
| self.inner_sink |
| .process(subtendril.reinterpret_without_validating()) |
| } |
| } |
| match and_then { |
| Ok(incomplete) => { |
| self.incomplete = Some(incomplete); |
| return; |
| } |
| Err(offset) => { |
| self.inner_sink.error("invalid byte sequence".into()); |
| self.inner_sink |
| .process(Tendril::from_slice(utf8::REPLACEMENT_CHARACTER)); |
| t.pop_front(offset as u32); |
| } |
| } |
| } |
| } |
| } |
| } |
| |
| #[inline] |
| fn error(&mut self, desc: Cow<'static, str>) { |
| self.inner_sink.error(desc); |
| } |
| |
| type Output = Sink::Output; |
| |
| #[inline] |
| fn finish(mut self) -> Sink::Output { |
| if self.incomplete.is_some() { |
| self.inner_sink |
| .error("incomplete byte sequence at end of stream".into()); |
| self.inner_sink |
| .process(Tendril::from_slice(utf8::REPLACEMENT_CHARACTER)); |
| } |
| self.inner_sink.finish() |
| } |
| } |
| |
| /// A `TendrilSink` adaptor that takes bytes, decodes them as the given character encoding, |
/// lossily replacing ill-formed byte sequences with U+FFFD replacement characters,
| /// and emits Unicode (`StrTendril`). |
| /// |
| /// This allocates new tendrils for encodings other than UTF-8. |
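///
/// For illustration, a sketch using the `encoding_rs` constructor (this needs the
/// `encoding_rs` Cargo feature); the `Concat` sink below is hypothetical and not
/// part of this crate:
///
/// ```ignore
/// use std::borrow::Cow;
/// use tendril::{fmt, StrTendril, TendrilSink};
/// use tendril::stream::LossyDecoder;
///
/// // Hypothetical sink that concatenates every chunk pushed into it.
/// struct Concat(StrTendril);
///
/// impl TendrilSink<fmt::UTF8> for Concat {
///     fn process(&mut self, t: StrTendril) { self.0.push_tendril(&t); }
///     fn error(&mut self, _: Cow<'static, str>) {}
///     type Output = StrTendril;
///     fn finish(self) -> StrTendril { self.0 }
/// }
///
/// // KOI8-U bytes for "Энергия", fed in two chunks.
/// let chunks: &[&[u8]] = &[b"\xfc\xce", b"\xc5\xd2\xc7\xc9\xd1"];
/// let decoder = LossyDecoder::new_encoding_rs(encoding_rs::KOI8_U, Concat(StrTendril::new()));
/// let out = decoder.from_iter(chunks.iter().cloned());
/// assert_eq!(&*out, "Энергия");
/// ```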
| #[cfg(any(feature = "encoding", feature = "encoding_rs"))] |
| pub struct LossyDecoder<Sink, A = NonAtomic> |
| where |
| Sink: TendrilSink<fmt::UTF8, A>, |
| A: Atomicity, |
| { |
| inner: LossyDecoderInner<Sink, A>, |
| } |
| |
| #[cfg(any(feature = "encoding", feature = "encoding_rs"))] |
| enum LossyDecoderInner<Sink, A> |
| where |
| Sink: TendrilSink<fmt::UTF8, A>, |
| A: Atomicity, |
| { |
| Utf8(Utf8LossyDecoder<Sink, A>), |
| #[cfg(feature = "encoding")] |
| Encoding(Box<encoding::RawDecoder>, Sink), |
| #[cfg(feature = "encoding_rs")] |
| EncodingRs(encoding_rs::Decoder, Sink), |
| } |
| |
| #[cfg(any(feature = "encoding", feature = "encoding_rs"))] |
| impl<Sink, A> LossyDecoder<Sink, A> |
| where |
| Sink: TendrilSink<fmt::UTF8, A>, |
| A: Atomicity, |
| { |
| /// Create a new incremental decoder using the encoding crate. |
| #[cfg(feature = "encoding")] |
| #[inline] |
| pub fn new(encoding: encoding::EncodingRef, sink: Sink) -> Self { |
| if encoding.name() == "utf-8" { |
| LossyDecoder::utf8(sink) |
| } else { |
| LossyDecoder { |
| inner: LossyDecoderInner::Encoding(encoding.raw_decoder(), sink), |
| } |
| } |
| } |
| |
| /// Create a new incremental decoder using the encoding_rs crate. |
| #[cfg(feature = "encoding_rs")] |
| #[inline] |
| pub fn new_encoding_rs(encoding: &'static encoding_rs::Encoding, sink: Sink) -> Self { |
| if encoding == encoding_rs::UTF_8 { |
| return Self::utf8(sink); |
| } |
| Self { |
| inner: LossyDecoderInner::EncodingRs(encoding.new_decoder(), sink), |
| } |
| } |
| |
| /// Create a new incremental decoder for the UTF-8 encoding. |
| /// |
    /// This is useful for content that is only known at run-time to be UTF-8
    /// (whereas `Utf8LossyDecoder` requires knowing this at compile-time).
| #[inline] |
| pub fn utf8(sink: Sink) -> LossyDecoder<Sink, A> { |
| LossyDecoder { |
| inner: LossyDecoderInner::Utf8(Utf8LossyDecoder::new(sink)), |
| } |
| } |
| |
    /// Returns a reference to the inner sink.
| pub fn inner_sink(&self) -> &Sink { |
| match self.inner { |
| LossyDecoderInner::Utf8(ref utf8) => &utf8.inner_sink, |
| #[cfg(feature = "encoding")] |
| LossyDecoderInner::Encoding(_, ref inner_sink) => inner_sink, |
| #[cfg(feature = "encoding_rs")] |
| LossyDecoderInner::EncodingRs(_, ref inner_sink) => inner_sink, |
| } |
| } |
| |
    /// Returns a mutable reference to the inner sink.
| pub fn inner_sink_mut(&mut self) -> &mut Sink { |
| match self.inner { |
| LossyDecoderInner::Utf8(ref mut utf8) => &mut utf8.inner_sink, |
| #[cfg(feature = "encoding")] |
| LossyDecoderInner::Encoding(_, ref mut inner_sink) => inner_sink, |
| #[cfg(feature = "encoding_rs")] |
| LossyDecoderInner::EncodingRs(_, ref mut inner_sink) => inner_sink, |
| } |
| } |
| } |
| |
| #[cfg(any(feature = "encoding", feature = "encoding_rs"))] |
| impl<Sink, A> TendrilSink<fmt::Bytes, A> for LossyDecoder<Sink, A> |
| where |
| Sink: TendrilSink<fmt::UTF8, A>, |
| A: Atomicity, |
| { |
| #[inline] |
| fn process(&mut self, t: Tendril<fmt::Bytes, A>) { |
| match self.inner { |
| LossyDecoderInner::Utf8(ref mut utf8) => return utf8.process(t), |
| #[cfg(feature = "encoding")] |
| LossyDecoderInner::Encoding(ref mut decoder, ref mut sink) => { |
| let mut out = Tendril::new(); |
| let mut t = t; |
| loop { |
| match decoder.raw_feed(&*t, &mut out) { |
| (_, Some(err)) => { |
| out.push_char('\u{fffd}'); |
| sink.error(err.cause); |
| debug_assert!(err.upto >= 0); |
| t.pop_front(err.upto as u32); |
| // continue loop and process remainder of t |
| } |
| (_, None) => break, |
| } |
| } |
| if out.len() > 0 { |
| sink.process(out); |
| } |
| } |
| #[cfg(feature = "encoding_rs")] |
| LossyDecoderInner::EncodingRs(ref mut decoder, ref mut sink) => { |
| if t.is_empty() { |
| return; |
| } |
| decode_to_sink(t, decoder, sink, false); |
| } |
| } |
| } |
| |
| #[inline] |
| fn error(&mut self, desc: Cow<'static, str>) { |
| match self.inner { |
| LossyDecoderInner::Utf8(ref mut utf8) => utf8.error(desc), |
| #[cfg(feature = "encoding")] |
| LossyDecoderInner::Encoding(_, ref mut sink) => sink.error(desc), |
| #[cfg(feature = "encoding_rs")] |
| LossyDecoderInner::EncodingRs(_, ref mut sink) => sink.error(desc), |
| } |
| } |
| |
| type Output = Sink::Output; |
| |
| #[inline] |
| fn finish(self) -> Sink::Output { |
| match self.inner { |
| LossyDecoderInner::Utf8(utf8) => return utf8.finish(), |
| #[cfg(feature = "encoding")] |
| LossyDecoderInner::Encoding(mut decoder, mut sink) => { |
| let mut out = Tendril::new(); |
| if let Some(err) = decoder.raw_finish(&mut out) { |
| out.push_char('\u{fffd}'); |
| sink.error(err.cause); |
| } |
| if out.len() > 0 { |
| sink.process(out); |
| } |
| sink.finish() |
| } |
| #[cfg(feature = "encoding_rs")] |
| LossyDecoderInner::EncodingRs(mut decoder, mut sink) => { |
| decode_to_sink(Tendril::new(), &mut decoder, &mut sink, true); |
| sink.finish() |
| } |
| } |
| } |
| } |
| |
| #[cfg(feature = "encoding_rs")] |
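/// Incrementally decode the bytes of `t` with `decoder`, pushing decoded UTF-8
/// into `sink`. Each malformed sequence is reported through `sink.error` and
/// replaced with a U+FFFD replacement character. `last` indicates that `t` is
/// the final (possibly empty) input of the stream.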
| fn decode_to_sink<Sink, A>( |
| mut t: Tendril<fmt::Bytes, A>, |
| decoder: &mut encoding_rs::Decoder, |
| sink: &mut Sink, |
| last: bool, |
| ) where |
| Sink: TendrilSink<fmt::UTF8, A>, |
| A: Atomicity, |
| { |
| loop { |
| let mut out = <Tendril<fmt::Bytes, A>>::new(); |
| let max_len = decoder |
| .max_utf8_buffer_length_without_replacement(t.len()) |
| .unwrap_or(8192); |
| unsafe { |
| out.push_uninitialized(std::cmp::min(max_len as u32, 8192)); |
| } |
| let (result, bytes_read, bytes_written) = |
| decoder.decode_to_utf8_without_replacement(&t, &mut out, last); |
| if bytes_written > 0 { |
| sink.process(unsafe { |
| out.subtendril(0, bytes_written as u32) |
| .reinterpret_without_validating() |
| }); |
| } |
| match result { |
| DecoderResult::InputEmpty => return, |
| DecoderResult::OutputFull => {} |
| DecoderResult::Malformed(_, _) => { |
| sink.error(Cow::Borrowed("invalid sequence")); |
| sink.process("\u{FFFD}".into()); |
| } |
| } |
| t.pop_front(bytes_read as u32); |
| if t.is_empty() { |
| return; |
| } |
| } |
| } |
| |
| #[cfg(test)] |
| mod test { |
| use super::{TendrilSink, Utf8LossyDecoder}; |
| use fmt; |
| use std::borrow::Cow; |
| use tendril::{Atomicity, NonAtomic, Tendril}; |
| |
| #[cfg(any(feature = "encoding", feature = "encoding_rs"))] |
| use super::LossyDecoder; |
| #[cfg(any(feature = "encoding", feature = "encoding_rs"))] |
| use tendril::SliceExt; |
| |
| #[cfg(feature = "encoding")] |
| use encoding::all as enc; |
| #[cfg(feature = "encoding_rs")] |
| use encoding_rs as enc_rs; |
| |
| struct Accumulate<A> |
| where |
| A: Atomicity, |
| { |
| tendrils: Vec<Tendril<fmt::UTF8, A>>, |
| errors: Vec<String>, |
| } |
| |
| impl<A> Accumulate<A> |
| where |
| A: Atomicity, |
| { |
| fn new() -> Accumulate<A> { |
| Accumulate { |
| tendrils: vec![], |
| errors: vec![], |
| } |
| } |
| } |
| |
| impl<A> TendrilSink<fmt::UTF8, A> for Accumulate<A> |
| where |
| A: Atomicity, |
| { |
| fn process(&mut self, t: Tendril<fmt::UTF8, A>) { |
| self.tendrils.push(t); |
| } |
| |
| fn error(&mut self, desc: Cow<'static, str>) { |
| self.errors.push(desc.into_owned()); |
| } |
| |
| type Output = (Vec<Tendril<fmt::UTF8, A>>, Vec<String>); |
| |
| fn finish(self) -> Self::Output { |
| (self.tendrils, self.errors) |
| } |
| } |
| |
| fn check_utf8(input: &[&[u8]], expected: &[&str], errs: usize) { |
| let decoder = Utf8LossyDecoder::new(Accumulate::<NonAtomic>::new()); |
| let (tendrils, errors) = decoder.from_iter(input.iter().cloned()); |
| assert_eq!( |
| expected, |
| &*tendrils.iter().map(|t| &**t).collect::<Vec<_>>() |
| ); |
| assert_eq!(errs, errors.len()); |
| } |
| |
| #[test] |
| fn utf8() { |
| check_utf8(&[], &[], 0); |
| check_utf8(&[b""], &[], 0); |
| check_utf8(&[b"xyz"], &["xyz"], 0); |
| check_utf8(&[b"x", b"y", b"z"], &["x", "y", "z"], 0); |
| |
| check_utf8(&[b"xy\xEA\x99\xAEzw"], &["xy\u{a66e}zw"], 0); |
| check_utf8(&[b"xy\xEA", b"\x99\xAEzw"], &["xy", "\u{a66e}z", "w"], 0); |
| check_utf8(&[b"xy\xEA\x99", b"\xAEzw"], &["xy", "\u{a66e}z", "w"], 0); |
| check_utf8( |
| &[b"xy\xEA", b"\x99", b"\xAEzw"], |
| &["xy", "\u{a66e}z", "w"], |
| 0, |
| ); |
| check_utf8(&[b"\xEA", b"", b"\x99", b"", b"\xAE"], &["\u{a66e}"], 0); |
| check_utf8( |
| &[b"", b"\xEA", b"", b"\x99", b"", b"\xAE", b""], |
| &["\u{a66e}"], |
| 0, |
| ); |
| |
| check_utf8( |
| &[b"xy\xEA", b"\xFF", b"\x99\xAEz"], |
| &["xy", "\u{fffd}", "\u{fffd}", "\u{fffd}", "\u{fffd}", "z"], |
| 4, |
| ); |
| check_utf8( |
| &[b"xy\xEA\x99", b"\xFFz"], |
| &["xy", "\u{fffd}", "\u{fffd}", "z"], |
| 2, |
| ); |
| |
| check_utf8(&[b"\xC5\x91\xC5\x91\xC5\x91"], &["őőő"], 0); |
| check_utf8( |
| &[b"\xC5\x91", b"\xC5\x91", b"\xC5\x91"], |
| &["ő", "ő", "ő"], |
| 0, |
| ); |
| check_utf8( |
| &[b"\xC5", b"\x91\xC5", b"\x91\xC5", b"\x91"], |
| &["ő", "ő", "ő"], |
| 0, |
| ); |
| check_utf8( |
| &[b"\xC5", b"\x91\xff", b"\x91\xC5", b"\x91"], |
| &["ő", "\u{fffd}", "\u{fffd}", "ő"], |
| 2, |
| ); |
| |
| // incomplete char at end of input |
| check_utf8(&[b"\xC0"], &["\u{fffd}"], 1); |
| check_utf8(&[b"\xEA\x99"], &["\u{fffd}"], 1); |
| } |
| |
| #[cfg(any(feature = "encoding", feature = "encoding_rs"))] |
| fn check_decode( |
| mut decoder: LossyDecoder<Accumulate<NonAtomic>>, |
| input: &[&[u8]], |
| expected: &str, |
| errs: usize, |
| ) { |
| for x in input { |
| decoder.process(x.to_tendril()); |
| } |
| let (tendrils, errors) = decoder.finish(); |
| let mut tendril: Tendril<fmt::UTF8> = Tendril::new(); |
| for t in tendrils { |
| tendril.push_tendril(&t); |
| } |
| assert_eq!(expected, &*tendril); |
| assert_eq!(errs, errors.len()); |
| } |
| |
| #[cfg(any(feature = "encoding", feature = "encoding_rs"))] |
| pub type Tests = &'static [(&'static [&'static [u8]], &'static str, usize)]; |
| |
| #[cfg(any(feature = "encoding"))] |
| const ASCII: Tests = &[ |
| (&[], "", 0), |
| (&[b""], "", 0), |
| (&[b"xyz"], "xyz", 0), |
| (&[b"xy", b"", b"", b"z"], "xyz", 0), |
| (&[b"x", b"y", b"z"], "xyz", 0), |
| (&[b"\xFF"], "\u{fffd}", 1), |
| (&[b"x\xC0yz"], "x\u{fffd}yz", 1), |
| (&[b"x", b"\xC0y", b"z"], "x\u{fffd}yz", 1), |
| (&[b"x\xC0yz\xFF\xFFw"], "x\u{fffd}yz\u{fffd}\u{fffd}w", 3), |
| ]; |
| |
| #[cfg(feature = "encoding")] |
| #[test] |
| fn decode_ascii() { |
| for &(input, expected, errs) in ASCII { |
| let decoder = LossyDecoder::new(enc::ASCII, Accumulate::new()); |
| check_decode(decoder, input, expected, errs); |
| } |
| } |
| |
| #[cfg(any(feature = "encoding", feature = "encoding_rs"))] |
| const UTF_8: Tests = &[ |
| (&[], "", 0), |
| (&[b""], "", 0), |
| (&[b"xyz"], "xyz", 0), |
| (&[b"x", b"y", b"z"], "xyz", 0), |
| (&[b"\xEA\x99\xAE"], "\u{a66e}", 0), |
| (&[b"\xEA", b"\x99\xAE"], "\u{a66e}", 0), |
| (&[b"\xEA\x99", b"\xAE"], "\u{a66e}", 0), |
| (&[b"\xEA", b"\x99", b"\xAE"], "\u{a66e}", 0), |
| (&[b"\xEA", b"", b"\x99", b"", b"\xAE"], "\u{a66e}", 0), |
| ( |
| &[b"", b"\xEA", b"", b"\x99", b"", b"\xAE", b""], |
| "\u{a66e}", |
| 0, |
| ), |
| (&[b"xy\xEA", b"\x99\xAEz"], "xy\u{a66e}z", 0), |
| ( |
| &[b"xy\xEA", b"\xFF", b"\x99\xAEz"], |
| "xy\u{fffd}\u{fffd}\u{fffd}\u{fffd}z", |
| 4, |
| ), |
| (&[b"xy\xEA\x99", b"\xFFz"], "xy\u{fffd}\u{fffd}z", 2), |
| // incomplete char at end of input |
| (&[b"\xC0"], "\u{fffd}", 1), |
| (&[b"\xEA\x99"], "\u{fffd}", 1), |
| ]; |
| |
| #[cfg(feature = "encoding")] |
| #[test] |
| fn decode_utf8() { |
| for &(input, expected, errs) in UTF_8 { |
| let decoder = LossyDecoder::new(enc::UTF_8, Accumulate::new()); |
| check_decode(decoder, input, expected, errs); |
| } |
| } |
| |
| #[cfg(feature = "encoding_rs")] |
| #[test] |
| fn decode_utf8_encoding_rs() { |
| for &(input, expected, errs) in UTF_8 { |
| let decoder = LossyDecoder::new_encoding_rs(enc_rs::UTF_8, Accumulate::new()); |
| check_decode(decoder, input, expected, errs); |
| } |
| } |
| |
| #[cfg(any(feature = "encoding", feature = "encoding_rs"))] |
| const KOI8_U: Tests = &[ |
| (&[b"\xfc\xce\xc5\xd2\xc7\xc9\xd1"], "Энергия", 0), |
| (&[b"\xfc\xce", b"\xc5\xd2\xc7\xc9\xd1"], "Энергия", 0), |
| (&[b"\xfc\xce", b"\xc5\xd2\xc7", b"\xc9\xd1"], "Энергия", 0), |
| ( |
| &[b"\xfc\xce", b"", b"\xc5\xd2\xc7", b"\xc9\xd1", b""], |
| "Энергия", |
| 0, |
| ), |
| ]; |
| |
| #[cfg(feature = "encoding")] |
| #[test] |
| fn decode_koi8_u() { |
| for &(input, expected, errs) in KOI8_U { |
| let decoder = LossyDecoder::new(enc::KOI8_U, Accumulate::new()); |
| check_decode(decoder, input, expected, errs); |
| } |
| } |
| |
| #[cfg(feature = "encoding_rs")] |
| #[test] |
| fn decode_koi8_u_encoding_rs() { |
| for &(input, expected, errs) in KOI8_U { |
| let decoder = LossyDecoder::new_encoding_rs(enc_rs::KOI8_U, Accumulate::new()); |
| check_decode(decoder, input, expected, errs); |
| } |
| } |
| |
| #[cfg(any(feature = "encoding", feature = "encoding_rs"))] |
| const WINDOWS_949: Tests = &[ |
| (&[], "", 0), |
| (&[b""], "", 0), |
| (&[b"\xbe\xc8\xb3\xe7"], "안녕", 0), |
| (&[b"\xbe", b"\xc8\xb3\xe7"], "안녕", 0), |
| (&[b"\xbe", b"", b"\xc8\xb3\xe7"], "안녕", 0), |
| ( |
| &[b"\xbe\xc8\xb3\xe7\xc7\xcf\xbc\xbc\xbf\xe4"], |
| "안녕하세요", |
| 0, |
| ), |
| (&[b"\xbe\xc8\xb3\xe7\xc7"], "안녕\u{fffd}", 1), |
| (&[b"\xbe", b"", b"\xc8\xb3"], "안\u{fffd}", 1), |
| (&[b"\xbe\x28\xb3\xe7"], "\u{fffd}(녕", 1), |
| ]; |
| |
| #[cfg(feature = "encoding")] |
| #[test] |
| fn decode_windows_949() { |
| for &(input, expected, errs) in WINDOWS_949 { |
| let decoder = LossyDecoder::new(enc::WINDOWS_949, Accumulate::new()); |
| check_decode(decoder, input, expected, errs); |
| } |
| } |
| |
| #[cfg(feature = "encoding_rs")] |
| #[test] |
| fn decode_windows_949_encoding_rs() { |
| for &(input, expected, errs) in WINDOWS_949 { |
| let decoder = LossyDecoder::new_encoding_rs(enc_rs::EUC_KR, Accumulate::new()); |
| check_decode(decoder, input, expected, errs); |
| } |
| } |
| |
| #[test] |
| fn read_from() { |
| let decoder = Utf8LossyDecoder::new(Accumulate::<NonAtomic>::new()); |
| let mut bytes: &[u8] = b"foo\xffbar"; |
| let (tendrils, errors) = decoder.read_from(&mut bytes).unwrap(); |
| assert_eq!( |
| &*tendrils.iter().map(|t| &**t).collect::<Vec<_>>(), |
| &["foo", "\u{FFFD}", "bar"] |
| ); |
| assert_eq!(errors, &["invalid byte sequence"]); |
| } |
| } |