| // Copyright Mozilla Foundation. See the COPYRIGHT |
| // file at the top-level directory of this distribution. |
| // |
| // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
| // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
| // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your |
| // option. This file may not be copied, modified, or distributed |
| // except according to those terms. |
| |
/// Expands to one decoder method, `$name`, that processes `src` a byte at a
/// time and writes `$code_unit`s into `dst`.
///
/// The generated method is
/// `pub fn $name(&mut $slf, src: &[u8], dst: &mut [$code_unit], last: bool)
/// -> (DecoderResult, usize, usize)`. The two `usize` values are whatever
/// the source reports as consumed and the destination reports as written.
///
/// Caller-supplied pieces:
/// * `$preamble` is expanded at the top of the outer loop and
///   `$loop_preamble` at the top of every inner-loop iteration.
/// * `$eof` is expanded when the source has nothing left and `last` is
///   `true`; `$src_consumed` is bound at that point. If `$eof` does not
///   return, the method returns `DecoderResult::InputEmpty`.
/// * `$destination_check` names the method called on `$dest` to obtain
///   `$destination_handle`; when it reports `Space::Full` the method returns
///   `DecoderResult::OutputFull` without reading the byte.
/// * `$body` is expanded once per byte with `$b` and `$unread_handle` bound
///   to the result of reading the source handle.
macro_rules! decoder_function {
    ($preamble:block,
     $loop_preamble:block,
     $eof:block,
     $body:block,
     $slf:ident,
     $src_consumed:ident,
     $dest:ident,
     $source:ident,
     $b:ident,
     $destination_handle:ident,
     $unread_handle:ident,
     $destination_check:ident,
     $name:ident,
     $code_unit:ty,
     $dest_struct:ident) => (
    pub fn $name(&mut $slf,
                 src: &[u8],
                 dst: &mut [$code_unit],
                 last: bool)
                 -> (DecoderResult, usize, usize) {
        let mut $source = ByteSource::new(src);
        let mut $dest = $dest_struct::new(dst);
        loop { // TODO: remove this loop
            {
                // Caller-supplied code
                $preamble
            }
            loop {
                {
                    // Caller-supplied code
                    $loop_preamble
                }
                // Input first: bail out when there is nothing to read.
                let readable = match $source.check_available() {
                    Space::Full($src_consumed) => {
                        if last {
                            // Caller-supplied code
                            $eof
                        }
                        return (DecoderResult::InputEmpty, $src_consumed, $dest.written());
                    }
                    Space::Available(available) => available,
                };
                // Output second: the byte is only read once there is room.
                match $dest.$destination_check() {
                    Space::Full(dst_written) => {
                        return (DecoderResult::OutputFull,
                                readable.consumed(),
                                dst_written);
                    }
                    Space::Available($destination_handle) => {
                        let ($b, $unread_handle) = readable.read();
                        // Caller-supplied code
                        $body
                    }
                }
            }
        }
    });
}
| |
/// Expands `decoder_function!` twice with the same caller-supplied blocks and
/// identifiers: once as `decode_to_utf8_raw` (`u8` output through
/// `Utf8Destination`) and once as `decode_to_utf16_raw` (`u16` output through
/// `Utf16Destination`).
macro_rules! decoder_functions {
    (
        $preamble:block,
        $loop_preamble:block,
        $eof:block,
        $body:block,
        $slf:ident,
        $src_consumed:ident,
        $dest:ident,
        $source:ident,
        $b:ident,
        $destination_handle:ident,
        $unread_handle:ident,
        $destination_check:ident
    ) => {
        // UTF-8 output
        decoder_function!(
            $preamble, $loop_preamble, $eof, $body,
            $slf, $src_consumed, $dest, $source, $b,
            $destination_handle, $unread_handle, $destination_check,
            decode_to_utf8_raw, u8, Utf8Destination
        );
        // UTF-16 output
        decoder_function!(
            $preamble, $loop_preamble, $eof, $body,
            $slf, $src_consumed, $dest, $source, $b,
            $destination_handle, $unread_handle, $destination_check,
            decode_to_utf16_raw, u16, Utf16Destination
        );
    };
}
| |
/// Expands to one decoder method, `$name`, for an encoding whose non-ASCII
/// characters are a lead byte followed by a trail byte.
///
/// The generated method is
/// `pub fn $name(&mut $slf, src: &[u8], dst: &mut [$code_unit], last: bool)
/// -> (DecoderResult, usize, usize)` and keeps a lead byte that was split
/// from its trail across calls in `$slf.lead`.
///
/// Caller-supplied pieces:
/// * `$lead` is evaluated with `$non_ascii` bound; its value is bound to
///   `$lead_minus_offset` (and is what gets stored in `$slf.lead`).
/// * `$trail` is evaluated with `$byte`, `$unread_handle_trail`,
///   `$lead_minus_offset` and the destination handle `$handle` bound; its
///   value is used as the destination for the code that follows.
/// * `$outermost` is the label of the outer loop, passed in by the caller.
/// * `$copy_ascii` / `$destination_check` name the destination methods used
///   for the ASCII fast path and for space checks.
/// * `$ascii_punctuation` enables staying in the innermost byte loop after
///   writing an ASCII byte below 60.
///
/// A pending lead is only dropped once its trail byte has actually been
/// read (or when `last` turns the dangling lead into `Malformed(1, 0)`).
/// Early returns taken before the trail is read put the lead back so that
/// the next call can still complete the sequence.
macro_rules! ascii_compatible_two_byte_decoder_function {
    ($lead:block,
     $trail:block,
     $slf:ident,
     $non_ascii:ident,
     $byte:ident,
     $lead_minus_offset:ident,
     $unread_handle_trail:ident,
     $source:ident,
     $handle:ident,
     $outermost:tt,
     $copy_ascii:ident,
     $destination_check:ident,
     $name:ident,
     $code_unit:ty,
     $dest_struct:ident,
     $ascii_punctuation:expr) => (
    pub fn $name(&mut $slf,
                 src: &[u8],
                 dst: &mut [$code_unit],
                 last: bool)
                 -> (DecoderResult, usize, usize) {
        let mut $source = ByteSource::new(src);
        let mut dest_prolog = $dest_struct::new(dst);
        let dest = match $slf.lead {
            Some(lead) => {
                let $lead_minus_offset = lead;
                $slf.lead = None;
                // Since we don't have `goto` we could use to jump into the trail
                // handling part of the main loop, we need to repeat trail handling
                // here.
                match $source.check_available() {
                    Space::Full(src_consumed_prolog) => {
                        if last {
                            return (DecoderResult::Malformed(1, 0),
                                    src_consumed_prolog,
                                    dest_prolog.written());
                        }
                        // No trail byte yet: keep the lead for the next call,
                        // exactly as the main loop below does.
                        $slf.lead = Some($lead_minus_offset);
                        return (DecoderResult::InputEmpty,
                                src_consumed_prolog,
                                dest_prolog.written());
                    }
                    Space::Available(source_handle_prolog) => {
                        match dest_prolog.$destination_check() {
                            Space::Full(dst_written_prolog) => {
                                // The trail byte has not been read, so the
                                // lead is still pending.
                                $slf.lead = Some($lead_minus_offset);
                                return (DecoderResult::OutputFull,
                                        source_handle_prolog.consumed(),
                                        dst_written_prolog);
                            }
                            Space::Available($handle) => {
                                let ($byte, $unread_handle_trail) = source_handle_prolog.read();
                                // Start non-boilerplate
                                $trail
                                // End non-boilerplate
                            }
                        }
                    }
                }
            },
            None => {
                &mut dest_prolog
            }
        };
        $outermost: loop {
            match dest.$copy_ascii(&mut $source) {
                CopyAsciiResult::Stop(ret) => return ret,
                CopyAsciiResult::GoOn((mut $non_ascii, mut $handle)) => {
                    'middle: loop {
                        let dest_again = {
                            let $lead_minus_offset = {
                                // Start non-boilerplate
                                $lead
                                // End non-boilerplate
                            };
                            match $source.check_available() {
                                Space::Full(src_consumed_trail) => {
                                    if last {
                                        return (DecoderResult::Malformed(1, 0),
                                                src_consumed_trail,
                                                $handle.written());
                                    }
                                    $slf.lead = Some($lead_minus_offset);
                                    return (DecoderResult::InputEmpty,
                                            src_consumed_trail,
                                            $handle.written());
                                }
                                Space::Available(source_handle_trail) => {
                                    let ($byte, $unread_handle_trail) = source_handle_trail.read();
                                    // Start non-boilerplate
                                    $trail
                                    // End non-boilerplate
                                }
                            }
                        };
                        match $source.check_available() {
                            Space::Full(src_consumed) => {
                                return (DecoderResult::InputEmpty,
                                        src_consumed,
                                        dest_again.written());
                            }
                            Space::Available(source_handle) => {
                                match dest_again.$destination_check() {
                                    Space::Full(dst_written) => {
                                        return (DecoderResult::OutputFull,
                                                source_handle.consumed(),
                                                dst_written);
                                    }
                                    Space::Available(mut destination_handle) => {
                                        let (mut b, unread_handle) = source_handle.read();
                                        let source_again = unread_handle.commit();
                                        'innermost: loop {
                                            if b > 127 {
                                                // Another lead byte: go around
                                                // the middle loop again.
                                                $non_ascii = b;
                                                $handle = destination_handle;
                                                continue 'middle;
                                            }
                                            // Testing on Haswell says that we should write the
                                            // byte unconditionally instead of trying to unread it
                                            // to make it part of the next SIMD stride.
                                            let dest_again_again =
                                                destination_handle.write_ascii(b);
                                            if $ascii_punctuation && b < 60 {
                                                // We've got punctuation
                                                match source_again.check_available() {
                                                    Space::Full(src_consumed_again) => {
                                                        return (DecoderResult::InputEmpty,
                                                                src_consumed_again,
                                                                dest_again_again.written());
                                                    }
                                                    Space::Available(source_handle_again) => {
                                                        match dest_again_again.$destination_check() {
                                                            Space::Full(dst_written_again) => {
                                                                return (DecoderResult::OutputFull,
                                                                        source_handle_again.consumed(),
                                                                        dst_written_again);
                                                            }
                                                            Space::Available(destination_handle_again) => {
                                                                {
                                                                    let (b_again, _unread_handle_again) =
                                                                        source_handle_again.read();
                                                                    b = b_again;
                                                                    destination_handle = destination_handle_again;
                                                                    continue 'innermost;
                                                                }
                                                            }
                                                        }
                                                    }
                                                }
                                            }
                                            // We've got markup or ASCII text
                                            continue $outermost;
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
    });
}
| |
/// Expands `ascii_compatible_two_byte_decoder_function!` twice with the same
/// arguments: once as `decode_to_utf8_raw` (`u8`, `Utf8Destination`) and once
/// as `decode_to_utf16_raw` (`u16`, `Utf16Destination`).
macro_rules! ascii_compatible_two_byte_decoder_functions {
    (
        $lead:block,
        $trail:block,
        $slf:ident,
        $non_ascii:ident,
        $byte:ident,
        $lead_minus_offset:ident,
        $unread_handle_trail:ident,
        $source:ident,
        $handle:ident,
        $outermost:tt,
        $copy_ascii:ident,
        $destination_check:ident,
        $ascii_punctuation:expr
    ) => {
        // UTF-8 output
        ascii_compatible_two_byte_decoder_function!(
            $lead, $trail,
            $slf, $non_ascii, $byte, $lead_minus_offset, $unread_handle_trail,
            $source, $handle, $outermost, $copy_ascii, $destination_check,
            decode_to_utf8_raw, u8, Utf8Destination,
            $ascii_punctuation
        );
        // UTF-16 output
        ascii_compatible_two_byte_decoder_function!(
            $lead, $trail,
            $slf, $non_ascii, $byte, $lead_minus_offset, $unread_handle_trail,
            $source, $handle, $outermost, $copy_ascii, $destination_check,
            decode_to_utf16_raw, u16, Utf16Destination,
            $ascii_punctuation
        );
    };
}
| |
/// Expands to one gb18030 decoder method, `$name`, with the signature
/// `pub fn $name(&mut $slf, src: &[u8], dst: &mut [$code_unit], last: bool)
/// -> (DecoderResult, usize, usize)`.
///
/// The expansion has three phases:
/// 1. A byte held in `$slf.pending_ascii` is written first; if
///    `check_space_bmp` reports no room the method returns
///    `(DecoderResult::OutputFull, 0, 0)` and leaves it pending.
/// 2. While `$slf.pending` is not none, one input byte at a time is fed to
///    the state stored there (`Gb18030Pending::One` / `Two` / `Three`).
/// 3. The main loop copies ASCII with `copy_ascii_from_check_space_astral`
///    and handles each non-ASCII sequence inline, storing the bytes seen so
///    far in `$slf.pending` when the input ends mid-sequence and `last` is
///    `false`.
///
/// `$first_body` and `$third_body` are evaluated for their value (bound to
/// `$first_minus_offset` / `$third_minus_offset`). `$second_body` and
/// `$fourth_body` are used in destination position. A second byte whose
/// value minus 0x30 is at most 9 selects the four-byte path, otherwise
/// `$second_body` is used. `$fourth_minus_offset` is accepted but not
/// referenced by the expansion itself.
macro_rules! gb18030_decoder_function {
    ($first_body:block,
     $second_body:block,
     $third_body:block,
     $fourth_body:block,
     $slf:ident,
     $non_ascii:ident,
     $first_minus_offset:ident,
     $second:ident,
     $second_minus_offset:ident,
     $unread_handle_second:ident,
     $third:ident,
     $third_minus_offset:ident,
     $unread_handle_third:ident,
     $fourth:ident,
     $fourth_minus_offset:ident,
     $unread_handle_fourth:ident,
     $source:ident,
     $handle:ident,
     $outermost:tt,
     $name:ident,
     $code_unit:ty,
     $dest_struct:ident) => (
    #[cfg_attr(feature = "cargo-clippy", allow(never_loop))]
    pub fn $name(&mut $slf,
                 src: &[u8],
                 dst: &mut [$code_unit],
                 last: bool)
                 -> (DecoderResult, usize, usize) {
        let mut $source = ByteSource::new(src);
        let mut dest = $dest_struct::new(dst);
        {
            // Phase 1: flush an ASCII byte left over from an earlier call.
            if let Some(ascii) = $slf.pending_ascii {
                match dest.check_space_bmp() {
                    Space::Full(_) => {
                        return (DecoderResult::OutputFull, 0, 0);
                    }
                    Space::Available(pending_ascii_handle) => {
                        $slf.pending_ascii = None;
                        pending_ascii_handle.write_ascii(ascii);
                    }
                }
            }
        }
        // Phase 2: finish a sequence that was split across calls.
        while !$slf.pending.is_none() {
            let pending_input = match $source.check_available() {
                Space::Full(consumed) => {
                    if last {
                        // Caller-independent end-of-stream handling
                        let count = $slf.pending.count();
                        $slf.pending = Gb18030Pending::None;
                        return (DecoderResult::Malformed(count as u8, 0),
                                consumed,
                                dest.written());
                    }
                    return (DecoderResult::InputEmpty, consumed, dest.written());
                }
                Space::Available(available) => available,
            };
            let $handle = match dest.check_space_astral() {
                Space::Full(written) => {
                    return (DecoderResult::OutputFull,
                            pending_input.consumed(),
                            written);
                }
                Space::Available(available) => available,
            };
            let (pending_byte, pending_unread) = pending_input.read();
            match $slf.pending {
                Gb18030Pending::One($first_minus_offset) => {
                    $slf.pending = Gb18030Pending::None;
                    let $second = pending_byte;
                    let $unread_handle_second = pending_unread;
                    // If second is between 0x40 and 0x7E,
                    // inclusive, subtract offset 0x40. Else if
                    // second is between 0x80 and 0xFE, inclusive,
                    // subtract offset 0x41. In both cases,
                    // handle as a two-byte sequence.
                    // Else if second is between 0x30 and 0x39,
                    // inclusive, subtract offset 0x30 and
                    // handle as a four-byte sequence.
                    let $second_minus_offset = $second.wrapping_sub(0x30);
                    // It's not optimal to do this check first,
                    // but this results in more readable code.
                    if $second_minus_offset > (0x39 - 0x30) {
                        // Caller-supplied code
                        $second_body
                    } else {
                        // Four-byte!
                        $slf.pending = Gb18030Pending::Two($first_minus_offset,
                                                           $second_minus_offset);
                        $handle.commit()
                    }
                }
                Gb18030Pending::Two($first_minus_offset, $second_minus_offset) => {
                    $slf.pending = Gb18030Pending::None;
                    let $third = pending_byte;
                    let $unread_handle_third = pending_unread;
                    let $third_minus_offset = {
                        // Caller-supplied code
                        $third_body
                    };
                    $slf.pending = Gb18030Pending::Three($first_minus_offset,
                                                         $second_minus_offset,
                                                         $third_minus_offset);
                    $handle.commit()
                }
                Gb18030Pending::Three($first_minus_offset,
                                      $second_minus_offset,
                                      $third_minus_offset) => {
                    $slf.pending = Gb18030Pending::None;
                    let $fourth = pending_byte;
                    let $unread_handle_fourth = pending_unread;
                    // Caller-supplied code
                    $fourth_body
                }
                Gb18030Pending::None => unreachable!("Checked in loop condition"),
            };
        }
        // Phase 3: the main loop.
        $outermost: loop {
            match dest.copy_ascii_from_check_space_astral(&mut $source) {
                CopyAsciiResult::Stop(outcome) => return outcome,
                CopyAsciiResult::GoOn((mut $non_ascii, mut $handle)) => {
                    'middle: loop {
                        let dest_again = {
                            let $first_minus_offset = {
                                // Caller-supplied code
                                $first_body
                            };
                            match $source.check_available() {
                                Space::Full(consumed_after_first) => {
                                    if last {
                                        return (DecoderResult::Malformed(1, 0),
                                                consumed_after_first,
                                                $handle.written());
                                    }
                                    $slf.pending = Gb18030Pending::One($first_minus_offset);
                                    return (DecoderResult::InputEmpty,
                                            consumed_after_first,
                                            $handle.written());
                                }
                                Space::Available(second_source_handle) => {
                                    let ($second, $unread_handle_second) = second_source_handle.read();
                                    // If second is between 0x40 and 0x7E,
                                    // inclusive, subtract offset 0x40. Else if
                                    // second is between 0x80 and 0xFE, inclusive,
                                    // subtract offset 0x41. In both cases,
                                    // handle as a two-byte sequence.
                                    // Else if second is between 0x30 and 0x39,
                                    // inclusive, subtract offset 0x30 and
                                    // handle as a four-byte sequence.
                                    let $second_minus_offset = $second.wrapping_sub(0x30);
                                    // It's not optimal to do this check first,
                                    // but this results in more readable code.
                                    if $second_minus_offset > (0x39 - 0x30) {
                                        // Caller-supplied code
                                        $second_body
                                    } else {
                                        // Four-byte!
                                        match $unread_handle_second.commit().check_available() {
                                            Space::Full(consumed_after_second) => {
                                                if last {
                                                    return (DecoderResult::Malformed(2, 0),
                                                            consumed_after_second,
                                                            $handle.written());
                                                }
                                                $slf.pending =
                                                    Gb18030Pending::Two($first_minus_offset,
                                                                        $second_minus_offset);
                                                return (DecoderResult::InputEmpty,
                                                        consumed_after_second,
                                                        $handle.written());
                                            }
                                            Space::Available(third_source_handle) => {
                                                let ($third, $unread_handle_third) =
                                                    third_source_handle.read();
                                                let $third_minus_offset = {
                                                    // Caller-supplied code
                                                    $third_body
                                                };
                                                match $unread_handle_third.commit()
                                                                          .check_available() {
                                                    Space::Full(consumed_after_third) => {
                                                        if last {
                                                            return (DecoderResult::Malformed(3, 0),
                                                                    consumed_after_third,
                                                                    $handle.written());
                                                        }
                                                        $slf.pending = Gb18030Pending::Three($first_minus_offset, $second_minus_offset, $third_minus_offset);
                                                        return (DecoderResult::InputEmpty,
                                                                consumed_after_third,
                                                                $handle.written());
                                                    }
                                                    Space::Available(fourth_source_handle) => {
                                                        let ($fourth, $unread_handle_fourth) =
                                                            fourth_source_handle.read();
                                                        // Caller-supplied code
                                                        $fourth_body
                                                    }
                                                }
                                            }
                                        }
                                    }
                                }
                            }
                        };
                        match $source.check_available() {
                            Space::Full(consumed) => {
                                return (DecoderResult::InputEmpty,
                                        consumed,
                                        dest_again.written());
                            }
                            Space::Available(next_source_handle) => {
                                match dest_again.check_space_astral() {
                                    Space::Full(written) => {
                                        return (DecoderResult::OutputFull,
                                                next_source_handle.consumed(),
                                                written);
                                    }
                                    Space::Available(next_destination_handle) => {
                                        let (next_byte, _) = next_source_handle.read();
                                        loop {
                                            if next_byte > 127 {
                                                // Start of another non-ASCII sequence.
                                                $non_ascii = next_byte;
                                                $handle = next_destination_handle;
                                                continue 'middle;
                                            }
                                            // Testing on Haswell says that we should write the
                                            // byte unconditionally instead of trying to unread it
                                            // to make it part of the next SIMD stride.
                                            next_destination_handle.write_ascii(next_byte);
                                            // We've got markup or ASCII text
                                            continue $outermost;
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
    });
}
| |
/// Expands `gb18030_decoder_function!` twice with the same arguments: once as
/// `decode_to_utf8_raw` (`u8`, `Utf8Destination`) and once as
/// `decode_to_utf16_raw` (`u16`, `Utf16Destination`).
macro_rules! gb18030_decoder_functions {
    (
        $first_body:block,
        $second_body:block,
        $third_body:block,
        $fourth_body:block,
        $slf:ident,
        $non_ascii:ident,
        $first_minus_offset:ident,
        $second:ident,
        $second_minus_offset:ident,
        $unread_handle_second:ident,
        $third:ident,
        $third_minus_offset:ident,
        $unread_handle_third:ident,
        $fourth:ident,
        $fourth_minus_offset:ident,
        $unread_handle_fourth:ident,
        $source:ident,
        $handle:ident,
        $outermost:tt
    ) => {
        // UTF-8 output
        gb18030_decoder_function!(
            $first_body, $second_body, $third_body, $fourth_body,
            $slf, $non_ascii, $first_minus_offset,
            $second, $second_minus_offset, $unread_handle_second,
            $third, $third_minus_offset, $unread_handle_third,
            $fourth, $fourth_minus_offset, $unread_handle_fourth,
            $source, $handle, $outermost,
            decode_to_utf8_raw, u8, Utf8Destination
        );
        // UTF-16 output
        gb18030_decoder_function!(
            $first_body, $second_body, $third_body, $fourth_body,
            $slf, $non_ascii, $first_minus_offset,
            $second, $second_minus_offset, $unread_handle_second,
            $third, $third_minus_offset, $unread_handle_third,
            $fourth, $fourth_minus_offset, $unread_handle_fourth,
            $source, $handle, $outermost,
            decode_to_utf16_raw, u16, Utf16Destination
        );
    };
}
| |
/// Expands to one EUC-JP decoder method, `$name`, with the signature
/// `pub fn $name(&mut $slf, src: &[u8], dst: &mut [$code_unit], last: bool)
/// -> (DecoderResult, usize, usize)`.
///
/// The expansion first drains the state kept in `$slf.pending`
/// (`EucJpPending::Jis0208Lead`, `Jis0212Shift`, `Jis0212Lead` or
/// `HalfWidthKatakana`) one input byte at a time, then enters the main loop,
/// which copies ASCII with `copy_ascii_from_check_space_bmp` and dispatches
/// on each non-ASCII byte:
/// * 0xA1..=0xFE: a JIS 0208 lead; the next byte goes to
///   `$jis0208_trail_body`.
/// * 0x8F: the next byte goes to `$jis0212_lead_body` (whose value is bound
///   to `$jis0212_lead_minus_offset`) and the one after that to
///   `$jis0212_trail_body`.
/// * 0x8E: the next byte goes to `$half_width_katakana_body`.
/// * anything else: the method returns `DecoderResult::Malformed(1, 0)`.
///
/// When the input ends inside a sequence and `last` is `false`, the
/// progress so far is stored in `$slf.pending` and `InputEmpty` is returned.
macro_rules! euc_jp_decoder_function {
    ($jis0208_trail_body:block,
     $jis0212_lead_body:block,
     $jis0212_trail_body:block,
     $half_width_katakana_body:block,
     $slf:ident,
     $non_ascii:ident,
     $jis0208_lead_minus_offset:ident,
     $byte:ident,
     $unread_handle_trail:ident,
     $jis0212_lead_minus_offset:ident,
     $lead:ident,
     $unread_handle_jis0212:ident,
     $source:ident,
     $handle:ident,
     $name:ident,
     $code_unit:ty,
     $dest_struct:ident) => (
    #[cfg_attr(feature = "cargo-clippy", allow(never_loop))]
    pub fn $name(&mut $slf,
                 src: &[u8],
                 dst: &mut [$code_unit],
                 last: bool)
                 -> (DecoderResult, usize, usize) {
        let mut $source = ByteSource::new(src);
        let mut dest = $dest_struct::new(dst);
        // Finish a sequence that was split across calls.
        while !$slf.pending.is_none() {
            let pending_input = match $source.check_available() {
                Space::Full(consumed) => {
                    if last {
                        // Caller-independent end-of-stream handling
                        let count = $slf.pending.count();
                        $slf.pending = EucJpPending::None;
                        return (DecoderResult::Malformed(count as u8, 0),
                                consumed,
                                dest.written());
                    }
                    return (DecoderResult::InputEmpty, consumed, dest.written());
                }
                Space::Available(available) => available,
            };
            let $handle = match dest.check_space_bmp() {
                Space::Full(written) => {
                    return (DecoderResult::OutputFull,
                            pending_input.consumed(),
                            written);
                }
                Space::Available(available) => available,
            };
            let ($byte, $unread_handle_trail) = pending_input.read();
            match $slf.pending {
                EucJpPending::Jis0208Lead($jis0208_lead_minus_offset) => {
                    $slf.pending = EucJpPending::None;
                    // Caller-supplied code
                    $jis0208_trail_body
                }
                EucJpPending::Jis0212Shift => {
                    $slf.pending = EucJpPending::None;
                    let $lead = $byte;
                    let $unread_handle_jis0212 = $unread_handle_trail;
                    let $jis0212_lead_minus_offset = {
                        // Caller-supplied code
                        $jis0212_lead_body
                    };
                    $slf.pending =
                        EucJpPending::Jis0212Lead($jis0212_lead_minus_offset);
                    $handle.commit()
                }
                EucJpPending::Jis0212Lead($jis0212_lead_minus_offset) => {
                    $slf.pending = EucJpPending::None;
                    // Caller-supplied code
                    $jis0212_trail_body
                }
                EucJpPending::HalfWidthKatakana => {
                    $slf.pending = EucJpPending::None;
                    // Caller-supplied code
                    $half_width_katakana_body
                }
                EucJpPending::None => unreachable!("Checked in loop condition"),
            };
        }
        'outermost: loop {
            match dest.copy_ascii_from_check_space_bmp(&mut $source) {
                CopyAsciiResult::Stop(outcome) => return outcome,
                CopyAsciiResult::GoOn((mut $non_ascii, mut $handle)) => {
                    'middle: loop {
                        let dest_again = {
                            // If lead is between 0xA1 and 0xFE, inclusive,
                            // subtract 0xA1. Else if lead is 0x8E, handle the
                            // next byte as half-width Katakana. Else if lead is
                            // 0x8F, expect JIS 0212.
                            let $jis0208_lead_minus_offset = $non_ascii.wrapping_sub(0xA1);
                            if $jis0208_lead_minus_offset <= (0xFE - 0xA1) {
                                // JIS 0208
                                match $source.check_available() {
                                    Space::Full(consumed_after_lead) => {
                                        if last {
                                            return (DecoderResult::Malformed(1, 0),
                                                    consumed_after_lead,
                                                    $handle.written());
                                        }
                                        $slf.pending =
                                            EucJpPending::Jis0208Lead($jis0208_lead_minus_offset);
                                        return (DecoderResult::InputEmpty,
                                                consumed_after_lead,
                                                $handle.written());
                                    }
                                    Space::Available(trail_source_handle) => {
                                        let ($byte, $unread_handle_trail) =
                                            trail_source_handle.read();
                                        // Caller-supplied code
                                        $jis0208_trail_body
                                    }
                                }
                            } else if $non_ascii == 0x8F {
                                // JIS 0212 shift byte
                                match $source.check_available() {
                                    Space::Full(consumed_after_shift) => {
                                        if last {
                                            return (DecoderResult::Malformed(1, 0),
                                                    consumed_after_shift,
                                                    $handle.written());
                                        }
                                        $slf.pending = EucJpPending::Jis0212Shift;
                                        return (DecoderResult::InputEmpty,
                                                consumed_after_shift,
                                                $handle.written());
                                    }
                                    Space::Available(jis0212_source_handle) => {
                                        let ($lead, $unread_handle_jis0212) =
                                            jis0212_source_handle.read();
                                        let $jis0212_lead_minus_offset = {
                                            // Caller-supplied code
                                            $jis0212_lead_body
                                        };
                                        match $unread_handle_jis0212.commit().check_available() {
                                            Space::Full(consumed_after_lead) => {
                                                if last {
                                                    return (DecoderResult::Malformed(2, 0),
                                                            consumed_after_lead,
                                                            $handle.written());
                                                }
                                                $slf.pending = EucJpPending::Jis0212Lead($jis0212_lead_minus_offset);
                                                return (DecoderResult::InputEmpty,
                                                        consumed_after_lead,
                                                        $handle.written());
                                            }
                                            Space::Available(trail_source_handle) => {
                                                let ($byte, $unread_handle_trail) =
                                                    trail_source_handle.read();
                                                // Caller-supplied code
                                                $jis0212_trail_body
                                            }
                                        }
                                    }
                                }
                            } else if $non_ascii == 0x8E {
                                // Half-width Katakana prefix
                                match $source.check_available() {
                                    Space::Full(consumed_after_prefix) => {
                                        if last {
                                            return (DecoderResult::Malformed(1, 0),
                                                    consumed_after_prefix,
                                                    $handle.written());
                                        }
                                        $slf.pending = EucJpPending::HalfWidthKatakana;
                                        return (DecoderResult::InputEmpty,
                                                consumed_after_prefix,
                                                $handle.written());
                                    }
                                    Space::Available(trail_source_handle) => {
                                        let ($byte, $unread_handle_trail) =
                                            trail_source_handle.read();
                                        // Caller-supplied code
                                        $half_width_katakana_body
                                    }
                                }
                            } else {
                                // Not a valid first byte of any sequence.
                                return (DecoderResult::Malformed(1, 0),
                                        $source.consumed(),
                                        $handle.written());
                            }
                        };
                        match $source.check_available() {
                            Space::Full(consumed) => {
                                return (DecoderResult::InputEmpty,
                                        consumed,
                                        dest_again.written());
                            }
                            Space::Available(next_source_handle) => {
                                match dest_again.check_space_bmp() {
                                    Space::Full(written) => {
                                        return (DecoderResult::OutputFull,
                                                next_source_handle.consumed(),
                                                written);
                                    }
                                    Space::Available(next_destination_handle) => {
                                        let (next_byte, _) = next_source_handle.read();
                                        loop {
                                            if next_byte > 127 {
                                                // Start of another non-ASCII sequence.
                                                $non_ascii = next_byte;
                                                $handle = next_destination_handle;
                                                continue 'middle;
                                            }
                                            // Testing on Haswell says that we should write the
                                            // byte unconditionally instead of trying to unread it
                                            // to make it part of the next SIMD stride.
                                            next_destination_handle.write_ascii(next_byte);
                                            // We've got markup or ASCII text
                                            continue 'outermost;
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
    });
}
| |
/// Expands `euc_jp_decoder_function!` twice with the same arguments: once as
/// `decode_to_utf8_raw` (`u8`, `Utf8Destination`) and once as
/// `decode_to_utf16_raw` (`u16`, `Utf16Destination`).
macro_rules! euc_jp_decoder_functions {
    (
        $jis0208_trail_body:block,
        $jis0212_lead_body:block,
        $jis0212_trail_body:block,
        $half_width_katakana_body:block,
        $slf:ident,
        $non_ascii:ident,
        $jis0208_lead_minus_offset:ident,
        $byte:ident,
        $unread_handle_trail:ident,
        $jis0212_lead_minus_offset:ident,
        $lead:ident,
        $unread_handle_jis0212:ident,
        $source:ident,
        $handle:ident
    ) => {
        // UTF-8 output
        euc_jp_decoder_function!(
            $jis0208_trail_body, $jis0212_lead_body,
            $jis0212_trail_body, $half_width_katakana_body,
            $slf, $non_ascii, $jis0208_lead_minus_offset,
            $byte, $unread_handle_trail,
            $jis0212_lead_minus_offset, $lead, $unread_handle_jis0212,
            $source, $handle,
            decode_to_utf8_raw, u8, Utf8Destination
        );
        // UTF-16 output
        euc_jp_decoder_function!(
            $jis0208_trail_body, $jis0212_lead_body,
            $jis0212_trail_body, $half_width_katakana_body,
            $slf, $non_ascii, $jis0208_lead_minus_offset,
            $byte, $unread_handle_trail,
            $jis0212_lead_minus_offset, $lead, $unread_handle_jis0212,
            $source, $handle,
            decode_to_utf16_raw, u16, Utf16Destination
        );
    };
}
| |
/// Expands to one encoder method, `$name`, that processes `src` one unit at
/// a time and writes bytes into `dst`.
///
/// The generated method is
/// `pub fn $name(&mut $slf, src: &$input, dst: &mut [u8], last: bool)
/// -> (EncoderResult, usize, usize)`. `src` is wrapped in `$source_struct`
/// and `dst` in `ByteDestination`.
///
/// Caller-supplied pieces:
/// * `$eof` is expanded when the source has nothing left and `last` is
///   `true`, with `$src_consumed` bound; if it does not return, the method
///   returns `EncoderResult::InputEmpty`.
/// * `$destination_check` names the method called on `$dest` to obtain
///   `$destination_handle`; `Space::Full` makes the method return
///   `EncoderResult::OutputFull` before anything is read.
/// * `$body` is expanded once per unit read, with `$c` and `$unread_handle`
///   bound to the result of reading the source handle.
macro_rules! encoder_function {
    ($eof:block,
     $body:block,
     $slf:ident,
     $src_consumed:ident,
     $source:ident,
     $dest:ident,
     $c:ident,
     $destination_handle:ident,
     $unread_handle:ident,
     $destination_check:ident,
     $name:ident,
     $input:ty,
     $source_struct:ident) => (
    pub fn $name(&mut $slf,
                 src: &$input,
                 dst: &mut [u8],
                 last: bool)
                 -> (EncoderResult, usize, usize) {
        let mut $source = $source_struct::new(src);
        let mut $dest = ByteDestination::new(dst);
        loop {
            // Input first: bail out when there is nothing to read.
            let readable = match $source.check_available() {
                Space::Full($src_consumed) => {
                    if last {
                        // Caller-supplied code
                        $eof
                    }
                    return (EncoderResult::InputEmpty, $src_consumed, $dest.written());
                }
                Space::Available(available) => available,
            };
            // Output second: nothing is read unless there is room.
            match $dest.$destination_check() {
                Space::Full(dst_written) => {
                    return (EncoderResult::OutputFull,
                            readable.consumed(),
                            dst_written);
                }
                Space::Available($destination_handle) => {
                    let ($c, $unread_handle) = readable.read();
                    // Caller-supplied code
                    $body
                }
            }
        }
    });
}
| |
/// Expands `encoder_function!` twice with the same arguments: once as
/// `encode_from_utf8_raw` (`str` input through `Utf8Source`) and once as
/// `encode_from_utf16_raw` (`[u16]` input through `Utf16Source`).
macro_rules! encoder_functions {
    (
        $eof:block,
        $body:block,
        $slf:ident,
        $src_consumed:ident,
        $source:ident,
        $dest:ident,
        $c:ident,
        $destination_handle:ident,
        $unread_handle:ident,
        $destination_check:ident
    ) => {
        // UTF-8 input
        encoder_function!(
            $eof, $body,
            $slf, $src_consumed, $source, $dest, $c,
            $destination_handle, $unread_handle, $destination_check,
            encode_from_utf8_raw, str, Utf8Source
        );
        // UTF-16 input
        encoder_function!(
            $eof, $body,
            $slf, $src_consumed, $source, $dest, $c,
            $destination_handle, $unread_handle, $destination_check,
            encode_from_utf16_raw, [u16], Utf16Source
        );
    };
}
| |
/// Expands to one encoder method, `$name`, for an ASCII-compatible encoding.
///
/// The generated method is
/// `pub fn $name(&mut $slf, src: &$input, dst: &mut [u8], _last: bool)
/// -> (EncoderResult, usize, usize)`; the `_last` argument is not used by
/// the expansion.
///
/// The outer loop copies ASCII via `$source.$copy_ascii(&mut dest)`. For
/// each non-ASCII unit, `$bmp_body` is expanded with `$bmp` bound for
/// `NonAscii::BmpExclAscii` and `$astral_body` with `$astral` bound for
/// `NonAscii::Astral`; in both cases `$handle` is the destination handle and
/// the block's value is used as the destination afterwards. The unit that
/// follows is read directly: a non-ASCII one goes around the middle loop
/// again, an ASCII one is written with `write_one`. When
/// `$ascii_punctuation` is true and that ASCII value is below 60, the next
/// unit is handled in the innermost loop as well; otherwise control returns
/// to the ASCII copy.
macro_rules! ascii_compatible_encoder_function {
    ($bmp_body:block,
     $astral_body:block,
     $bmp:ident,
     $astral:ident,
     $slf:ident,
     $source:ident,
     $handle:ident,
     $copy_ascii:ident,
     $destination_check:ident,
     $name:ident,
     $input:ty,
     $source_struct:ident,
     $ascii_punctuation:expr) => (
    pub fn $name(&mut $slf,
                 src: &$input,
                 dst: &mut [u8],
                 _last: bool)
                 -> (EncoderResult, usize, usize) {
        let mut $source = $source_struct::new(src);
        let mut dest = ByteDestination::new(dst);
        'outermost: loop {
            match $source.$copy_ascii(&mut dest) {
                CopyAsciiResult::Stop(outcome) => return outcome,
                CopyAsciiResult::GoOn((mut pending_non_ascii, mut $handle)) => {
                    'middle: loop {
                        let dest_again = match pending_non_ascii {
                            NonAscii::BmpExclAscii($bmp) => {
                                // Caller-supplied code
                                $bmp_body
                            }
                            NonAscii::Astral($astral) => {
                                // Caller-supplied code
                                $astral_body
                            }
                        };
                        match $source.check_available() {
                            Space::Full(consumed) => {
                                return (EncoderResult::InputEmpty,
                                        consumed,
                                        dest_again.written());
                            }
                            Space::Available(next_source_handle) => {
                                match dest_again.$destination_check() {
                                    Space::Full(written) => {
                                        return (EncoderResult::OutputFull,
                                                next_source_handle.consumed(),
                                                written);
                                    }
                                    Space::Available(mut next_destination_handle) => {
                                        let (mut unit, next_unread_handle) =
                                            next_source_handle.read_enum();
                                        let source_again = next_unread_handle.commit();
                                        'innermost: loop {
                                            let ascii = match unit {
                                                Unicode::NonAscii(non_ascii_again) => {
                                                    // Another non-ASCII unit: go around
                                                    // the middle loop again.
                                                    pending_non_ascii = non_ascii_again;
                                                    $handle = next_destination_handle;
                                                    continue 'middle;
                                                }
                                                Unicode::Ascii(a) => a,
                                            };
                                            // Testing on Haswell says that we should write the
                                            // byte unconditionally instead of trying to unread it
                                            // to make it part of the next SIMD stride.
                                            let dest_again_again =
                                                next_destination_handle.write_one(ascii);
                                            if $ascii_punctuation && ascii < 60 {
                                                // We've got punctuation
                                                match source_again.check_available() {
                                                    Space::Full(consumed_again) => {
                                                        return (EncoderResult::InputEmpty,
                                                                consumed_again,
                                                                dest_again_again.written());
                                                    }
                                                    Space::Available(source_handle_again) => {
                                                        match dest_again_again.$destination_check() {
                                                            Space::Full(written_again) => {
                                                                return (EncoderResult::OutputFull,
                                                                        source_handle_again.consumed(),
                                                                        written_again);
                                                            }
                                                            Space::Available(destination_handle_again) => {
                                                                {
                                                                    let (unit_again, _unread_handle_again) =
                                                                        source_handle_again.read_enum();
                                                                    unit = unit_again;
                                                                    next_destination_handle = destination_handle_again;
                                                                    continue 'innermost;
                                                                }
                                                            }
                                                        }
                                                    }
                                                }
                                            }
                                            // We've got markup or ASCII text
                                            continue 'outermost;
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
    });
}
| |
/// Expands `ascii_compatible_encoder_function!` twice with the same
/// arguments: once as `encode_from_utf8_raw` (`str`, `Utf8Source`) and once
/// as `encode_from_utf16_raw` (`[u16]`, `Utf16Source`).
macro_rules! ascii_compatible_encoder_functions {
    (
        $bmp_body:block,
        $astral_body:block,
        $bmp:ident,
        $astral:ident,
        $slf:ident,
        $source:ident,
        $handle:ident,
        $copy_ascii:ident,
        $destination_check:ident,
        $ascii_punctuation:expr
    ) => {
        // UTF-8 input
        ascii_compatible_encoder_function!(
            $bmp_body, $astral_body, $bmp, $astral,
            $slf, $source, $handle, $copy_ascii, $destination_check,
            encode_from_utf8_raw, str, Utf8Source,
            $ascii_punctuation
        );
        // UTF-16 input
        ascii_compatible_encoder_function!(
            $bmp_body, $astral_body, $bmp, $astral,
            $slf, $source, $handle, $copy_ascii, $destination_check,
            encode_from_utf16_raw, [u16], Utf16Source,
            $ascii_punctuation
        );
    };
}
| |
/// Single-function variant of `ascii_compatible_encoder_function!` for
/// callers that only supply BMP handling.
///
/// The astral arm is filled in here: it returns
/// `EncoderResult::Unmappable(astral)` together with `$source.consumed()`
/// and `$handle.written()`. All other arguments are forwarded unchanged.
macro_rules! ascii_compatible_bmp_encoder_function {
    (
        $bmp_body:block,
        $bmp:ident,
        $slf:ident,
        $source:ident,
        $handle:ident,
        $copy_ascii:ident,
        $destination_check:ident,
        $name:ident,
        $input:ty,
        $source_struct:ident,
        $ascii_punctuation:expr
    ) => {
        ascii_compatible_encoder_function!(
            $bmp_body,
            {
                // Fixed astral handling: report the scalar as unmappable.
                return (
                    EncoderResult::Unmappable(astral),
                    $source.consumed(),
                    $handle.written(),
                );
            },
            $bmp, astral,
            $slf, $source, $handle, $copy_ascii, $destination_check,
            $name, $input, $source_struct,
            $ascii_punctuation
        );
    };
}
| |
/// Two-function variant of `ascii_compatible_encoder_functions!` for callers
/// that only supply BMP handling.
///
/// The astral arm is filled in here: it returns
/// `EncoderResult::Unmappable(astral)` together with `$source.consumed()`
/// and `$handle.written()`. All other arguments are forwarded unchanged.
macro_rules! ascii_compatible_bmp_encoder_functions {
    (
        $bmp_body:block,
        $bmp:ident,
        $slf:ident,
        $source:ident,
        $handle:ident,
        $copy_ascii:ident,
        $destination_check:ident,
        $ascii_punctuation:expr
    ) => {
        ascii_compatible_encoder_functions!(
            $bmp_body,
            {
                // Fixed astral handling: report the scalar as unmappable.
                return (
                    EncoderResult::Unmappable(astral),
                    $source.consumed(),
                    $handle.written(),
                );
            },
            $bmp, astral,
            $slf, $source, $handle, $copy_ascii, $destination_check,
            $ascii_punctuation
        );
    };
}
| |
// Generates one public decode method plus the four private helpers it
// depends on, for a single destination code unit type.
//
// Arguments:
// * `$(#[$meta])*` - attributes placed on the generated public method.
// * `$decode_to_utf` - name of the generated public method.
// * `$decode_to_utf_raw` - method invoked on `self.variant` to perform the
//   actual conversion.
// * `$decode_to_utf_checking_end`,
//   `$decode_to_utf_after_one_potential_bom_byte`,
//   `$decode_to_utf_after_two_potential_bom_bytes`,
//   `$decode_to_utf_checking_end_with_offset` - names of the generated
//   private helpers.
// * `$code_unit` - element type of the destination slice.
//
// The public method drives a small state machine over `self.life_cycle`
// that looks for the byte sequences EF BB BF, FE FF and FF FE at the start
// of the stream, possibly split across several calls, before handing the
// input over to `self.variant`.
macro_rules! public_decode_function {
    ($(#[$meta:meta])*,
     $decode_to_utf:ident,
     $decode_to_utf_raw:ident,
     $decode_to_utf_checking_end:ident,
     $decode_to_utf_after_one_potential_bom_byte:ident,
     $decode_to_utf_after_two_potential_bom_bytes:ident,
     $decode_to_utf_checking_end_with_offset:ident,
     $code_unit:ty) => (
    $(#[$meta])*
    pub fn $decode_to_utf(&mut self,
                          src: &[u8],
                          dst: &mut [$code_unit],
                          last: bool)
                          -> (DecoderResult, usize, usize) {
        // Count of leading bytes of `src` that the sniffing states below
        // have already looked at during this call.
        let mut offset = 0usize;
        // Every arm either returns or updates the state and falls through
        // to the next iteration.
        loop {
            match self.life_cycle {
                // Sniffing is over: the common path.
                DecoderLifeCycle::Converting => {
                    return self.$decode_to_utf_checking_end(src, dst, last);
                }
                // Everything below is BOM sniffing.
                DecoderLifeCycle::AtStart => {
                    debug_assert_eq!(offset, 0usize);
                    match src.first() {
                        None => return (DecoderResult::InputEmpty, 0, 0),
                        Some(&0xEFu8) => {
                            self.life_cycle = DecoderLifeCycle::SeenUtf8First;
                            offset += 1;
                        }
                        Some(&0xFEu8) => {
                            self.life_cycle = DecoderLifeCycle::SeenUtf16BeFirst;
                            offset += 1;
                        }
                        Some(&0xFFu8) => {
                            self.life_cycle = DecoderLifeCycle::SeenUtf16LeFirst;
                            offset += 1;
                        }
                        Some(_) => {
                            self.life_cycle = DecoderLifeCycle::Converting;
                        }
                    }
                }
                DecoderLifeCycle::AtUtf8Start => {
                    debug_assert_eq!(offset, 0usize);
                    match src.first() {
                        None => return (DecoderResult::InputEmpty, 0, 0),
                        Some(&0xEFu8) => {
                            self.life_cycle = DecoderLifeCycle::SeenUtf8First;
                            offset += 1;
                        }
                        Some(_) => {
                            self.life_cycle = DecoderLifeCycle::Converting;
                        }
                    }
                }
                DecoderLifeCycle::AtUtf16BeStart => {
                    debug_assert_eq!(offset, 0usize);
                    match src.first() {
                        None => return (DecoderResult::InputEmpty, 0, 0),
                        Some(&0xFEu8) => {
                            self.life_cycle = DecoderLifeCycle::SeenUtf16BeFirst;
                            offset += 1;
                        }
                        Some(_) => {
                            self.life_cycle = DecoderLifeCycle::Converting;
                        }
                    }
                }
                DecoderLifeCycle::AtUtf16LeStart => {
                    debug_assert_eq!(offset, 0usize);
                    match src.first() {
                        None => return (DecoderResult::InputEmpty, 0, 0),
                        Some(&0xFFu8) => {
                            self.life_cycle = DecoderLifeCycle::SeenUtf16LeFirst;
                            offset += 1;
                        }
                        Some(_) => {
                            self.life_cycle = DecoderLifeCycle::Converting;
                        }
                    }
                }
                DecoderLifeCycle::SeenUtf8First => {
                    match src.get(offset) {
                        // EF BB so far; one more byte to check.
                        Some(&0xBBu8) => {
                            self.life_cycle = DecoderLifeCycle::SeenUtf8Second;
                            offset += 1;
                        }
                        // Out of input but more may follow: stay in this
                        // state and wait for the next call.
                        None if !last => {
                            return (DecoderResult::InputEmpty, offset, 0);
                        }
                        // Either a different byte, or the stream ended:
                        // the EF seen earlier is treated as ordinary input.
                        _ => {
                            return self.$decode_to_utf_after_one_potential_bom_byte(src,
                                                                                    dst,
                                                                                    last,
                                                                                    offset,
                                                                                    0xEFu8);
                        }
                    }
                }
                DecoderLifeCycle::SeenUtf8Second => {
                    match src.get(offset) {
                        // EF BB BF is complete: switch to UTF-8 if needed
                        // and decode what follows the three bytes.
                        Some(&0xBFu8) => {
                            self.life_cycle = DecoderLifeCycle::Converting;
                            if self.encoding != UTF_8 {
                                self.encoding = UTF_8;
                                self.variant = UTF_8.new_variant_decoder();
                            }
                            return self.$decode_to_utf_checking_end_with_offset(src,
                                                                                dst,
                                                                                last,
                                                                                offset + 1);
                        }
                        None if !last => {
                            return (DecoderResult::InputEmpty, offset, 0);
                        }
                        // The EF BB seen earlier is treated as ordinary input.
                        _ => {
                            return self.$decode_to_utf_after_two_potential_bom_bytes(src,
                                                                                     dst,
                                                                                     last,
                                                                                     offset);
                        }
                    }
                }
                DecoderLifeCycle::SeenUtf16BeFirst => {
                    match src.get(offset) {
                        // FE FF is complete: switch to UTF-16BE if needed.
                        Some(&0xFFu8) => {
                            self.life_cycle = DecoderLifeCycle::Converting;
                            if self.encoding != UTF_16BE {
                                self.encoding = UTF_16BE;
                                self.variant = UTF_16BE.new_variant_decoder();
                            }
                            return self.$decode_to_utf_checking_end_with_offset(src,
                                                                                dst,
                                                                                last,
                                                                                offset + 1);
                        }
                        None if !last => {
                            return (DecoderResult::InputEmpty, offset, 0);
                        }
                        // The FE seen earlier is treated as ordinary input.
                        _ => {
                            return self.$decode_to_utf_after_one_potential_bom_byte(src,
                                                                                    dst,
                                                                                    last,
                                                                                    offset,
                                                                                    0xFEu8);
                        }
                    }
                }
                DecoderLifeCycle::SeenUtf16LeFirst => {
                    match src.get(offset) {
                        // FF FE is complete: switch to UTF-16LE if needed.
                        Some(&0xFEu8) => {
                            self.life_cycle = DecoderLifeCycle::Converting;
                            if self.encoding != UTF_16LE {
                                self.encoding = UTF_16LE;
                                self.variant = UTF_16LE.new_variant_decoder();
                            }
                            return self.$decode_to_utf_checking_end_with_offset(src,
                                                                                dst,
                                                                                last,
                                                                                offset + 1);
                        }
                        None if !last => {
                            return (DecoderResult::InputEmpty, offset, 0);
                        }
                        // The FF seen earlier is treated as ordinary input.
                        _ => {
                            return self.$decode_to_utf_after_one_potential_bom_byte(src,
                                                                                    dst,
                                                                                    last,
                                                                                    offset,
                                                                                    0xFFu8);
                        }
                    }
                }
                // A BB byte from an earlier call is still owed to the
                // delegate; it is not part of the current `src`.
                DecoderLifeCycle::ConvertingWithPendingBB => {
                    debug_assert_eq!(offset, 0usize);
                    return self.$decode_to_utf_after_one_potential_bom_byte(src,
                                                                            dst,
                                                                            last,
                                                                            0usize,
                                                                            0xBBu8);
                }
                DecoderLifeCycle::Finished => panic!("Must not use a decoder that has finished."),
            }
        }
    }

    /// Resumes normal conversion after one byte that could have started a
    /// BOM turned out not to.
    ///
    /// `offset` says whether that byte is part of `src` (`1`) or was seen
    /// in an earlier call (`0`). In the latter case `first_byte` is fed to
    /// the delegate on its own before `src`, and the returned read count
    /// only covers `src`.
    fn $decode_to_utf_after_one_potential_bom_byte(&mut self,
                                                   src: &[u8],
                                                   dst: &mut [$code_unit],
                                                   last: bool,
                                                   offset: usize,
                                                   first_byte: u8)
                                                   -> (DecoderResult, usize, usize) {
        self.life_cycle = DecoderLifeCycle::Converting;
        if offset != 0usize {
            debug_assert_eq!(offset, 1usize);
            // The byte is at the start of `src`; nothing to replay.
            return self.$decode_to_utf_checking_end(src, dst, last);
        }
        // The byte came from a previous buffer, so replay it first.
        let pending = [first_byte];
        let (head_result, _, head_written) =
            self.variant
                .$decode_to_utf_raw(&pending[..], dst, false);
        match head_result {
            DecoderResult::InputEmpty => {
                let (tail_result, tail_read, tail_written) =
                    self.$decode_to_utf_checking_end(src, &mut dst[head_written..], last);
                // The read count replaces (rather than adds to) the
                // replayed byte, which never was in `src`.
                (tail_result, tail_read, head_written + tail_written)
            }
            // The replayed byte was not read from `src`, hence zero.
            DecoderResult::Malformed(_, _) => (head_result, 0usize, head_written),
            DecoderResult::OutputFull => {
                panic!("Output buffer must have been too small.");
            }
        }
    }

    /// Resumes normal conversion after the bytes EF BB were not followed
    /// by BF.
    ///
    /// `offset` is the number of those two bytes that are part of `src`
    /// (`0`, `1` or `2`); the remaining ones were seen in earlier calls and
    /// are fed to the delegate separately. The returned read count only
    /// covers `src`.
    fn $decode_to_utf_after_two_potential_bom_bytes(&mut self,
                                                    src: &[u8],
                                                    dst: &mut [$code_unit],
                                                    last: bool,
                                                    offset: usize)
                                                    -> (DecoderResult, usize, usize) {
        self.life_cycle = DecoderLifeCycle::Converting;
        match offset {
            0usize => {
                // Neither byte is in the current buffer: replay both.
                let pending = [0xEFu8, 0xBBu8];
                let (head_result, head_read, head_written) =
                    self.variant
                        .$decode_to_utf_raw(&pending[..], dst, false);
                match head_result {
                    DecoderResult::InputEmpty => {
                        let (tail_result, tail_read, tail_written) =
                            self.$decode_to_utf_checking_end(src,
                                                             &mut dst[head_written..],
                                                             last);
                        // The read count replaces (rather than adds to)
                        // the replayed bytes, which never were in `src`.
                        (tail_result, tail_read, head_written + tail_written)
                    }
                    DecoderResult::Malformed(_, _) => {
                        if head_read == 1usize {
                            // Only the first byte was consumed as
                            // malformed. The second one is not in `src`,
                            // so remember to deliver it on the next call.
                            self.life_cycle = DecoderLifeCycle::ConvertingWithPendingBB;
                        }
                        // Nothing was read from `src`.
                        (head_result, 0usize, head_written)
                    }
                    DecoderResult::OutputFull => {
                        panic!("Output buffer must have been too small.");
                    }
                }
            }
            1usize => {
                // Only the second byte is in the current buffer, so this
                // reduces to replaying a single out-of-buffer EF.
                self.$decode_to_utf_after_one_potential_bom_byte(src,
                                                                 dst,
                                                                 last,
                                                                 0usize,
                                                                 0xEFu8)
            }
            _ => {
                debug_assert_eq!(offset, 2usize);
                // Both bytes are in `src`; nothing to replay.
                self.$decode_to_utf_checking_end(src, dst, last)
            }
        }
    }

    /// Runs `$decode_to_utf_checking_end` on `src` minus its first `offset`
    /// bytes, then adds `offset` back to the reported read count so that
    /// the skipped bytes count as consumed.
    fn $decode_to_utf_checking_end_with_offset(&mut self,
                                               src: &[u8],
                                               dst: &mut [$code_unit],
                                               last: bool,
                                               offset: usize)
                                               -> (DecoderResult, usize, usize) {
        debug_assert_eq!(self.life_cycle, DecoderLifeCycle::Converting);
        let remainder = &src[offset..];
        let (outcome, consumed, produced) =
            self.$decode_to_utf_checking_end(remainder, dst, last);
        (outcome, consumed + offset, produced)
    }

    /// Forwards to the delegate in `self.variant` and marks the decoder as
    /// finished when `last` is `true` and the delegate reported
    /// `DecoderResult::InputEmpty`.
    fn $decode_to_utf_checking_end(&mut self,
                                   src: &[u8],
                                   dst: &mut [$code_unit],
                                   last: bool)
                                   -> (DecoderResult, usize, usize) {
        debug_assert_eq!(self.life_cycle, DecoderLifeCycle::Converting);
        let outcome = self.variant
                          .$decode_to_utf_raw(src, dst, last);
        let stream_done = match outcome.0 {
            DecoderResult::InputEmpty => last,
            _ => false,
        };
        if stream_done {
            self.life_cycle = DecoderLifeCycle::Finished;
        }
        outcome
    });
}