| // Copyright 2015 Google Inc. All rights reserved. | |
| // | |
| // Permission is hereby granted, free of charge, to any person obtaining a copy | |
| // of this software and associated documentation files (the "Software"), to deal | |
| // in the Software without restriction, including without limitation the rights | |
| // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
| // copies of the Software, and to permit persons to whom the Software is | |
| // furnished to do so, subject to the following conditions: | |
| // | |
| // The above copyright notice and this permission notice shall be included in | |
| // all copies or substantial portions of the Software. | |
| // | |
| // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
| // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
| // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
| // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
| // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
| // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
| // THE SOFTWARE. | |
| //! Utility functions for HTML escaping | |
| use std::io; | |
| use std::str::from_utf8; | |
| use crate::html::StrWrite; | |
| #[rustfmt::skip] | |
| static HREF_SAFE: [u8; 128] = [ | |
| 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
| 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
| 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, | |
| 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, | |
| 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
| 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, | |
| 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
| 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, | |
| ]; | |
| static HEX_CHARS: &[u8] = b"0123456789ABCDEF"; | |
| static AMP_ESCAPE: &str = "&"; | |
| static SLASH_ESCAPE: &str = "'"; | |
| pub(crate) fn escape_href<W>(mut w: W, s: &str) -> io::Result<()> | |
| where | |
| W: StrWrite, | |
| { | |
| let bytes = s.as_bytes(); | |
| let mut mark = 0; | |
| for i in 0..bytes.len() { | |
| let c = bytes[i]; | |
| if c >= 0x80 || HREF_SAFE[c as usize] == 0 { | |
| // character needing escape | |
| // write partial substring up to mark | |
| if mark < i { | |
| w.write_str(&s[mark..i])?; | |
| } | |
| match c { | |
| b'&' => { | |
| w.write_str(AMP_ESCAPE)?; | |
| } | |
| b'\'' => { | |
| w.write_str(SLASH_ESCAPE)?; | |
| } | |
| _ => { | |
| let mut buf = [0u8; 3]; | |
| buf[0] = b'%'; | |
| buf[1] = HEX_CHARS[((c as usize) >> 4) & 0xF]; | |
| buf[2] = HEX_CHARS[(c as usize) & 0xF]; | |
| let escaped = from_utf8(&buf).unwrap(); | |
| w.write_str(escaped)?; | |
| } | |
| } | |
| mark = i + 1; // all escaped characters are ASCII | |
| } | |
| } | |
| w.write_str(&s[mark..]) | |
| } | |
| const fn create_html_escape_table() -> [u8; 256] { | |
| let mut table = [0; 256]; | |
| table[b'"' as usize] = 1; | |
| table[b'&' as usize] = 2; | |
| table[b'<' as usize] = 3; | |
| table[b'>' as usize] = 4; | |
| table | |
| } | |
| static HTML_ESCAPE_TABLE: [u8; 256] = create_html_escape_table(); | |
| static HTML_ESCAPES: [&'static str; 5] = ["", """, "&", "<", ">"]; | |
| /// Writes the given string to the Write sink, replacing special HTML bytes | |
| /// (<, >, &, ") by escape sequences. | |
| pub(crate) fn escape_html<W: StrWrite>(w: W, s: &str) -> io::Result<()> { | |
| #[cfg(all(target_arch = "x86_64", feature = "simd"))] | |
| { | |
| simd::escape_html(w, s) | |
| } | |
| #[cfg(not(all(target_arch = "x86_64", feature = "simd")))] | |
| { | |
| escape_html_scalar(w, s) | |
| } | |
| } | |
| fn escape_html_scalar<W: StrWrite>(mut w: W, s: &str) -> io::Result<()> { | |
| let bytes = s.as_bytes(); | |
| let mut mark = 0; | |
| let mut i = 0; | |
| while i < s.len() { | |
| match bytes[i..] | |
| .iter() | |
| .position(|&c| HTML_ESCAPE_TABLE[c as usize] != 0) | |
| { | |
| Some(pos) => { | |
| i += pos; | |
| } | |
| None => break, | |
| } | |
| let c = bytes[i]; | |
| let escape = HTML_ESCAPE_TABLE[c as usize]; | |
| let escape_seq = HTML_ESCAPES[escape as usize]; | |
| w.write_str(&s[mark..i])?; | |
| w.write_str(escape_seq)?; | |
| i += 1; | |
| mark = i; // all escaped characters are ASCII | |
| } | |
| w.write_str(&s[mark..]) | |
| } | |
| #[cfg(all(target_arch = "x86_64", feature = "simd"))] | |
| mod simd { | |
| use crate::html::StrWrite; | |
| use std::arch::x86_64::*; | |
| use std::io; | |
| use std::mem::size_of; | |
| const VECTOR_SIZE: usize = size_of::<__m128i>(); | |
| pub(crate) fn escape_html<W: StrWrite>(mut w: W, s: &str) -> io::Result<()> { | |
| // The SIMD accelerated code uses the PSHUFB instruction, which is part | |
| // of the SSSE3 instruction set. Further, we can only use this code if | |
| // the buffer is at least one VECTOR_SIZE in length to prevent reading | |
| // out of bounds. If either of these conditions is not met, we fall back | |
| // to scalar code. | |
| if is_x86_feature_detected!("ssse3") && s.len() >= VECTOR_SIZE { | |
| let bytes = s.as_bytes(); | |
| let mut mark = 0; | |
| unsafe { | |
| foreach_special_simd(bytes, 0, |i| { | |
| let escape_ix = *bytes.get_unchecked(i) as usize; | |
| let replacement = | |
| super::HTML_ESCAPES[super::HTML_ESCAPE_TABLE[escape_ix] as usize]; | |
| w.write_str(&s.get_unchecked(mark..i))?; | |
| mark = i + 1; // all escaped characters are ASCII | |
| w.write_str(replacement) | |
| })?; | |
| w.write_str(&s.get_unchecked(mark..)) | |
| } | |
| } else { | |
| super::escape_html_scalar(w, s) | |
| } | |
| } | |
| /// Creates the lookup table for use in `compute_mask`. | |
| const fn create_lookup() -> [u8; 16] { | |
| let mut table = [0; 16]; | |
| table[(b'<' & 0x0f) as usize] = b'<'; | |
| table[(b'>' & 0x0f) as usize] = b'>'; | |
| table[(b'&' & 0x0f) as usize] = b'&'; | |
| table[(b'"' & 0x0f) as usize] = b'"'; | |
| table[0] = 0b0111_1111; | |
| table | |
| } | |
| #[target_feature(enable = "ssse3")] | |
| /// Computes a byte mask at given offset in the byte buffer. Its first 16 (least significant) | |
| /// bits correspond to whether there is an HTML special byte (&, <, ", >) at the 16 bytes | |
| /// `bytes[offset..]`. For example, the mask `(1 << 3)` states that there is an HTML byte | |
| /// at `offset + 3`. It is only safe to call this function when | |
| /// `bytes.len() >= offset + VECTOR_SIZE`. | |
| unsafe fn compute_mask(bytes: &[u8], offset: usize) -> i32 { | |
| debug_assert!(bytes.len() >= offset + VECTOR_SIZE); | |
| let table = create_lookup(); | |
| let lookup = _mm_loadu_si128(table.as_ptr() as *const __m128i); | |
| let raw_ptr = bytes.as_ptr().offset(offset as isize) as *const __m128i; | |
| // Load the vector from memory. | |
| let vector = _mm_loadu_si128(raw_ptr); | |
| // We take the least significant 4 bits of every byte and use them as indices | |
| // to map into the lookup vector. | |
| // Note that shuffle maps bytes with their most significant bit set to lookup[0]. | |
| // Bytes that share their lower nibble with an HTML special byte get mapped to that | |
| // corresponding special byte. Note that all HTML special bytes have distinct lower | |
| // nibbles. Other bytes either get mapped to 0 or 127. | |
| let expected = _mm_shuffle_epi8(lookup, vector); | |
| // We compare the original vector to the mapped output. Bytes that shared a lower | |
| // nibble with an HTML special byte match *only* if they are that special byte. Bytes | |
| // that have either a 0 lower nibble or their most significant bit set were mapped to | |
| // 127 and will hence never match. All other bytes have non-zero lower nibbles but | |
| // were mapped to 0 and will therefore also not match. | |
| let matches = _mm_cmpeq_epi8(expected, vector); | |
| // Translate matches to a bitmask, where every 1 corresponds to a HTML special character | |
| // and a 0 is a non-HTML byte. | |
| _mm_movemask_epi8(matches) | |
| } | |
| /// Calls the given function with the index of every byte in the given byteslice | |
| /// that is either ", &, <, or > and for no other byte. | |
| /// Make sure to only call this when `bytes.len() >= 16`, undefined behaviour may | |
| /// occur otherwise. | |
| #[target_feature(enable = "ssse3")] | |
| unsafe fn foreach_special_simd<F>( | |
| bytes: &[u8], | |
| mut offset: usize, | |
| mut callback: F, | |
| ) -> io::Result<()> | |
| where | |
| F: FnMut(usize) -> io::Result<()>, | |
| { | |
| // The strategy here is to walk the byte buffer in chunks of VECTOR_SIZE (16) | |
| // bytes at a time starting at the given offset. For each chunk, we compute a | |
| // a bitmask indicating whether the corresponding byte is a HTML special byte. | |
| // We then iterate over all the 1 bits in this mask and call the callback function | |
| // with the corresponding index in the buffer. | |
| // When the number of HTML special bytes in the buffer is relatively low, this | |
| // allows us to quickly go through the buffer without a lookup and for every | |
| // single byte. | |
| debug_assert!(bytes.len() >= VECTOR_SIZE); | |
| let upperbound = bytes.len() - VECTOR_SIZE; | |
| while offset < upperbound { | |
| let mut mask = compute_mask(bytes, offset); | |
| while mask != 0 { | |
| let ix = mask.trailing_zeros(); | |
| callback(offset + ix as usize)?; | |
| mask ^= mask & -mask; | |
| } | |
| offset += VECTOR_SIZE; | |
| } | |
| // Final iteration. We align the read with the end of the slice and | |
| // shift off the bytes at start we have already scanned. | |
| let mut mask = compute_mask(bytes, upperbound); | |
| mask >>= offset - upperbound; | |
| while mask != 0 { | |
| let ix = mask.trailing_zeros(); | |
| callback(offset + ix as usize)?; | |
| mask ^= mask & -mask; | |
| } | |
| Ok(()) | |
| } | |
| #[cfg(test)] | |
| mod html_scan_tests { | |
| #[test] | |
| fn multichunk() { | |
| let mut vec = Vec::new(); | |
| unsafe { | |
| super::foreach_special_simd("&aXaaaa.a'aa9a<>aab&".as_bytes(), 0, |ix| { | |
| Ok(vec.push(ix)) | |
| }) | |
| .unwrap(); | |
| } | |
| assert_eq!(vec, vec![0, 14, 15, 19]); | |
| } | |
| // only match these bytes, and when we match them, match them VECTOR_SIZE times | |
| #[test] | |
| fn only_right_bytes_matched() { | |
| for b in 0..255u8 { | |
| let right_byte = b == b'&' || b == b'<' || b == b'>' || b == b'"'; | |
| let vek = vec![b; super::VECTOR_SIZE]; | |
| let mut match_count = 0; | |
| unsafe { | |
| super::foreach_special_simd(&vek, 0, |_| { | |
| match_count += 1; | |
| Ok(()) | |
| }) | |
| .unwrap(); | |
| } | |
| assert!((match_count > 0) == (match_count == super::VECTOR_SIZE)); | |
| assert_eq!( | |
| (match_count == super::VECTOR_SIZE), | |
| right_byte, | |
| "match_count: {}, byte: {:?}", | |
| match_count, | |
| b as char | |
| ); | |
| } | |
| } | |
| } | |
| } |