| // Copyright 2014-2017 The html5ever Project Developers. See the |
| // COPYRIGHT file at the top-level directory of this distribution. |
| // |
| // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
| // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
| // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your |
| // option. This file may not be copied, modified, or distributed |
| // except according to those terms. |
| |
| //! The `BufferQueue` struct and helper types. |
| //! |
| //! This type is designed for the efficient parsing of string data, especially where many |
| //! significant characters are from the ascii range 0-63. This includes, for example, important |
| //! characters in xml/html parsing. |
| //! |
| //! Good and predictable performance is achieved by avoiding allocation where possible (a.k.a. zero |
| //! copy). |
| //! |
| //! [`BufferQueue`]: struct.BufferQueue.html |
| |
| use std::collections::VecDeque; |
| |
| use tendril::StrTendril; |
| |
| pub use self::SetResult::{FromSet, NotFromSet}; |
| use util::smallcharset::SmallCharSet; |
| |
| /// Result from [`pop_except_from`] containing either a character from a [`SmallCharSet`], or a |
| /// string buffer of characters not from the set. |
| /// |
| /// [`pop_except_from`]: struct.BufferQueue.html#method.pop_except_from |
| /// [`SmallCharSet`]: ../struct.SmallCharSet.html |
| #[derive(PartialEq, Eq, Debug)] |
| pub enum SetResult { |
| /// A character from the `SmallCharSet`. |
| FromSet(char), |
| /// A string buffer containing no characters from the `SmallCharSet`. |
| NotFromSet(StrTendril), |
| } |
| |
| /// A queue of owned string buffers, which supports incrementally consuming characters. |
| /// |
| /// Internally it uses [`VecDeque`] and has the same complexity properties. |
| /// |
| /// [`VecDeque`]: https://doc.rust-lang.org/std/collections/struct.VecDeque.html |
| #[derive(Debug)] |
| pub struct BufferQueue { |
| /// Buffers to process. |
| buffers: VecDeque<StrTendril>, |
| } |
| |
| impl BufferQueue { |
| /// Create an empty BufferQueue. |
| #[inline] |
| pub fn new() -> BufferQueue { |
| BufferQueue { |
| buffers: VecDeque::with_capacity(16), |
| } |
| } |
| |
| /// Returns whether the queue is empty. |
| #[inline] |
| pub fn is_empty(&self) -> bool { |
| self.buffers.is_empty() |
| } |
| |
| /// Get the buffer at the beginning of the queue. |
| #[inline] |
| pub fn pop_front(&mut self) -> Option<StrTendril> { |
| self.buffers.pop_front() |
| } |
| |
| /// Add a buffer to the beginning of the queue. |
| /// |
| /// If the buffer is empty, it will be skipped. |
| pub fn push_front(&mut self, buf: StrTendril) { |
| if buf.len32() == 0 { |
| return; |
| } |
| self.buffers.push_front(buf); |
| } |
| |
| /// Add a buffer to the end of the queue. |
| /// |
| /// If the buffer is empty, it will be skipped. |
| pub fn push_back(&mut self, buf: StrTendril) { |
| if buf.len32() == 0 { |
| return; |
| } |
| self.buffers.push_back(buf); |
| } |
| |
| /// Look at the next available character without removing it, if the queue is not empty. |
| pub fn peek(&self) -> Option<char> { |
| debug_assert!( |
| self.buffers |
| .iter() |
| .skip_while(|el| el.len32() != 0) |
| .next() |
| .is_none(), |
| "invariant \"all buffers in the queue are non-empty\" failed" |
| ); |
| self.buffers.front().map(|b| b.chars().next().unwrap()) |
| } |
| |
| /// Get the next character if one is available, removing it from the queue. |
| /// |
| /// This function manages the buffers, removing them as they become empty. |
| pub fn next(&mut self) -> Option<char> { |
| let (result, now_empty) = match self.buffers.front_mut() { |
| None => (None, false), |
| Some(buf) => { |
| let c = buf.pop_front_char().expect("empty buffer in queue"); |
| (Some(c), buf.is_empty()) |
| }, |
| }; |
| |
| if now_empty { |
| self.buffers.pop_front(); |
| } |
| |
| result |
| } |
| |
| /// Pops and returns either a single character from the given set, or |
| /// a buffer of characters none of which are in the set. |
| /// |
| /// # Examples |
| /// |
| /// ``` |
| /// # #[macro_use] extern crate markup5ever; |
| /// # #[macro_use] extern crate tendril; |
| /// # fn main() { |
| /// use markup5ever::buffer_queue::{BufferQueue, SetResult}; |
| /// |
| /// let mut queue = BufferQueue::new(); |
| /// queue.push_back(format_tendril!(r#"<some_tag attr="text">SomeText</some_tag>"#)); |
| /// let set = small_char_set!(b'<' b'>' b' ' b'=' b'"' b'/'); |
| /// let tag = format_tendril!("some_tag"); |
| /// let attr = format_tendril!("attr"); |
| /// let attr_val = format_tendril!("text"); |
| /// assert_eq!(queue.pop_except_from(set), Some(SetResult::FromSet('<'))); |
| /// assert_eq!(queue.pop_except_from(set), Some(SetResult::NotFromSet(tag))); |
| /// assert_eq!(queue.pop_except_from(set), Some(SetResult::FromSet(' '))); |
| /// assert_eq!(queue.pop_except_from(set), Some(SetResult::NotFromSet(attr))); |
| /// assert_eq!(queue.pop_except_from(set), Some(SetResult::FromSet('='))); |
| /// assert_eq!(queue.pop_except_from(set), Some(SetResult::FromSet('"'))); |
| /// assert_eq!(queue.pop_except_from(set), Some(SetResult::NotFromSet(attr_val))); |
| /// // ... |
| /// # } |
| /// ``` |
| pub fn pop_except_from(&mut self, set: SmallCharSet) -> Option<SetResult> { |
| let (result, now_empty) = match self.buffers.front_mut() { |
| None => (None, false), |
| Some(buf) => { |
| let n = set.nonmember_prefix_len(&buf); |
| if n > 0 { |
| let out; |
| unsafe { |
| out = buf.unsafe_subtendril(0, n); |
| buf.unsafe_pop_front(n); |
| } |
| (Some(NotFromSet(out)), buf.is_empty()) |
| } else { |
| let c = buf.pop_front_char().expect("empty buffer in queue"); |
| (Some(FromSet(c)), buf.is_empty()) |
| } |
| }, |
| }; |
| |
| // Unborrow self for this part. |
| if now_empty { |
| self.buffers.pop_front(); |
| } |
| |
| result |
| } |
| |
| /// Consume bytes matching the pattern, using a custom comparison function `eq`. |
| /// |
| /// Returns `Some(true)` if there is a match, `Some(false)` if there is no match, or `None` if |
| /// it wasn't possible to know (more data is needed). |
| /// |
| /// The custom comparison function is used elsewhere to compare ascii-case-insensitively. |
| /// |
| /// # Examples |
| /// |
| /// ``` |
| /// # extern crate markup5ever; |
| /// # #[macro_use] extern crate tendril; |
| /// # fn main() { |
| /// use markup5ever::buffer_queue::{BufferQueue}; |
| /// |
| /// let mut queue = BufferQueue::new(); |
| /// queue.push_back(format_tendril!("testtext")); |
| /// let test_str = "test"; |
| /// assert_eq!(queue.eat("test", |&a, &b| a == b), Some(true)); |
| /// assert_eq!(queue.eat("text", |&a, &b| a == b), Some(true)); |
| /// assert!(queue.is_empty()); |
| /// # } |
| /// ``` |
| pub fn eat<F: Fn(&u8, &u8) -> bool>(&mut self, pat: &str, eq: F) -> Option<bool> { |
| let mut buffers_exhausted = 0; |
| let mut consumed_from_last = 0; |
| if self.buffers.front().is_none() { |
| return None; |
| } |
| |
| for pattern_byte in pat.bytes() { |
| if buffers_exhausted >= self.buffers.len() { |
| return None; |
| } |
| let ref buf = self.buffers[buffers_exhausted]; |
| |
| if !eq(&buf.as_bytes()[consumed_from_last], &pattern_byte) { |
| return Some(false); |
| } |
| |
| consumed_from_last += 1; |
| if consumed_from_last >= buf.len() { |
| buffers_exhausted += 1; |
| consumed_from_last = 0; |
| } |
| } |
| |
| // We have a match. Commit changes to the BufferQueue. |
| for _ in 0..buffers_exhausted { |
| self.buffers.pop_front(); |
| } |
| |
| match self.buffers.front_mut() { |
| None => assert_eq!(consumed_from_last, 0), |
| Some(ref mut buf) => buf.pop_front(consumed_from_last as u32), |
| } |
| |
| Some(true) |
| } |
| } |
| |
| #[cfg(test)] |
| #[allow(non_snake_case)] |
| mod test { |
| use tendril::SliceExt; |
| |
| use super::BufferQueue; |
| use super::SetResult::{FromSet, NotFromSet}; |
| |
| #[test] |
| fn smoke_test() { |
| let mut bq = BufferQueue::new(); |
| assert_eq!(bq.peek(), None); |
| assert_eq!(bq.next(), None); |
| |
| bq.push_back("abc".to_tendril()); |
| assert_eq!(bq.peek(), Some('a')); |
| assert_eq!(bq.next(), Some('a')); |
| assert_eq!(bq.peek(), Some('b')); |
| assert_eq!(bq.peek(), Some('b')); |
| assert_eq!(bq.next(), Some('b')); |
| assert_eq!(bq.peek(), Some('c')); |
| assert_eq!(bq.next(), Some('c')); |
| assert_eq!(bq.peek(), None); |
| assert_eq!(bq.next(), None); |
| } |
| |
| #[test] |
| fn can_unconsume() { |
| let mut bq = BufferQueue::new(); |
| bq.push_back("abc".to_tendril()); |
| assert_eq!(bq.next(), Some('a')); |
| |
| bq.push_front("xy".to_tendril()); |
| assert_eq!(bq.next(), Some('x')); |
| assert_eq!(bq.next(), Some('y')); |
| assert_eq!(bq.next(), Some('b')); |
| assert_eq!(bq.next(), Some('c')); |
| assert_eq!(bq.next(), None); |
| } |
| |
| #[test] |
| fn can_pop_except_set() { |
| let mut bq = BufferQueue::new(); |
| bq.push_back("abc&def".to_tendril()); |
| let mut pop = || bq.pop_except_from(small_char_set!('&')); |
| assert_eq!(pop(), Some(NotFromSet("abc".to_tendril()))); |
| assert_eq!(pop(), Some(FromSet('&'))); |
| assert_eq!(pop(), Some(NotFromSet("def".to_tendril()))); |
| assert_eq!(pop(), None); |
| } |
| |
| #[test] |
| fn can_eat() { |
| // This is not very comprehensive. We rely on the tokenizer |
| // integration tests for more thorough testing with many |
| // different input buffer splits. |
| let mut bq = BufferQueue::new(); |
| bq.push_back("a".to_tendril()); |
| bq.push_back("bc".to_tendril()); |
| assert_eq!(bq.eat("abcd", u8::eq_ignore_ascii_case), None); |
| assert_eq!(bq.eat("ax", u8::eq_ignore_ascii_case), Some(false)); |
| assert_eq!(bq.eat("ab", u8::eq_ignore_ascii_case), Some(true)); |
| assert_eq!(bq.next(), Some('c')); |
| assert_eq!(bq.next(), None); |
| } |
| } |