| // Copyright 2014-2017 The html5ever Project Developers. See the |
| // COPYRIGHT file at the top-level directory of this distribution. |
| // |
| // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
| // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
| // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your |
| // option. This file may not be copied, modified, or distributed |
| // except according to those terms. |
| |
| //! The HTML5 tokenizer. |
| |
| pub use self::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError}; |
| pub use self::interface::{CommentToken, DoctypeToken, TagToken, Token}; |
| pub use self::interface::{Doctype, EndTag, StartTag, Tag, TagKind}; |
| pub use self::interface::{TokenSink, TokenSinkResult}; |
| |
| use self::states::{DoctypeIdKind, Public, System}; |
| use self::states::{DoubleEscaped, Escaped}; |
| use self::states::{DoubleQuoted, SingleQuoted, Unquoted}; |
| use self::states::{Rawtext, Rcdata, ScriptData, ScriptDataEscaped}; |
| |
| use self::char_ref::{CharRef, CharRefTokenizer}; |
| |
| use crate::util::str::lower_ascii_letter; |
| |
| use log::{debug, trace}; |
| use mac::{_tt_as_expr_hack, format_if, matches}; |
| use markup5ever::{namespace_url, ns, small_char_set}; |
| use std::borrow::Cow::{self, Borrowed}; |
| use std::collections::BTreeMap; |
| use std::default::Default; |
| use std::mem::replace; |
| |
| pub use crate::buffer_queue::{BufferQueue, FromSet, NotFromSet, SetResult}; |
| use crate::tendril::StrTendril; |
| use crate::{Attribute, LocalName, QualName, SmallCharSet}; |
| |
| mod char_ref; |
| mod interface; |
| pub mod states; |
| |
| /// The result of a single step of the tokenizer state machine. |
| pub enum ProcessResult<Handle> { |
| /// Keep stepping: more progress can be made on the current input. |
| Continue, |
| /// Stop stepping: we need more input, or tokenization is finished. |
| Suspend, |
| /// The sink emitted a script that must run before tokenization resumes. |
| Script(Handle), |
| } |
| |
| /// The result of feeding a chunk of input to the tokenizer. |
| #[must_use] |
| pub enum TokenizerResult<Handle> { |
| /// All currently available input has been processed. |
| Done, |
| /// Tokenization is suspended pending execution of the given script. |
| Script(Handle), |
| } |
| |
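| /// Push a character onto an `Option<StrTendril>`, allocating the tendril |
| /// on first push. |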
| fn option_push(opt_str: &mut Option<StrTendril>, c: char) { |
| match *opt_str { |
| Some(ref mut s) => s.push_char(c), |
| None => *opt_str = Some(StrTendril::from_char(c)), |
| } |
| } |
| |
| /// Tokenizer options, with an impl for `Default`. |
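| /// |
| /// A sketch of overriding one option while keeping the other defaults: |
| /// |
| /// ```ignore |
| /// let opts = TokenizerOpts { |
| ///     exact_errors: true, |
| ///     ..Default::default() |
| /// }; |
| /// ``` |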
| #[derive(Clone)] |
| pub struct TokenizerOpts { |
| /// Report all parse errors described in the spec, at some |
| /// performance penalty? Default: false |
| pub exact_errors: bool, |
| |
| /// Discard a `U+FEFF BYTE ORDER MARK` if we see one at the beginning |
| /// of the stream? Default: true |
| pub discard_bom: bool, |
| |
| /// Keep a record of how long we spent in each state? Printed |
| /// when `end()` is called. Default: false |
| pub profile: bool, |
| |
| /// Initial state override. Only the test runner should use |
| /// a non-`None` value! |
| pub initial_state: Option<states::State>, |
| |
| /// Last start tag. Only the test runner should use a |
| /// non-`None` value! |
| /// |
| /// FIXME: Can't use Tendril because we want TokenizerOpts |
| /// to be Send. |
| pub last_start_tag_name: Option<String>, |
| } |
| |
| impl Default for TokenizerOpts { |
| fn default() -> TokenizerOpts { |
| TokenizerOpts { |
| exact_errors: false, |
| discard_bom: true, |
| profile: false, |
| initial_state: None, |
| last_start_tag_name: None, |
| } |
| } |
| } |
| |
| /// The HTML tokenizer. |
| pub struct Tokenizer<Sink> { |
| /// Options controlling the behavior of the tokenizer. |
| opts: TokenizerOpts, |
| |
| /// Destination for tokens we emit. |
| pub sink: Sink, |
| |
| /// The abstract machine state as described in the spec. |
| state: states::State, |
| |
| /// Are we at the end of the file, once buffers have been processed |
| /// completely? This affects whether we will wait for lookahead or not. |
| at_eof: bool, |
| |
| /// Tokenizer for character references, if we're tokenizing |
| /// one at the moment. |
| char_ref_tokenizer: Option<Box<CharRefTokenizer>>, |
| |
| /// The most recently consumed input character; it may be reconsumed. |
| current_char: char, |
| |
| /// Should we reconsume the current input character? |
| reconsume: bool, |
| |
| /// Did we just consume \r, translating it to \n? In that case we need |
| /// to ignore the next character if it's \n. |
| ignore_lf: bool, |
| |
| /// Discard a U+FEFF BYTE ORDER MARK if we see one? Only done at the |
| /// beginning of the stream. |
| discard_bom: bool, |
| |
| /// Current tag kind. |
| current_tag_kind: TagKind, |
| |
| /// Current tag name. |
| current_tag_name: StrTendril, |
| |
| /// Current tag is self-closing? |
| current_tag_self_closing: bool, |
| |
| /// Current tag attributes. |
| current_tag_attrs: Vec<Attribute>, |
| |
| /// Current attribute name. |
| current_attr_name: StrTendril, |
| |
| /// Current attribute value. |
| current_attr_value: StrTendril, |
| |
| /// Current comment. |
| current_comment: StrTendril, |
| |
| /// Current doctype token. |
| current_doctype: Doctype, |
| |
| /// Last start tag name, for use in checking "appropriate end tag". |
| last_start_tag_name: Option<LocalName>, |
| |
| /// The "temporary buffer" mentioned in the spec. |
| temp_buf: StrTendril, |
| |
| /// Record of how many ns we spent in each state, if profiling is enabled. |
| state_profile: BTreeMap<states::State, u64>, |
| |
| /// Record of how many ns we spent in the token sink. |
| time_in_sink: u64, |
| |
| /// Current input line number; passed to the sink with each token. |
| current_line: u64, |
| } |
| |
| impl<Sink: TokenSink> Tokenizer<Sink> { |
| /// Create a new tokenizer which feeds tokens to a particular `TokenSink`. |
| pub fn new(sink: Sink, mut opts: TokenizerOpts) -> Tokenizer<Sink> { |
| let start_tag_name = opts |
| .last_start_tag_name |
| .take() |
| .map(|s| LocalName::from(&*s)); |
| let state = opts.initial_state.unwrap_or(states::Data); |
| let discard_bom = opts.discard_bom; |
| Tokenizer { |
| opts, |
| sink, |
| state, |
| char_ref_tokenizer: None, |
| at_eof: false, |
| current_char: '\0', |
| reconsume: false, |
| ignore_lf: false, |
| discard_bom, |
| current_tag_kind: StartTag, |
| current_tag_name: StrTendril::new(), |
| current_tag_self_closing: false, |
| current_tag_attrs: vec![], |
| current_attr_name: StrTendril::new(), |
| current_attr_value: StrTendril::new(), |
| current_comment: StrTendril::new(), |
| current_doctype: Doctype::new(), |
| last_start_tag_name: start_tag_name, |
| temp_buf: StrTendril::new(), |
| state_profile: BTreeMap::new(), |
| time_in_sink: 0, |
| current_line: 1, |
| } |
| } |
| |
| /// Feed an input string into the tokenizer. |
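| /// |
| /// Returns `TokenizerResult::Script` when the sink suspends tokenization |
| /// to run a script; call `feed` again once the script has executed. A |
| /// minimal driving loop might look like this (a sketch; `sink` is assumed |
| /// to be some `TokenSink` implementation you provide): |
| /// |
| /// ```ignore |
| /// let mut tok = Tokenizer::new(sink, TokenizerOpts::default()); |
| /// let mut input = BufferQueue::new(); |
| /// input.push_back(StrTendril::from("<p>hello</p>")); |
| /// while let TokenizerResult::Script(script) = tok.feed(&mut input) { |
| ///     // Execute `script`, then resume tokenizing. |
| /// } |
| /// tok.end(); |
| /// ``` |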
| pub fn feed(&mut self, input: &mut BufferQueue) -> TokenizerResult<Sink::Handle> { |
| if input.is_empty() { |
| return TokenizerResult::Done; |
| } |
| |
| if self.discard_bom { |
| if let Some(c) = input.peek() { |
| if c == '\u{feff}' { |
| input.next(); |
| } |
| } else { |
| return TokenizerResult::Done; |
| } |
| } |
| |
| self.run(input) |
| } |
| |
| pub fn set_plaintext_state(&mut self) { |
| self.state = states::Plaintext; |
| } |
| |
| fn process_token(&mut self, token: Token) -> TokenSinkResult<Sink::Handle> { |
| if self.opts.profile { |
| let (ret, dt) = time!(self.sink.process_token(token, self.current_line)); |
| self.time_in_sink += dt; |
| ret |
| } else { |
| self.sink.process_token(token, self.current_line) |
| } |
| } |
| |
| fn process_token_and_continue(&mut self, token: Token) { |
| assert!(matches!( |
| self.process_token(token), |
| TokenSinkResult::Continue |
| )); |
| } |
| |
| //§ preprocessing-the-input-stream |
| // Get the next input character, which might be the character |
| // 'c' that we already consumed from the buffers. |
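| // For example, the sequence "a\r\nb" comes out as 'a', '\n', 'b': the |
| // '\r' is rewritten to '\n' and an immediately following '\n' is dropped. |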
| fn get_preprocessed_char(&mut self, mut c: char, input: &mut BufferQueue) -> Option<char> { |
| if self.ignore_lf { |
| self.ignore_lf = false; |
| if c == '\n' { |
| c = unwrap_or_return!(input.next(), None); |
| } |
| } |
| |
| if c == '\r' { |
| self.ignore_lf = true; |
| c = '\n'; |
| } |
| |
| if c == '\n' { |
| self.current_line += 1; |
| } |
| |
| if self.opts.exact_errors && |
| match c as u32 { |
| 0x01..=0x08 | 0x0B | 0x0E..=0x1F | 0x7F..=0x9F | 0xFDD0..=0xFDEF => true, |
| n if (n & 0xFFFE) == 0xFFFE => true, |
| _ => false, |
| } |
| { |
| let msg = format!("Bad character {}", c); |
| self.emit_error(Cow::Owned(msg)); |
| } |
| |
| trace!("got character {}", c); |
| self.current_char = c; |
| Some(c) |
| } |
| |
| //§ tokenization |
| // Get the next input character, if one is available. |
| fn get_char(&mut self, input: &mut BufferQueue) -> Option<char> { |
| if self.reconsume { |
| self.reconsume = false; |
| Some(self.current_char) |
| } else { |
| input |
| .next() |
| .and_then(|c| self.get_preprocessed_char(c, input)) |
| } |
| } |
| |
| fn pop_except_from(&mut self, input: &mut BufferQueue, set: SmallCharSet) -> Option<SetResult> { |
| // Bail to the slow path for various corner cases. |
| // This means that `FromSet` can contain characters not in the set! |
| // It shouldn't matter because the fallback `FromSet` case should |
| // always do the same thing as the `NotFromSet` case. |
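| // For example (with a hypothetical set), popping "hello<b>" against the |
| // set {'<'} would yield NotFromSet("hello") and then FromSet('<'). |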
| if self.opts.exact_errors || self.reconsume || self.ignore_lf { |
| return self.get_char(input).map(FromSet); |
| } |
| |
| let d = input.pop_except_from(set); |
| trace!("got characters {:?}", d); |
| match d { |
| Some(FromSet(c)) => self.get_preprocessed_char(c, input).map(FromSet), |
| |
| // NB: We don't set self.current_char for a run of characters not |
| // in the set. It shouldn't matter for the codepaths that use |
| // this. |
| _ => d, |
| } |
| } |
| |
| // Check if the next characters are an ASCII case-insensitive match. See |
| // BufferQueue::eat. |
| // |
| // NB: this doesn't do input stream preprocessing or set the current input |
| // character. |
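| // |
| // For example, eat(input, "doctype", u8::eq_ignore_ascii_case) returns |
| // Some(true) on "DOCTYPE html", Some(false) on a definite mismatch, and |
| // None (stashing the consumed characters in temp_buf) when more input is |
| // needed to decide. |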
| fn eat( |
| &mut self, |
| input: &mut BufferQueue, |
| pat: &str, |
| eq: fn(&u8, &u8) -> bool, |
| ) -> Option<bool> { |
| input.push_front(replace(&mut self.temp_buf, StrTendril::new())); |
| match input.eat(pat, eq) { |
| None if self.at_eof => Some(false), |
| None => { |
| while let Some(c) = input.next() { |
| self.temp_buf.push_char(c); |
| } |
| None |
| }, |
| Some(matched) => Some(matched), |
| } |
| } |
| |
| /// Run the state machine for as long as we can. |
| fn run(&mut self, input: &mut BufferQueue) -> TokenizerResult<Sink::Handle> { |
| if self.opts.profile { |
| loop { |
| let state = self.state; |
| let old_sink = self.time_in_sink; |
| let (run, mut dt) = time!(self.step(input)); |
| dt -= self.time_in_sink - old_sink; |
| let new = match self.state_profile.get_mut(&state) { |
| Some(x) => { |
| *x += dt; |
| false |
| }, |
| None => true, |
| }; |
| if new { |
| // do this here because of borrow shenanigans |
| self.state_profile.insert(state, dt); |
| } |
| match run { |
| ProcessResult::Continue => (), |
| ProcessResult::Suspend => break, |
| ProcessResult::Script(node) => return TokenizerResult::Script(node), |
| } |
| } |
| } else { |
| loop { |
| match self.step(input) { |
| ProcessResult::Continue => (), |
| ProcessResult::Suspend => break, |
| ProcessResult::Script(node) => return TokenizerResult::Script(node), |
| } |
| } |
| } |
| TokenizerResult::Done |
| } |
| |
| fn bad_char_error(&mut self) { |
| let msg = format_if!( |
| self.opts.exact_errors, |
| "Bad character", |
| "Saw {} in state {:?}", |
| self.current_char, |
| self.state |
| ); |
| self.emit_error(msg); |
| } |
| |
| fn bad_eof_error(&mut self) { |
| let msg = format_if!( |
| self.opts.exact_errors, |
| "Unexpected EOF", |
| "Saw EOF in state {:?}", |
| self.state |
| ); |
| self.emit_error(msg); |
| } |
| |
| fn emit_char(&mut self, c: char) { |
| self.process_token_and_continue(match c { |
| '\0' => NullCharacterToken, |
| _ => CharacterTokens(StrTendril::from_char(c)), |
| }); |
| } |
| |
| // The string must not contain '\0'! |
| fn emit_chars(&mut self, b: StrTendril) { |
| self.process_token_and_continue(CharacterTokens(b)); |
| } |
| |
| fn emit_current_tag(&mut self) -> ProcessResult<Sink::Handle> { |
| self.finish_attribute(); |
| |
| let name = LocalName::from(&*self.current_tag_name); |
| self.current_tag_name.clear(); |
| |
| match self.current_tag_kind { |
| StartTag => { |
| self.last_start_tag_name = Some(name.clone()); |
| }, |
| EndTag => { |
| if !self.current_tag_attrs.is_empty() { |
| self.emit_error(Borrowed("Attributes on an end tag")); |
| } |
| if self.current_tag_self_closing { |
| self.emit_error(Borrowed("Self-closing end tag")); |
| } |
| }, |
| } |
| |
| let token = TagToken(Tag { |
| kind: self.current_tag_kind, |
| name, |
| self_closing: self.current_tag_self_closing, |
| attrs: replace(&mut self.current_tag_attrs, vec![]), |
| }); |
| |
| match self.process_token(token) { |
| TokenSinkResult::Continue => ProcessResult::Continue, |
| TokenSinkResult::Plaintext => { |
| self.state = states::Plaintext; |
| ProcessResult::Continue |
| }, |
| TokenSinkResult::Script(node) => { |
| self.state = states::Data; |
| ProcessResult::Script(node) |
| }, |
| TokenSinkResult::RawData(kind) => { |
| self.state = states::RawData(kind); |
| ProcessResult::Continue |
| }, |
| } |
| } |
| |
| fn emit_temp_buf(&mut self) { |
| // FIXME: Make sure that clearing on emit is spec-compatible. |
| let buf = replace(&mut self.temp_buf, StrTendril::new()); |
| self.emit_chars(buf); |
| } |
| |
| fn clear_temp_buf(&mut self) { |
| // Do this without a new allocation. |
| self.temp_buf.clear(); |
| } |
| |
| fn emit_current_comment(&mut self) { |
| let comment = replace(&mut self.current_comment, StrTendril::new()); |
| self.process_token_and_continue(CommentToken(comment)); |
| } |
| |
| fn discard_tag(&mut self) { |
| self.current_tag_name.clear(); |
| self.current_tag_self_closing = false; |
| self.current_tag_attrs = vec![]; |
| } |
| |
| fn create_tag(&mut self, kind: TagKind, c: char) { |
| self.discard_tag(); |
| self.current_tag_name.push_char(c); |
| self.current_tag_kind = kind; |
| } |
| |
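| /// An end tag is "appropriate" when its name matches the most recently |
| /// emitted start tag, e.g. `</textarea>` after `<textarea>`; the raw text |
| /// end tag states use this to decide whether to really close the element. |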
| fn have_appropriate_end_tag(&self) -> bool { |
| match self.last_start_tag_name.as_ref() { |
| Some(last) => (self.current_tag_kind == EndTag) && (*self.current_tag_name == **last), |
| None => false, |
| } |
| } |
| |
| fn create_attribute(&mut self, c: char) { |
| self.finish_attribute(); |
| |
| self.current_attr_name.push_char(c); |
| } |
| |
| fn finish_attribute(&mut self) { |
| if self.current_attr_name.is_empty() { |
| return; |
| } |
| |
| // Check for a duplicate attribute. |
| // FIXME: the spec says we should error as soon as the name is finished. |
| // FIXME: linear time search, do we care? |
| let dup = { |
| let name = &*self.current_attr_name; |
| self.current_tag_attrs |
| .iter() |
| .any(|a| &*a.name.local == name) |
| }; |
| |
| if dup { |
| self.emit_error(Borrowed("Duplicate attribute")); |
| self.current_attr_name.clear(); |
| self.current_attr_value.clear(); |
| } else { |
| let name = LocalName::from(&*self.current_attr_name); |
| self.current_attr_name.clear(); |
| self.current_tag_attrs.push(Attribute { |
| // The tree builder will adjust the namespace if necessary. |
| // This only happens in foreign elements. |
| name: QualName::new(None, ns!(), name), |
| value: replace(&mut self.current_attr_value, StrTendril::new()), |
| }); |
| } |
| } |
| |
| fn emit_current_doctype(&mut self) { |
| let doctype = replace(&mut self.current_doctype, Doctype::new()); |
| self.process_token_and_continue(DoctypeToken(doctype)); |
| } |
| |
| fn doctype_id(&mut self, kind: DoctypeIdKind) -> &mut Option<StrTendril> { |
| match kind { |
| Public => &mut self.current_doctype.public_id, |
| System => &mut self.current_doctype.system_id, |
| } |
| } |
| |
| fn clear_doctype_id(&mut self, kind: DoctypeIdKind) { |
| let id = self.doctype_id(kind); |
| match *id { |
| Some(ref mut s) => s.clear(), |
| None => *id = Some(StrTendril::new()), |
| } |
| } |
| |
| fn consume_char_ref(&mut self, addnl_allowed: Option<char>) { |
| // NB: The char ref tokenizer assumes we have an additional allowed |
| // character iff we're tokenizing in an attribute value. |
| self.char_ref_tokenizer = Some(Box::new(CharRefTokenizer::new(addnl_allowed))); |
| } |
| |
| fn emit_eof(&mut self) { |
| self.process_token_and_continue(EOFToken); |
| } |
| |
| fn peek(&mut self, input: &BufferQueue) -> Option<char> { |
| if self.reconsume { |
| Some(self.current_char) |
| } else { |
| input.peek() |
| } |
| } |
| |
| fn discard_char(&mut self, input: &mut BufferQueue) { |
| self.get_char(input); |
| } |
| |
| fn emit_error(&mut self, error: Cow<'static, str>) { |
| self.process_token_and_continue(ParseError(error)); |
| } |
| } |
| //§ END |
| |
| // Shorthand for common state machine behaviors. |
| macro_rules! shorthand ( |
| ( $me:ident : emit $c:expr ) => ( $me.emit_char($c) ); |
| ( $me:ident : create_tag $kind:ident $c:expr ) => ( $me.create_tag($kind, $c) ); |
| ( $me:ident : push_tag $c:expr ) => ( $me.current_tag_name.push_char($c) ); |
| ( $me:ident : discard_tag ) => ( $me.discard_tag() ); |
| ( $me:ident : discard_char $input:expr ) => ( $me.discard_char($input) ); |
| ( $me:ident : push_temp $c:expr ) => ( $me.temp_buf.push_char($c) ); |
| ( $me:ident : emit_temp ) => ( $me.emit_temp_buf() ); |
| ( $me:ident : clear_temp ) => ( $me.clear_temp_buf() ); |
| ( $me:ident : create_attr $c:expr ) => ( $me.create_attribute($c) ); |
| ( $me:ident : push_name $c:expr ) => ( $me.current_attr_name.push_char($c) ); |
| ( $me:ident : push_value $c:expr ) => ( $me.current_attr_value.push_char($c) ); |
| ( $me:ident : append_value $c:expr ) => ( $me.current_attr_value.push_tendril($c) ); |
| ( $me:ident : push_comment $c:expr ) => ( $me.current_comment.push_char($c) ); |
| ( $me:ident : append_comment $c:expr ) => ( $me.current_comment.push_slice($c) ); |
| ( $me:ident : emit_comment ) => ( $me.emit_current_comment() ); |
| ( $me:ident : clear_comment ) => ( $me.current_comment.clear() ); |
| ( $me:ident : create_doctype ) => ( $me.current_doctype = Doctype::new() ); |
| ( $me:ident : push_doctype_name $c:expr ) => ( option_push(&mut $me.current_doctype.name, $c) ); |
| ( $me:ident : push_doctype_id $k:ident $c:expr ) => ( option_push($me.doctype_id($k), $c) ); |
| ( $me:ident : clear_doctype_id $k:ident ) => ( $me.clear_doctype_id($k) ); |
| ( $me:ident : force_quirks ) => ( $me.current_doctype.force_quirks = true ); |
| ( $me:ident : emit_doctype ) => ( $me.emit_current_doctype() ); |
| ( $me:ident : error ) => ( $me.bad_char_error() ); |
| ( $me:ident : error_eof ) => ( $me.bad_eof_error() ); |
| ); |
| |
| // Tracing of tokenizer actions. This adds significant bloat and compile time, |
| // so it's behind a cfg flag. |
| #[cfg(trace_tokenizer)] |
| macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ({ |
| trace!(" {}", stringify!($($cmds)*)); |
| shorthand!($me: $($cmds)*); |
| })); |
| |
| #[cfg(not(trace_tokenizer))] |
| macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ( shorthand!($me: $($cmds)*) ) ); |
| |
| // A little DSL for sequencing shorthand actions. |
| macro_rules! go ( |
| // A pattern like $($cmd:tt)* ; $($rest:tt)* causes parse ambiguity. |
| // We have to tell the parser how much lookahead we need. |
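| // |
| // For example, `go!(self: error; emit '<'; reconsume Data)` runs the |
| // `error` and `emit` shorthands, then sets `reconsume` and the new state |
| // and returns ProcessResult::Continue from the enclosing function. |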
| |
| ( $me:ident : $a:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a); go!($me: $($rest)*); }); |
| ( $me:ident : $a:tt $b:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b); go!($me: $($rest)*); }); |
| ( $me:ident : $a:tt $b:tt $c:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c); go!($me: $($rest)*); }); |
| ( $me:ident : $a:tt $b:tt $c:tt $d:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c $d); go!($me: $($rest)*); }); |
| |
| // These can only come at the end. |
| |
| ( $me:ident : to $s:ident ) => ({ $me.state = states::$s; return ProcessResult::Continue; }); |
| ( $me:ident : to $s:ident $k1:expr ) => ({ $me.state = states::$s($k1); return ProcessResult::Continue; }); |
| ( $me:ident : to $s:ident $k1:ident $k2:expr ) => ({ $me.state = states::$s($k1($k2)); return ProcessResult::Continue; }); |
| |
| ( $me:ident : reconsume $s:ident ) => ({ $me.reconsume = true; go!($me: to $s); }); |
| ( $me:ident : reconsume $s:ident $k1:expr ) => ({ $me.reconsume = true; go!($me: to $s $k1); }); |
| ( $me:ident : reconsume $s:ident $k1:ident $k2:expr ) => ({ $me.reconsume = true; go!($me: to $s $k1 $k2); }); |
| |
| ( $me:ident : consume_char_ref ) => ({ $me.consume_char_ref(None); return ProcessResult::Continue; }); |
| ( $me:ident : consume_char_ref $addnl:expr ) => ({ $me.consume_char_ref(Some($addnl)); return ProcessResult::Continue; }); |
| |
| // We have a default next state after emitting a tag, but the sink can override. |
| ( $me:ident : emit_tag $s:ident ) => ({ |
| $me.state = states::$s; |
| return $me.emit_current_tag(); |
| }); |
| |
| ( $me:ident : eof ) => ({ $me.emit_eof(); return ProcessResult::Suspend; }); |
| |
| // If nothing else matched, it's a single command |
| ( $me:ident : $($cmd:tt)+ ) => ( sh_trace!($me: $($cmd)+) ); |
| |
| // or nothing. |
| ( $me:ident : ) => (()); |
| ); |
| |
| macro_rules! go_match ( ( $me:ident : $x:expr, $($pats:pat),+ => $($cmds:tt)* ) => ( |
| match $x { |
| $($pats)|+ => go!($me: $($cmds)*), |
| _ => (), |
| } |
| )); |
| |
| // This is a macro because it can cause early return |
| // from the function where it is used. |
| macro_rules! get_char ( ($me:expr, $input:expr) => ( |
| unwrap_or_return!($me.get_char($input), ProcessResult::Suspend) |
| )); |
| |
| macro_rules! peek ( ($me:expr, $input:expr) => ( |
| unwrap_or_return!($me.peek($input), ProcessResult::Suspend) |
| )); |
| |
| macro_rules! pop_except_from ( ($me:expr, $input:expr, $set:expr) => ( |
| unwrap_or_return!($me.pop_except_from($input, $set), ProcessResult::Suspend) |
| )); |
| |
| macro_rules! eat ( ($me:expr, $input:expr, $pat:expr) => ( |
| unwrap_or_return!($me.eat($input, $pat, u8::eq_ignore_ascii_case), ProcessResult::Suspend) |
| )); |
| |
| macro_rules! eat_exact ( ($me:expr, $input:expr, $pat:expr) => ( |
| unwrap_or_return!($me.eat($input, $pat, u8::eq), ProcessResult::Suspend) |
| )); |
| |
| impl<Sink: TokenSink> Tokenizer<Sink> { |
| // Run one step of the state machine. Returns ProcessResult::Continue if |
| // we should be immediately re-invoked |
| // (this just simplifies control flow vs. break / continue). |
| #[allow(clippy::never_loop)] |
| fn step(&mut self, input: &mut BufferQueue) -> ProcessResult<Sink::Handle> { |
| if self.char_ref_tokenizer.is_some() { |
| return self.step_char_ref_tokenizer(input); |
| } |
| |
| trace!("processing in state {:?}", self.state); |
| match self.state { |
| //§ data-state |
| states::Data => loop { |
| match pop_except_from!(self, input, small_char_set!('\r' '\0' '&' '<' '\n')) { |
| FromSet('\0') => go!(self: error; emit '\0'), |
| FromSet('&') => go!(self: consume_char_ref), |
| FromSet('<') => go!(self: to TagOpen), |
| FromSet(c) => go!(self: emit c), |
| NotFromSet(b) => self.emit_chars(b), |
| } |
| }, |
| |
| //§ rcdata-state |
| states::RawData(Rcdata) => loop { |
| match pop_except_from!(self, input, small_char_set!('\r' '\0' '&' '<' '\n')) { |
| FromSet('\0') => go!(self: error; emit '\u{fffd}'), |
| FromSet('&') => go!(self: consume_char_ref), |
| FromSet('<') => go!(self: to RawLessThanSign Rcdata), |
| FromSet(c) => go!(self: emit c), |
| NotFromSet(b) => self.emit_chars(b), |
| } |
| }, |
| |
| //§ rawtext-state |
| states::RawData(Rawtext) => loop { |
| match pop_except_from!(self, input, small_char_set!('\r' '\0' '<' '\n')) { |
| FromSet('\0') => go!(self: error; emit '\u{fffd}'), |
| FromSet('<') => go!(self: to RawLessThanSign Rawtext), |
| FromSet(c) => go!(self: emit c), |
| NotFromSet(b) => self.emit_chars(b), |
| } |
| }, |
| |
| //§ script-data-state |
| states::RawData(ScriptData) => loop { |
| match pop_except_from!(self, input, small_char_set!('\r' '\0' '<' '\n')) { |
| FromSet('\0') => go!(self: error; emit '\u{fffd}'), |
| FromSet('<') => go!(self: to RawLessThanSign ScriptData), |
| FromSet(c) => go!(self: emit c), |
| NotFromSet(b) => self.emit_chars(b), |
| } |
| }, |
| |
| //§ script-data-escaped-state |
| states::RawData(ScriptDataEscaped(Escaped)) => loop { |
| match pop_except_from!(self, input, small_char_set!('\r' '\0' '-' '<' '\n')) { |
| FromSet('\0') => go!(self: error; emit '\u{fffd}'), |
| FromSet('-') => go!(self: emit '-'; to ScriptDataEscapedDash Escaped), |
| FromSet('<') => go!(self: to RawLessThanSign ScriptDataEscaped Escaped), |
| FromSet(c) => go!(self: emit c), |
| NotFromSet(b) => self.emit_chars(b), |
| } |
| }, |
| |
| //§ script-data-double-escaped-state |
| states::RawData(ScriptDataEscaped(DoubleEscaped)) => loop { |
| match pop_except_from!(self, input, small_char_set!('\r' '\0' '-' '<' '\n')) { |
| FromSet('\0') => go!(self: error; emit '\u{fffd}'), |
| FromSet('-') => go!(self: emit '-'; to ScriptDataEscapedDash DoubleEscaped), |
| FromSet('<') => { |
| go!(self: emit '<'; to RawLessThanSign ScriptDataEscaped DoubleEscaped) |
| }, |
| FromSet(c) => go!(self: emit c), |
| NotFromSet(b) => self.emit_chars(b), |
| } |
| }, |
| |
| //§ plaintext-state |
| states::Plaintext => loop { |
| match pop_except_from!(self, input, small_char_set!('\r' '\0' '\n')) { |
| FromSet('\0') => go!(self: error; emit '\u{fffd}'), |
| FromSet(c) => go!(self: emit c), |
| NotFromSet(b) => self.emit_chars(b), |
| } |
| }, |
| |
| //§ tag-open-state |
| states::TagOpen => loop { |
| match get_char!(self, input) { |
| '!' => go!(self: clear_temp; to MarkupDeclarationOpen), |
| '/' => go!(self: to EndTagOpen), |
| '?' => go!(self: error; clear_comment; push_comment '?'; to BogusComment), |
| c => match lower_ascii_letter(c) { |
| Some(cl) => go!(self: create_tag StartTag cl; to TagName), |
| None => go!(self: error; emit '<'; reconsume Data), |
| }, |
| } |
| }, |
| |
| //§ end-tag-open-state |
| states::EndTagOpen => loop { |
| match get_char!(self, input) { |
| '>' => go!(self: error; to Data), |
| '\0' => { |
| go!(self: error; clear_comment; push_comment '\u{fffd}'; to BogusComment) |
| }, |
| c => match lower_ascii_letter(c) { |
| Some(cl) => go!(self: create_tag EndTag cl; to TagName), |
| None => go!(self: error; clear_comment; push_comment c; to BogusComment), |
| }, |
| } |
| }, |
| |
| //§ tag-name-state |
| states::TagName => loop { |
| match get_char!(self, input) { |
| '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeAttributeName), |
| '/' => go!(self: to SelfClosingStartTag), |
| '>' => go!(self: emit_tag Data), |
| '\0' => go!(self: error; push_tag '\u{fffd}'), |
| c => go!(self: push_tag (c.to_ascii_lowercase())), |
| } |
| }, |
| |
| //§ script-data-escaped-less-than-sign-state |
| states::RawLessThanSign(ScriptDataEscaped(Escaped)) => loop { |
| match get_char!(self, input) { |
| '/' => go!(self: clear_temp; to RawEndTagOpen ScriptDataEscaped Escaped), |
| c => match lower_ascii_letter(c) { |
| Some(cl) => go!(self: clear_temp; push_temp cl; emit '<'; emit c; |
| to ScriptDataEscapeStart DoubleEscaped), |
| None => go!(self: emit '<'; reconsume RawData ScriptDataEscaped Escaped), |
| }, |
| } |
| }, |
| |
| //§ script-data-double-escaped-less-than-sign-state |
| states::RawLessThanSign(ScriptDataEscaped(DoubleEscaped)) => loop { |
| match get_char!(self, input) { |
| '/' => go!(self: clear_temp; emit '/'; to ScriptDataDoubleEscapeEnd), |
| _ => go!(self: reconsume RawData ScriptDataEscaped DoubleEscaped), |
| } |
| }, |
| |
| //§ rcdata-less-than-sign-state rawtext-less-than-sign-state script-data-less-than-sign-state |
| // otherwise |
| states::RawLessThanSign(kind) => loop { |
| match get_char!(self, input) { |
| '/' => go!(self: clear_temp; to RawEndTagOpen kind), |
| '!' if kind == ScriptData => { |
| go!(self: emit '<'; emit '!'; to ScriptDataEscapeStart Escaped) |
| }, |
| _ => go!(self: emit '<'; reconsume RawData kind), |
| } |
| }, |
| |
| //§ rcdata-end-tag-open-state rawtext-end-tag-open-state script-data-end-tag-open-state script-data-escaped-end-tag-open-state |
| states::RawEndTagOpen(kind) => loop { |
| let c = get_char!(self, input); |
| match lower_ascii_letter(c) { |
| Some(cl) => go!(self: create_tag EndTag cl; push_temp c; to RawEndTagName kind), |
| None => go!(self: emit '<'; emit '/'; reconsume RawData kind), |
| } |
| }, |
| |
| //§ rcdata-end-tag-name-state rawtext-end-tag-name-state script-data-end-tag-name-state script-data-escaped-end-tag-name-state |
| states::RawEndTagName(kind) => loop { |
| let c = get_char!(self, input); |
| if self.have_appropriate_end_tag() { |
| match c { |
| '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeAttributeName), |
| '/' => go!(self: to SelfClosingStartTag), |
| '>' => go!(self: emit_tag Data), |
| _ => (), |
| } |
| } |
| |
| match lower_ascii_letter(c) { |
| Some(cl) => go!(self: push_tag cl; push_temp c), |
| None => { |
| go!(self: discard_tag; emit '<'; emit '/'; emit_temp; reconsume RawData kind) |
| }, |
| } |
| }, |
| |
| //§ script-data-double-escape-start-state |
| states::ScriptDataEscapeStart(DoubleEscaped) => loop { |
| let c = get_char!(self, input); |
| match c { |
| '\t' | '\n' | '\x0C' | ' ' | '/' | '>' => { |
| let esc = if &*self.temp_buf == "script" { |
| DoubleEscaped |
| } else { |
| Escaped |
| }; |
| go!(self: emit c; to RawData ScriptDataEscaped esc); |
| }, |
| _ => match lower_ascii_letter(c) { |
| Some(cl) => go!(self: push_temp cl; emit c), |
| None => go!(self: reconsume RawData ScriptDataEscaped Escaped), |
| }, |
| } |
| }, |
| |
| //§ script-data-escape-start-state |
| states::ScriptDataEscapeStart(Escaped) => loop { |
| match get_char!(self, input) { |
| '-' => go!(self: emit '-'; to ScriptDataEscapeStartDash), |
| _ => go!(self: reconsume RawData ScriptData), |
| } |
| }, |
| |
| //§ script-data-escape-start-dash-state |
| states::ScriptDataEscapeStartDash => loop { |
| match get_char!(self, input) { |
| '-' => go!(self: emit '-'; to ScriptDataEscapedDashDash Escaped), |
| _ => go!(self: reconsume RawData ScriptData), |
| } |
| }, |
| |
| //§ script-data-escaped-dash-state script-data-double-escaped-dash-state |
| states::ScriptDataEscapedDash(kind) => loop { |
| match get_char!(self, input) { |
| '-' => go!(self: emit '-'; to ScriptDataEscapedDashDash kind), |
| '<' => { |
| if kind == DoubleEscaped { |
| go!(self: emit '<'); |
| } |
| go!(self: to RawLessThanSign ScriptDataEscaped kind); |
| }, |
| '\0' => go!(self: error; emit '\u{fffd}'; to RawData ScriptDataEscaped kind), |
| c => go!(self: emit c; to RawData ScriptDataEscaped kind), |
| } |
| }, |
| |
| //§ script-data-escaped-dash-dash-state script-data-double-escaped-dash-dash-state |
| states::ScriptDataEscapedDashDash(kind) => loop { |
| match get_char!(self, input) { |
| '-' => go!(self: emit '-'), |
| '<' => { |
| if kind == DoubleEscaped { |
| go!(self: emit '<'); |
| } |
| go!(self: to RawLessThanSign ScriptDataEscaped kind); |
| }, |
| '>' => go!(self: emit '>'; to RawData ScriptData), |
| '\0' => go!(self: error; emit '\u{fffd}'; to RawData ScriptDataEscaped kind), |
| c => go!(self: emit c; to RawData ScriptDataEscaped kind), |
| } |
| }, |
| |
| //§ script-data-double-escape-end-state |
| states::ScriptDataDoubleEscapeEnd => loop { |
| let c = get_char!(self, input); |
| match c { |
| '\t' | '\n' | '\x0C' | ' ' | '/' | '>' => { |
| let esc = if &*self.temp_buf == "script" { |
| Escaped |
| } else { |
| DoubleEscaped |
| }; |
| go!(self: emit c; to RawData ScriptDataEscaped esc); |
| }, |
| _ => match lower_ascii_letter(c) { |
| Some(cl) => go!(self: push_temp cl; emit c), |
| None => go!(self: reconsume RawData ScriptDataEscaped DoubleEscaped), |
| }, |
| } |
| }, |
| |
| //§ before-attribute-name-state |
| states::BeforeAttributeName => loop { |
| match get_char!(self, input) { |
| '\t' | '\n' | '\x0C' | ' ' => (), |
| '/' => go!(self: to SelfClosingStartTag), |
| '>' => go!(self: emit_tag Data), |
| '\0' => go!(self: error; create_attr '\u{fffd}'; to AttributeName), |
| c => match lower_ascii_letter(c) { |
| Some(cl) => go!(self: create_attr cl; to AttributeName), |
| None => { |
| go_match!(self: c, |
| '"' , '\'' , '<' , '=' => error); |
| go!(self: create_attr c; to AttributeName); |
| }, |
| }, |
| } |
| }, |
| |
| //§ attribute-name-state |
| states::AttributeName => loop { |
| match get_char!(self, input) { |
| '\t' | '\n' | '\x0C' | ' ' => go!(self: to AfterAttributeName), |
| '/' => go!(self: to SelfClosingStartTag), |
| '=' => go!(self: to BeforeAttributeValue), |
| '>' => go!(self: emit_tag Data), |
| '\0' => go!(self: error; push_name '\u{fffd}'), |
| c => match lower_ascii_letter(c) { |
| Some(cl) => go!(self: push_name cl), |
| None => { |
| go_match!(self: c, |
| '"' , '\'' , '<' => error); |
| go!(self: push_name c); |
| }, |
| }, |
| } |
| }, |
| |
| //§ after-attribute-name-state |
| states::AfterAttributeName => loop { |
| match get_char!(self, input) { |
| '\t' | '\n' | '\x0C' | ' ' => (), |
| '/' => go!(self: to SelfClosingStartTag), |
| '=' => go!(self: to BeforeAttributeValue), |
| '>' => go!(self: emit_tag Data), |
| '\0' => go!(self: error; create_attr '\u{fffd}'; to AttributeName), |
| c => match lower_ascii_letter(c) { |
| Some(cl) => go!(self: create_attr cl; to AttributeName), |
| None => { |
| go_match!(self: c, |
| '"' , '\'' , '<' => error); |
| go!(self: create_attr c; to AttributeName); |
| }, |
| }, |
| } |
| }, |
| |
| //§ before-attribute-value-state |
| // Use peek so we can handle the first attr character along with the rest, |
| // hopefully in the same zero-copy buffer. |
| states::BeforeAttributeValue => loop { |
| match peek!(self, input) { |
| '\t' | '\n' | '\r' | '\x0C' | ' ' => go!(self: discard_char input), |
| '"' => go!(self: discard_char input; to AttributeValue DoubleQuoted), |
| '\'' => go!(self: discard_char input; to AttributeValue SingleQuoted), |
| '\0' => { |
| go!(self: discard_char input; error; push_value '\u{fffd}'; to AttributeValue Unquoted) |
| }, |
| '>' => go!(self: discard_char input; error; emit_tag Data), |
| _ => go!(self: to AttributeValue Unquoted), |
| } |
| }, |
| |
| //§ attribute-value-(double-quoted)-state |
| states::AttributeValue(DoubleQuoted) => loop { |
| match pop_except_from!(self, input, small_char_set!('\r' '"' '&' '\0' '\n')) { |
| FromSet('"') => go!(self: to AfterAttributeValueQuoted), |
| FromSet('&') => go!(self: consume_char_ref '"'), |
| FromSet('\0') => go!(self: error; push_value '\u{fffd}'), |
| FromSet(c) => go!(self: push_value c), |
| NotFromSet(ref b) => go!(self: append_value b), |
| } |
| }, |
| |
| //§ attribute-value-(single-quoted)-state |
| states::AttributeValue(SingleQuoted) => loop { |
| match pop_except_from!(self, input, small_char_set!('\r' '\'' '&' '\0' '\n')) { |
| FromSet('\'') => go!(self: to AfterAttributeValueQuoted), |
| FromSet('&') => go!(self: consume_char_ref '\''), |
| FromSet('\0') => go!(self: error; push_value '\u{fffd}'), |
| FromSet(c) => go!(self: push_value c), |
| NotFromSet(ref b) => go!(self: append_value b), |
| } |
| }, |
| |
| //§ attribute-value-(unquoted)-state |
| states::AttributeValue(Unquoted) => loop { |
| match pop_except_from!( |
| self, |
| input, |
| small_char_set!('\r' '\t' '\n' '\x0C' ' ' '&' '>' '\0') |
| ) { |
| FromSet('\t') | FromSet('\n') | FromSet('\x0C') | FromSet(' ') => { |
| go!(self: to BeforeAttributeName) |
| }, |
| FromSet('&') => go!(self: consume_char_ref '>'), |
| FromSet('>') => go!(self: emit_tag Data), |
| FromSet('\0') => go!(self: error; push_value '\u{fffd}'), |
| FromSet(c) => { |
| go_match!(self: c, |
| '"' , '\'' , '<' , '=' , '`' => error); |
| go!(self: push_value c); |
| }, |
| NotFromSet(ref b) => go!(self: append_value b), |
| } |
| }, |
| |
| //§ after-attribute-value-(quoted)-state |
| states::AfterAttributeValueQuoted => loop { |
| match get_char!(self, input) { |
| '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeAttributeName), |
| '/' => go!(self: to SelfClosingStartTag), |
| '>' => go!(self: emit_tag Data), |
| _ => go!(self: error; reconsume BeforeAttributeName), |
| } |
| }, |
| |
| //§ self-closing-start-tag-state |
| states::SelfClosingStartTag => loop { |
| match get_char!(self, input) { |
| '>' => { |
| self.current_tag_self_closing = true; |
| go!(self: emit_tag Data); |
| }, |
| _ => go!(self: error; reconsume BeforeAttributeName), |
| } |
| }, |
| |
| //§ comment-start-state |
| states::CommentStart => loop { |
| match get_char!(self, input) { |
| '-' => go!(self: to CommentStartDash), |
| '\0' => go!(self: error; push_comment '\u{fffd}'; to Comment), |
| '>' => go!(self: error; emit_comment; to Data), |
| c => go!(self: push_comment c; to Comment), |
| } |
| }, |
| |
| //§ comment-start-dash-state |
| states::CommentStartDash => loop { |
| match get_char!(self, input) { |
| '-' => go!(self: to CommentEnd), |
| '\0' => go!(self: error; append_comment "-\u{fffd}"; to Comment), |
| '>' => go!(self: error; emit_comment; to Data), |
| c => go!(self: push_comment '-'; push_comment c; to Comment), |
| } |
| }, |
| |
| //§ comment-state |
| states::Comment => loop { |
| match get_char!(self, input) { |
| '-' => go!(self: to CommentEndDash), |
| '\0' => go!(self: error; push_comment '\u{fffd}'), |
| c => go!(self: push_comment c), |
| } |
| }, |
| |
| //§ comment-end-dash-state |
| states::CommentEndDash => loop { |
| match get_char!(self, input) { |
| '-' => go!(self: to CommentEnd), |
| '\0' => go!(self: error; append_comment "-\u{fffd}"; to Comment), |
| c => go!(self: push_comment '-'; push_comment c; to Comment), |
| } |
| }, |
| |
| //§ comment-end-state |
| states::CommentEnd => loop { |
| match get_char!(self, input) { |
| '>' => go!(self: emit_comment; to Data), |
| '\0' => go!(self: error; append_comment "--\u{fffd}"; to Comment), |
| '!' => go!(self: error; to CommentEndBang), |
| '-' => go!(self: error; push_comment '-'), |
| c => go!(self: error; append_comment "--"; push_comment c; to Comment), |
| } |
| }, |
| |
| //§ comment-end-bang-state |
| states::CommentEndBang => loop { |
| match get_char!(self, input) { |
| '-' => go!(self: append_comment "--!"; to CommentEndDash), |
| '>' => go!(self: emit_comment; to Data), |
| '\0' => go!(self: error; append_comment "--!\u{fffd}"; to Comment), |
| c => go!(self: append_comment "--!"; push_comment c; to Comment), |
| } |
| }, |
| |
| //§ doctype-state |
| states::Doctype => loop { |
| match get_char!(self, input) { |
| '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeName), |
| _ => go!(self: error; reconsume BeforeDoctypeName), |
| } |
| }, |
| |
| //§ before-doctype-name-state |
| states::BeforeDoctypeName => loop { |
| match get_char!(self, input) { |
| '\t' | '\n' | '\x0C' | ' ' => (), |
| '\0' => { |
| go!(self: error; create_doctype; push_doctype_name '\u{fffd}'; to DoctypeName) |
| }, |
| '>' => go!(self: error; create_doctype; force_quirks; emit_doctype; to Data), |
| c => go!(self: create_doctype; push_doctype_name (c.to_ascii_lowercase()); |
| to DoctypeName), |
| } |
| }, |
| |
| //§ doctype-name-state |
| states::DoctypeName => loop { |
| match get_char!(self, input) { |
| '\t' | '\n' | '\x0C' | ' ' => go!(self: clear_temp; to AfterDoctypeName), |
| '>' => go!(self: emit_doctype; to Data), |
| '\0' => go!(self: error; push_doctype_name '\u{fffd}'), |
| c => go!(self: push_doctype_name (c.to_ascii_lowercase())), |
| } |
| }, |
| |
| //§ after-doctype-name-state |
| states::AfterDoctypeName => loop { |
| if eat!(self, input, "public") { |
| go!(self: to AfterDoctypeKeyword Public); |
| } else if eat!(self, input, "system") { |
| go!(self: to AfterDoctypeKeyword System); |
| } else { |
| match get_char!(self, input) { |
| '\t' | '\n' | '\x0C' | ' ' => (), |
| '>' => go!(self: emit_doctype; to Data), |
| _ => go!(self: error; force_quirks; to BogusDoctype), |
| } |
| } |
| }, |
| |
| //§ after-doctype-public-keyword-state after-doctype-system-keyword-state |
| states::AfterDoctypeKeyword(kind) => loop { |
| match get_char!(self, input) { |
| '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeIdentifier kind), |
| '"' => { |
| go!(self: error; clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind) |
| }, |
| '\'' => { |
| go!(self: error; clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind) |
| }, |
| '>' => go!(self: error; force_quirks; emit_doctype; to Data), |
| _ => go!(self: error; force_quirks; to BogusDoctype), |
| } |
| }, |
| |
| //§ before-doctype-public-identifier-state before-doctype-system-identifier-state |
| states::BeforeDoctypeIdentifier(kind) => loop { |
| match get_char!(self, input) { |
| '\t' | '\n' | '\x0C' | ' ' => (), |
| '"' => go!(self: clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind), |
| '\'' => go!(self: clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind), |
| '>' => go!(self: error; force_quirks; emit_doctype; to Data), |
| _ => go!(self: error; force_quirks; to BogusDoctype), |
| } |
| }, |
| |
| //§ doctype-public-identifier-(double-quoted)-state doctype-system-identifier-(double-quoted)-state |
| states::DoctypeIdentifierDoubleQuoted(kind) => loop { |
| match get_char!(self, input) { |
| '"' => go!(self: to AfterDoctypeIdentifier kind), |
| '\0' => go!(self: error; push_doctype_id kind '\u{fffd}'), |
| '>' => go!(self: error; force_quirks; emit_doctype; to Data), |
| c => go!(self: push_doctype_id kind c), |
| } |
| }, |
| |
| //§ doctype-public-identifier-(single-quoted)-state doctype-system-identifier-(single-quoted)-state |
| states::DoctypeIdentifierSingleQuoted(kind) => loop { |
| match get_char!(self, input) { |
| '\'' => go!(self: to AfterDoctypeIdentifier kind), |
| '\0' => go!(self: error; push_doctype_id kind '\u{fffd}'), |
| '>' => go!(self: error; force_quirks; emit_doctype; to Data), |
| c => go!(self: push_doctype_id kind c), |
| } |
| }, |
| |
| //§ after-doctype-public-identifier-state |
| states::AfterDoctypeIdentifier(Public) => loop { |
| match get_char!(self, input) { |
| '\t' | '\n' | '\x0C' | ' ' => { |
| go!(self: to BetweenDoctypePublicAndSystemIdentifiers) |
| }, |
| '>' => go!(self: emit_doctype; to Data), |
| '"' => { |
| go!(self: error; clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System) |
| }, |
| '\'' => { |
| go!(self: error; clear_doctype_id System; to DoctypeIdentifierSingleQuoted System) |
| }, |
| _ => go!(self: error; force_quirks; to BogusDoctype), |
| } |
| }, |
| |
| //§ after-doctype-system-identifier-state |
| states::AfterDoctypeIdentifier(System) => loop { |
| match get_char!(self, input) { |
| '\t' | '\n' | '\x0C' | ' ' => (), |
| '>' => go!(self: emit_doctype; to Data), |
| _ => go!(self: error; to BogusDoctype), |
| } |
| }, |
| |
| //§ between-doctype-public-and-system-identifiers-state |
| states::BetweenDoctypePublicAndSystemIdentifiers => loop { |
| match get_char!(self, input) { |
| '\t' | '\n' | '\x0C' | ' ' => (), |
| '>' => go!(self: emit_doctype; to Data), |
| '"' => { |
| go!(self: clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System) |
| }, |
| '\'' => { |
| go!(self: clear_doctype_id System; to DoctypeIdentifierSingleQuoted System) |
| }, |
| _ => go!(self: error; force_quirks; to BogusDoctype), |
| } |
| }, |
| |
| //§ bogus-doctype-state |
| states::BogusDoctype => loop { |
| match get_char!(self, input) { |
| '>' => go!(self: emit_doctype; to Data), |
| _ => (), |
| } |
| }, |
| |
| //§ bogus-comment-state |
| states::BogusComment => loop { |
| match get_char!(self, input) { |
| '>' => go!(self: emit_comment; to Data), |
| '\0' => go!(self: push_comment '\u{fffd}'), |
| c => go!(self: push_comment c), |
| } |
| }, |
| |
| //§ markup-declaration-open-state |
| states::MarkupDeclarationOpen => loop { |
| if eat_exact!(self, input, "--") { |
| go!(self: clear_comment; to CommentStart); |
| } else if eat!(self, input, "doctype") { |
| go!(self: to Doctype); |
| } else { |
| if self |
| .sink |
| .adjusted_current_node_present_but_not_in_html_namespace() |
| { |
| if eat_exact!(self, input, "[CDATA[") { |
| go!(self: clear_temp; to CdataSection); |
| } |
| } |
| go!(self: error; to BogusComment); |
| } |
| }, |
| |
| //§ cdata-section-state |
| states::CdataSection => loop { |
| match get_char!(self, input) { |
| ']' => go!(self: to CdataSectionBracket), |
| '\0' => go!(self: emit_temp; emit '\0'), |
| c => go!(self: push_temp c), |
| } |
| }, |
| |
| //§ cdata-section-bracket |
| states::CdataSectionBracket => match get_char!(self, input) { |
| ']' => go!(self: to CdataSectionEnd), |
| _ => go!(self: push_temp ']'; reconsume CdataSection), |
| }, |
| |
| //§ cdata-section-end |
| states::CdataSectionEnd => loop { |
| match get_char!(self, input) { |
| ']' => go!(self: push_temp ']'), |
| '>' => go!(self: emit_temp; to Data), |
| _ => go!(self: push_temp ']'; push_temp ']'; reconsume CdataSection), |
| } |
| }, |
| //§ END |
| } |
| } |
| |
| fn step_char_ref_tokenizer(&mut self, input: &mut BufferQueue) -> ProcessResult<Sink::Handle> { |
| // FIXME HACK: Take and replace the tokenizer so we don't |
| // double-mut-borrow self. This is why it's boxed. |
| let mut tok = self.char_ref_tokenizer.take().unwrap(); |
| let outcome = tok.step(self, input); |
| |
| let progress = match outcome { |
| char_ref::Done => { |
| self.process_char_ref(tok.get_result()); |
| return ProcessResult::Continue; |
| }, |
| |
| char_ref::Stuck => ProcessResult::Suspend, |
| char_ref::Progress => ProcessResult::Continue, |
| }; |
| |
| self.char_ref_tokenizer = Some(tok); |
| progress |
| } |
| |
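| // Emit the characters produced by the char ref sub-tokenizer. A reference |
| // that failed to parse (num_chars == 0) is emitted as a literal '&'. |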
| fn process_char_ref(&mut self, char_ref: CharRef) { |
| let CharRef { |
| mut chars, |
| mut num_chars, |
| } = char_ref; |
| |
| if num_chars == 0 { |
| chars[0] = '&'; |
| num_chars = 1; |
| } |
| |
| for i in 0..num_chars { |
| let c = chars[i as usize]; |
| match self.state { |
| states::Data | states::RawData(states::Rcdata) => go!(self: emit c), |
| |
| states::AttributeValue(_) => go!(self: push_value c), |
| |
| _ => panic!( |
| "state {:?} should not be reachable in process_char_ref", |
| self.state |
| ), |
| } |
| } |
| } |
| |
| /// Indicate that we have reached the end of the input. |
| pub fn end(&mut self) { |
| // Handle EOF in the char ref sub-tokenizer, if there is one. |
| // Do this first because it might un-consume stuff. |
| let mut input = BufferQueue::new(); |
| match self.char_ref_tokenizer.take() { |
| None => (), |
| Some(mut tok) => { |
| tok.end_of_file(self, &mut input); |
| self.process_char_ref(tok.get_result()); |
| }, |
| } |
| |
| // Process all remaining buffered input. |
| // If we're waiting for lookahead, we're not gonna get it. |
| self.at_eof = true; |
| assert!(matches!(self.run(&mut input), TokenizerResult::Done)); |
| assert!(input.is_empty()); |
| |
| loop { |
| match self.eof_step() { |
| ProcessResult::Continue => (), |
| ProcessResult::Suspend => break, |
| ProcessResult::Script(_) => unreachable!(), |
| } |
| } |
| |
| self.sink.end(); |
| |
| if self.opts.profile { |
| self.dump_profile(); |
| } |
| } |
| |
| fn dump_profile(&self) { |
| let mut results: Vec<(states::State, u64)> = |
| self.state_profile.iter().map(|(s, t)| (*s, *t)).collect(); |
| results.sort_by(|&(_, x), &(_, y)| y.cmp(&x)); |
| |
| let total: u64 = results.iter().map(|&(_, t)| t).sum(); |
| println!("\nTokenizer profile, in nanoseconds"); |
| println!("\n{:12} total in token sink", self.time_in_sink); |
| println!("\n{:12} total in tokenizer", total); |
| |
| for (k, v) in results.into_iter() { |
| let pct = 100.0 * (v as f64) / (total as f64); |
| println!("{:12} {:4.1}% {:?}", v, pct, k); |
| } |
| } |
| |
| fn eof_step(&mut self) -> ProcessResult<Sink::Handle> { |
| debug!("processing EOF in state {:?}", self.state); |
| match self.state { |
| states::Data | |
| states::RawData(Rcdata) | |
| states::RawData(Rawtext) | |
| states::RawData(ScriptData) | |
| states::Plaintext => go!(self: eof), |
| |
| states::TagName | |
| states::RawData(ScriptDataEscaped(_)) | |
| states::BeforeAttributeName | |
| states::AttributeName | |
| states::AfterAttributeName | |
| states::BeforeAttributeValue | |
| states::AttributeValue(_) | |
| states::AfterAttributeValueQuoted | |
| states::SelfClosingStartTag | |
| states::ScriptDataEscapedDash(_) | |
| states::ScriptDataEscapedDashDash(_) => go!(self: error_eof; to Data), |
| |
| states::TagOpen => go!(self: error_eof; emit '<'; to Data), |
| |
| states::EndTagOpen => go!(self: error_eof; emit '<'; emit '/'; to Data), |
| |
| states::RawLessThanSign(ScriptDataEscaped(DoubleEscaped)) => { |
| go!(self: to RawData ScriptDataEscaped DoubleEscaped) |
| }, |
| |
| states::RawLessThanSign(kind) => go!(self: emit '<'; to RawData kind), |
| |
| states::RawEndTagOpen(kind) => go!(self: emit '<'; emit '/'; to RawData kind), |
| |
| states::RawEndTagName(kind) => { |
| go!(self: emit '<'; emit '/'; emit_temp; to RawData kind) |
| }, |
| |
| states::ScriptDataEscapeStart(kind) => go!(self: to RawData ScriptDataEscaped kind), |
| |
| states::ScriptDataEscapeStartDash => go!(self: to RawData ScriptData), |
| |
| states::ScriptDataDoubleEscapeEnd => { |
| go!(self: to RawData ScriptDataEscaped DoubleEscaped) |
| }, |
| |
| states::CommentStart | |
| states::CommentStartDash | |
| states::Comment | |
| states::CommentEndDash | |
| states::CommentEnd | |
| states::CommentEndBang => go!(self: error_eof; emit_comment; to Data), |
| |
| states::Doctype | states::BeforeDoctypeName => { |
| go!(self: error_eof; create_doctype; force_quirks; emit_doctype; to Data) |
| }, |
| |
| states::DoctypeName | |
| states::AfterDoctypeName | |
| states::AfterDoctypeKeyword(_) | |
| states::BeforeDoctypeIdentifier(_) | |
| states::DoctypeIdentifierDoubleQuoted(_) | |
| states::DoctypeIdentifierSingleQuoted(_) | |
| states::AfterDoctypeIdentifier(_) | |
| states::BetweenDoctypePublicAndSystemIdentifiers => { |
| go!(self: error_eof; force_quirks; emit_doctype; to Data) |
| }, |
| |
| states::BogusDoctype => go!(self: emit_doctype; to Data), |
| |
| states::BogusComment => go!(self: emit_comment; to Data), |
| |
| states::MarkupDeclarationOpen => go!(self: error; to BogusComment), |
| |
| states::CdataSection => go!(self: emit_temp; error_eof; to Data), |
| |
| states::CdataSectionBracket => go!(self: push_temp ']'; to CdataSection), |
| |
| states::CdataSectionEnd => go!(self: push_temp ']'; push_temp ']'; to CdataSection), |
| } |
| } |
| } |
| |
| #[cfg(test)] |
| #[allow(non_snake_case)] |
| mod test { |
| use super::option_push; // private items |
| use crate::tendril::{SliceExt, StrTendril}; |
| |
| use super::{TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts}; |
| |
| use super::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError}; |
| use super::interface::{EndTag, StartTag, Tag, TagKind}; |
| use super::interface::{TagToken, Token}; |
| |
| use markup5ever::buffer_queue::BufferQueue; |
| use std::mem::replace; |
| |
| use crate::LocalName; |
| |
| // LinesMatch implements the TokenSink trait. The tests use it to check |
| // that current_line is updated as process_token is called. The lines |
| // vector records the line number on which each token was emitted. |
| struct LinesMatch { |
| tokens: Vec<Token>, |
| current_str: StrTendril, |
| lines: Vec<(Token, u64)>, |
| } |
| |
| impl LinesMatch { |
| fn new() -> LinesMatch { |
| LinesMatch { |
| tokens: vec![], |
| current_str: StrTendril::new(), |
| lines: vec![], |
| } |
| } |
| |
| fn push(&mut self, token: Token, line_number: u64) { |
| self.finish_str(); |
| self.lines.push((token, line_number)); |
| } |
| |
| fn finish_str(&mut self) { |
| if !self.current_str.is_empty() { |
| let s = replace(&mut self.current_str, StrTendril::new()); |
| self.tokens.push(CharacterTokens(s)); |
| } |
| } |
| } |
| |
| impl TokenSink for LinesMatch { |
| type Handle = (); |
| |
| fn process_token( |
| &mut self, |
| token: Token, |
| line_number: u64, |
| ) -> TokenSinkResult<Self::Handle> { |
| match token { |
| CharacterTokens(b) => { |
| self.current_str.push_slice(&b); |
| }, |
| |
| NullCharacterToken => { |
| self.current_str.push_char('\0'); |
| }, |
| |
| ParseError(_) => { |
| panic!("unexpected parse error"); |
| }, |
| |
| TagToken(mut t) => { |
| // The spec seems to indicate that one can emit |
| // erroneous end tags with attrs, but the test |
| // cases don't contain them. |
| match t.kind { |
| EndTag => { |
| t.self_closing = false; |
| t.attrs = vec![]; |
| }, |
| _ => t.attrs.sort_by(|a1, a2| a1.name.cmp(&a2.name)), |
| } |
| self.push(TagToken(t), line_number); |
| }, |
| |
| EOFToken => (), |
| |
| _ => self.push(token, line_number), |
| } |
| TokenSinkResult::Continue |
| } |
| } |
| |
| // Tokenize the input chunks and return each emitted token paired with |
| // the line number it appeared on. |
| fn tokenize(input: Vec<StrTendril>, opts: TokenizerOpts) -> Vec<(Token, u64)> { |
| let sink = LinesMatch::new(); |
| let mut tok = Tokenizer::new(sink, opts); |
| let mut buffer = BufferQueue::new(); |
| for chunk in input.into_iter() { |
| buffer.push_back(chunk); |
| let _ = tok.feed(&mut buffer); |
| } |
| tok.end(); |
| tok.sink.lines |
| } |
| |
| // Create a tag token |
| fn create_tag(token: StrTendril, tagkind: TagKind) -> Token { |
| let name = LocalName::from(&*token); |
| let token = TagToken(Tag { |
| kind: tagkind, |
| name, |
| self_closing: false, |
| attrs: vec![], |
| }); |
| token |
| } |
| |
| #[test] |
| fn push_to_None_gives_singleton() { |
| let mut s: Option<StrTendril> = None; |
| option_push(&mut s, 'x'); |
| assert_eq!(s, Some("x".to_tendril())); |
| } |
| |
| #[test] |
| fn push_to_empty_appends() { |
| let mut s: Option<StrTendril> = Some(StrTendril::new()); |
| option_push(&mut s, 'x'); |
| assert_eq!(s, Some("x".to_tendril())); |
| } |
| |
| #[test] |
| fn push_to_nonempty_appends() { |
| let mut s: Option<StrTendril> = Some(StrTendril::from_slice("y")); |
| option_push(&mut s, 'x'); |
| assert_eq!(s, Some("yx".to_tendril())); |
| } |
| |
| #[test] |
| fn check_lines() { |
| let opts = TokenizerOpts { |
| exact_errors: false, |
| discard_bom: true, |
| profile: false, |
| initial_state: None, |
| last_start_tag_name: None, |
| }; |
| let vector = vec![ |
| StrTendril::from("<a>\n"), |
| StrTendril::from("<b>\n"), |
| StrTendril::from("</b>\n"), |
| StrTendril::from("</a>\n"), |
| ]; |
| let expected = vec![ |
| (create_tag(StrTendril::from("a"), StartTag), 1), |
| (create_tag(StrTendril::from("b"), StartTag), 2), |
| (create_tag(StrTendril::from("b"), EndTag), 3), |
| (create_tag(StrTendril::from("a"), EndTag), 4), |
| ]; |
| let results = tokenize(vector, opts); |
| assert_eq!(results, expected); |
| } |
| |
| #[test] |
| fn check_lines_with_new_line() { |
| let opts = TokenizerOpts { |
| exact_errors: false, |
| discard_bom: true, |
| profile: false, |
| initial_state: None, |
| last_start_tag_name: None, |
| }; |
| let vector = vec![ |
| StrTendril::from("<a>\r\n"), |
| StrTendril::from("<b>\r\n"), |
| StrTendril::from("</b>\r\n"), |
| StrTendril::from("</a>\r\n"), |
| ]; |
| let expected = vec![ |
| (create_tag(StrTendril::from("a"), StartTag), 1), |
| (create_tag(StrTendril::from("b"), StartTag), 2), |
| (create_tag(StrTendril::from("b"), EndTag), 3), |
| (create_tag(StrTendril::from("a"), EndTag), 4), |
| ]; |
| let results = tokenize(vector, opts); |
| assert_eq!(results, expected); |
| } |
| } |