//! Contains a simple lexer for XML documents.
//!
//! This module is for internal use. Use the `xml::pull` module to do parsing.
| |
| use std::fmt; |
| use std::collections::VecDeque; |
| use std::io::Read; |
| use std::result; |
| use std::borrow::Cow; |
| |
| use common::{Position, TextPosition, is_whitespace_char, is_name_char}; |
| use reader::Error; |
| use util; |
| |
/// `Token` represents a single lexeme of an XML document. These lexemes
/// are used by the parser to perform the actual parsing.
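///
/// Tokens with a fixed representation render via `Display`; a quick sketch
/// (marked `ignore` since this module is internal):
///
/// ```ignore
/// assert_eq!(Token::CDataStart.to_string(), "<![CDATA[");
/// assert_eq!(Token::Character('x').to_string(), "x");
/// ```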
| #[derive(Copy, Clone, PartialEq, Eq, Debug)] |
| pub enum Token { |
| /// `<?` |
| ProcessingInstructionStart, |
| /// `?>` |
| ProcessingInstructionEnd, |
    /// `<!DOCTYPE`
| DoctypeStart, |
| /// `<` |
| OpeningTagStart, |
| /// `</` |
| ClosingTagStart, |
| /// `>` |
| TagEnd, |
| /// `/>` |
| EmptyTagEnd, |
| /// `<!--` |
| CommentStart, |
| /// `-->` |
| CommentEnd, |
    /// A chunk of characters, used for error recovery.
| Chunk(&'static str), |
| /// Any non-special character except whitespace. |
| Character(char), |
| /// Whitespace character. |
| Whitespace(char), |
| /// `=` |
| EqualsSign, |
| /// `'` |
| SingleQuote, |
| /// `"` |
| DoubleQuote, |
| /// `<![CDATA[` |
| CDataStart, |
| /// `]]>` |
| CDataEnd, |
| /// `&` |
| ReferenceStart, |
| /// `;` |
| ReferenceEnd, |
| } |
| |
| impl fmt::Display for Token { |
| fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
| match *self { |
| Token::Chunk(s) => write!(f, "{}", s), |
| Token::Character(c) | Token::Whitespace(c) => write!(f, "{}", c), |
| other => write!(f, "{}", match other { |
| Token::OpeningTagStart => "<", |
| Token::ProcessingInstructionStart => "<?", |
| Token::DoctypeStart => "<!DOCTYPE", |
| Token::ClosingTagStart => "</", |
| Token::CommentStart => "<!--", |
| Token::CDataStart => "<![CDATA[", |
| Token::TagEnd => ">", |
| Token::EmptyTagEnd => "/>", |
| Token::ProcessingInstructionEnd => "?>", |
| Token::CommentEnd => "-->", |
| Token::CDataEnd => "]]>", |
| Token::ReferenceStart => "&", |
| Token::ReferenceEnd => ";", |
| Token::EqualsSign => "=", |
| Token::SingleQuote => "'", |
| Token::DoubleQuote => "\"", |
| _ => unreachable!() |
| }) |
| } |
| } |
| } |
| |
| impl Token { |
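    /// Returns the corresponding string slice if this token has a fixed
    /// textual representation (including the contents of a `Chunk`),
    /// or `None` for `Character` and `Whitespace` tokens.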
| pub fn as_static_str(&self) -> Option<&'static str> { |
| match *self { |
| Token::OpeningTagStart => Some("<"), |
| Token::ProcessingInstructionStart => Some("<?"), |
| Token::DoctypeStart => Some("<!DOCTYPE"), |
| Token::ClosingTagStart => Some("</"), |
| Token::CommentStart => Some("<!--"), |
| Token::CDataStart => Some("<![CDATA["), |
| Token::TagEnd => Some(">"), |
| Token::EmptyTagEnd => Some("/>"), |
| Token::ProcessingInstructionEnd => Some("?>"), |
| Token::CommentEnd => Some("-->"), |
| Token::CDataEnd => Some("]]>"), |
| Token::ReferenceStart => Some("&"), |
| Token::ReferenceEnd => Some(";"), |
| Token::EqualsSign => Some("="), |
| Token::SingleQuote => Some("'"), |
| Token::DoubleQuote => Some("\""), |
| Token::Chunk(s) => Some(s), |
| _ => None |
| } |
| } |
| |
    /// Appends this token's textual representation to `target`.
    // Using `target.push_str(&token.to_string())` is simply way too slow.
    pub fn push_to_string(&self, target: &mut String) {
| match self.as_static_str() { |
| Some(s) => { target.push_str(s); } |
| None => { |
| match *self { |
| Token::Character(c) | Token::Whitespace(c) => target.push(c), |
| _ => unreachable!() |
| } |
| } |
| } |
| } |
| |
    /// Returns `true` if this token contains data that can be interpreted
    /// as part of character data. Perhaps surprisingly, this includes `>`,
    /// `=`, `"`, `'`, `-->`, `]]>`, `?>` and `/>`, which are only special
    /// in certain contexts.
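    ///
    /// For example, in `<a>x = 1</a>` the `=` and the spaces between the tags
    /// are ordinary character data, so `Token::EqualsSign.contains_char_data()`
    /// returns `true`.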
| #[inline] |
| pub fn contains_char_data(&self) -> bool { |
| match *self { |
| Token::Whitespace(_) | Token::Chunk(_) | Token::Character(_) | Token::CommentEnd | |
| Token::TagEnd | Token::EqualsSign | Token::DoubleQuote | Token::SingleQuote | Token::CDataEnd | |
| Token::ProcessingInstructionEnd | Token::EmptyTagEnd => true, |
| _ => false |
| } |
| } |
| |
| /// Returns `true` if this token corresponds to a white space character. |
| #[inline] |
| pub fn is_whitespace(&self) -> bool { |
| match *self { |
| Token::Whitespace(_) => true, |
| _ => false |
| } |
| } |
| } |
| |
/// Lexer state machine states, mostly named after the input that triggers them.
enum State {
| /// Triggered on '<' |
| TagStarted, |
| /// Triggered on '<!' |
| CommentOrCDataOrDoctypeStarted, |
| /// Triggered on '<!-' |
| CommentStarted, |
| /// Triggered on '<!D' up to '<!DOCTYPE' |
| DoctypeStarted(DoctypeStartedSubstate), |
| /// Triggered after DoctypeStarted to handle sub elements |
| DoctypeFinishing(u8), |
| /// Triggered on '<![' up to '<![CDATA' |
| CDataStarted(CDataStartedSubstate), |
| /// Triggered on '?' |
| ProcessingInstructionClosing, |
| /// Triggered on '/' |
| EmptyTagClosing, |
| /// Triggered on '-' up to '--' |
| CommentClosing(ClosingSubstate), |
| /// Triggered on ']' up to ']]' |
| CDataClosing(ClosingSubstate), |
| /// Default state |
| Normal |
| } |
| |
| #[derive(Copy, Clone)] |
| enum ClosingSubstate { |
| First, Second |
| } |
| |
| #[derive(Copy, Clone)] |
| enum DoctypeStartedSubstate { |
| D, DO, DOC, DOCT, DOCTY, DOCTYP |
| } |
| |
| #[derive(Copy, Clone)] |
| enum CDataStartedSubstate { |
| E, C, CD, CDA, CDAT, CDATA |
| } |
| |
/// `Result` represents a lexing result. It is either a token, `None`
/// (which signals end of stream), or an error.
pub type Result = result::Result<Option<Token>, Error>;
| |
/// Helps to set up a dispatch table for lexing large unambiguous tokens like
/// `<![CDATA[` or `<!DOCTYPE`.
| macro_rules! dispatch_on_enum_state( |
| ($_self:ident, $s:expr, $c:expr, $is:expr, |
| $($st:ident; $stc:expr ; $next_st:ident ; $chunk:expr),+; |
| $end_st:ident ; $end_c:expr ; $end_chunk:expr ; $e:expr) => ( |
| match $s { |
| $( |
| $st => match $c { |
| $stc => $_self.move_to($is($next_st)), |
| _ => $_self.handle_error($chunk, $c) |
| }, |
| )+ |
| $end_st => match $c { |
| $end_c => $e, |
| _ => $_self.handle_error($end_chunk, $c) |
| } |
| } |
| ) |
| ); |
| |
/// `Lexer` is a lexer for XML documents which implements a pull API.
///
/// The main method is `next_token`, which accepts a `std::io::Read` instance
/// and tries to read the next lexeme from it.
///
/// When the `skip_errors` flag is set, invalid lexemes are returned as `Chunk`s.
/// When it is not set, errors are reported as `Err` objects with a string message.
/// By default this flag is not set. Use the `enable_errors` and `disable_errors`
/// methods to toggle this behavior.
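///
/// A minimal sketch of the error-recovery behavior (illustrative only, since
/// this lexer is internal to the crate):
///
/// ```ignore
/// let mut lexer = Lexer::new();
/// lexer.disable_errors();
/// // With errors disabled, an invalid lexeme like `<!x` is returned as
/// // `Token::Chunk("<!")` followed by `Token::Character('x')` instead of `Err`.
/// ```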
| pub struct Lexer { |
    pos: TextPosition,           // position of the last token's start
    head_pos: TextPosition,      // position of the current read head
    char_queue: VecDeque<char>,  // characters pushed back to be re-read
    st: State,                   // current state of the state machine
    skip_errors: bool,           // when set, invalid lexemes become `Chunk`s
    inside_comment: bool,        // when set, a bare `--` is a hard error
    inside_token: bool,          // set while a token is still being read
    eof_handled: bool            // set once end of stream has been reported
| } |
| |
| impl Position for Lexer { |
| #[inline] |
    /// Returns the position of the last token produced by the lexer.
| fn position(&self) -> TextPosition { self.pos } |
| } |
| |
| impl Lexer { |
| /// Returns a new lexer with default state. |
| pub fn new() -> Lexer { |
| Lexer { |
| pos: TextPosition::new(), |
| head_pos: TextPosition::new(), |
| char_queue: VecDeque::with_capacity(4), // TODO: check size |
| st: State::Normal, |
| skip_errors: false, |
| inside_comment: false, |
| inside_token: false, |
| eof_handled: false |
| } |
| } |
| |
    /// Enables error handling so `next_token` will return `Err(..)`
    /// upon an invalid lexeme.
| #[inline] |
| pub fn enable_errors(&mut self) { self.skip_errors = false; } |
| |
    /// Disables error handling so `next_token` will return `Ok(Some(Chunk(..)))`
    /// containing the invalid lexeme's content upon encountering one.
| #[inline] |
| pub fn disable_errors(&mut self) { self.skip_errors = true; } |
| |
    /// Enables special handling of some lexemes which is needed when parsing
    /// comment internals (in particular, a bare `--` becomes a hard error).
| #[inline] |
| pub fn inside_comment(&mut self) { self.inside_comment = true; } |
| |
| /// Disables the effect of `inside_comment()` method. |
| #[inline] |
| pub fn outside_comment(&mut self) { self.inside_comment = false; } |
| |
    /// Resets the lexer's EOF-handled flag so that more input can be read.
| #[inline] |
| pub fn reset_eof_handled(&mut self) { self.eof_handled = false; } |
| |
    /// Tries to read the next token from the buffer.
    ///
    /// It is possible to pass different instances of `Read` each time
    /// this method is called, but the resulting behavior is undefined in this case.
    ///
    /// Return value:
    /// * `Err(reason) where reason: reader::Error` - when an error occurs;
    /// * `Ok(None)` - when the end of the stream is reached;
    /// * `Ok(Some(token)) where token: Token` - when a complete token has been read from the stream.
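    ///
    /// A typical pull loop looks like this (a sketch; error handling elided):
    ///
    /// ```ignore
    /// let mut lexer = Lexer::new();
    /// let mut reader = std::io::Cursor::new(b"<a/>".to_vec());
    /// while let Some(token) = lexer.next_token(&mut reader)? {
    ///     println!("{}", token);
    /// }
    /// ```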
| pub fn next_token<B: Read>(&mut self, b: &mut B) -> Result { |
| // Already reached end of buffer |
| if self.eof_handled { |
| return Ok(None); |
| } |
| |
| if !self.inside_token { |
| self.pos = self.head_pos; |
| self.inside_token = true; |
| } |
| |
| // Check if we have saved a char or two for ourselves |
| while let Some(c) = self.char_queue.pop_front() { |
| match try!(self.read_next_token(c)) { |
| Some(t) => { |
| self.inside_token = false; |
| return Ok(Some(t)); |
| } |
| None => {} // continue |
| } |
| } |
| |
| loop { |
| // TODO: this should handle multiple encodings |
| let c = match try!(util::next_char_from(b)) { |
| Some(c) => c, // got next char |
| None => break, // nothing to read left |
| }; |
| |
| match try!(self.read_next_token(c)) { |
| Some(t) => { |
| self.inside_token = false; |
| return Ok(Some(t)); |
| } |
| None => { |
| // continue |
| } |
| } |
| } |
| |
| // Handle end of stream |
| self.eof_handled = true; |
| self.pos = self.head_pos; |
| match self.st { |
| State::TagStarted | State::CommentOrCDataOrDoctypeStarted | |
| State::CommentStarted | State::CDataStarted(_)| State::DoctypeStarted(_) | |
| State::CommentClosing(ClosingSubstate::Second) | |
| State::DoctypeFinishing(_) => |
| Err(self.error("Unexpected end of stream")), |
| State::ProcessingInstructionClosing => |
| Ok(Some(Token::Character('?'))), |
| State::EmptyTagClosing => |
| Ok(Some(Token::Character('/'))), |
| State::CommentClosing(ClosingSubstate::First) => |
| Ok(Some(Token::Character('-'))), |
| State::CDataClosing(ClosingSubstate::First) => |
| Ok(Some(Token::Character(']'))), |
| State::CDataClosing(ClosingSubstate::Second) => |
| Ok(Some(Token::Chunk("]]"))), |
| State::Normal => |
| Ok(None) |
| } |
| } |
| |
| #[inline] |
| fn error<M: Into<Cow<'static, str>>>(&self, msg: M) -> Error { |
| (self, msg).into() |
| } |
| |
| #[inline] |
| fn read_next_token(&mut self, c: char) -> Result { |
| let res = self.dispatch_char(c); |
| if self.char_queue.is_empty() { |
| if c == '\n' { |
| self.head_pos.new_line(); |
| } else { |
| self.head_pos.advance(1); |
| } |
| } |
| res |
| } |
| |
| fn dispatch_char(&mut self, c: char) -> Result { |
| match self.st { |
| State::Normal => self.normal(c), |
| State::TagStarted => self.tag_opened(c), |
| State::CommentOrCDataOrDoctypeStarted => self.comment_or_cdata_or_doctype_started(c), |
| State::CommentStarted => self.comment_started(c), |
| State::CDataStarted(s) => self.cdata_started(c, s), |
| State::DoctypeStarted(s) => self.doctype_started(c, s), |
| State::DoctypeFinishing(d) => self.doctype_finishing(c, d), |
| State::ProcessingInstructionClosing => self.processing_instruction_closing(c), |
| State::EmptyTagClosing => self.empty_element_closing(c), |
| State::CommentClosing(s) => self.comment_closing(c, s), |
| State::CDataClosing(s) => self.cdata_closing(c, s) |
| } |
| } |
| |
| #[inline] |
| fn move_to(&mut self, st: State) -> Result { |
| self.st = st; |
| Ok(None) |
| } |
| |
| #[inline] |
| fn move_to_with(&mut self, st: State, token: Token) -> Result { |
| self.st = st; |
| Ok(Some(token)) |
| } |
| |
| #[inline] |
| fn move_to_with_unread(&mut self, st: State, cs: &[char], token: Token) -> Result { |
| self.char_queue.extend(cs.iter().cloned()); |
| self.move_to_with(st, token) |
| } |
| |
| fn handle_error(&mut self, chunk: &'static str, c: char) -> Result { |
| self.char_queue.push_back(c); |
| if self.skip_errors || (self.inside_comment && chunk != "--") { // FIXME: looks hacky |
| self.move_to_with(State::Normal, Token::Chunk(chunk)) |
| } else { |
| Err(self.error(format!("Unexpected token '{}' before '{}'", chunk, c))) |
| } |
| } |
| |
| /// Encountered a char |
| fn normal(&mut self, c: char) -> Result { |
| match c { |
| '<' => self.move_to(State::TagStarted), |
| '>' => Ok(Some(Token::TagEnd)), |
| '/' => self.move_to(State::EmptyTagClosing), |
| '=' => Ok(Some(Token::EqualsSign)), |
| '"' => Ok(Some(Token::DoubleQuote)), |
| '\'' => Ok(Some(Token::SingleQuote)), |
| '?' => self.move_to(State::ProcessingInstructionClosing), |
| '-' => self.move_to(State::CommentClosing(ClosingSubstate::First)), |
| ']' => self.move_to(State::CDataClosing(ClosingSubstate::First)), |
| '&' => Ok(Some(Token::ReferenceStart)), |
| ';' => Ok(Some(Token::ReferenceEnd)), |
| _ if is_whitespace_char(c) => Ok(Some(Token::Whitespace(c))), |
| _ => Ok(Some(Token::Character(c))) |
| } |
| } |
| |
| /// Encountered '<' |
| fn tag_opened(&mut self, c: char) -> Result { |
| match c { |
| '?' => self.move_to_with(State::Normal, Token::ProcessingInstructionStart), |
| '/' => self.move_to_with(State::Normal, Token::ClosingTagStart), |
| '!' => self.move_to(State::CommentOrCDataOrDoctypeStarted), |
| _ if is_whitespace_char(c) => self.move_to_with_unread(State::Normal, &[c], Token::OpeningTagStart), |
| _ if is_name_char(c) => self.move_to_with_unread(State::Normal, &[c], Token::OpeningTagStart), |
| _ => self.handle_error("<", c) |
| } |
| } |
| |
| /// Encountered '<!' |
| fn comment_or_cdata_or_doctype_started(&mut self, c: char) -> Result { |
| match c { |
| '-' => self.move_to(State::CommentStarted), |
| '[' => self.move_to(State::CDataStarted(CDataStartedSubstate::E)), |
| 'D' => self.move_to(State::DoctypeStarted(DoctypeStartedSubstate::D)), |
| _ => self.handle_error("<!", c) |
| } |
| } |
| |
| /// Encountered '<!-' |
| fn comment_started(&mut self, c: char) -> Result { |
| match c { |
| '-' => self.move_to_with(State::Normal, Token::CommentStart), |
| _ => self.handle_error("<!-", c) |
| } |
| } |
| |
| /// Encountered '<![' |
| fn cdata_started(&mut self, c: char, s: CDataStartedSubstate) -> Result { |
| use self::CDataStartedSubstate::{E, C, CD, CDA, CDAT, CDATA}; |
| dispatch_on_enum_state!(self, s, c, State::CDataStarted, |
| E ; 'C' ; C ; "<![", |
| C ; 'D' ; CD ; "<![C", |
| CD ; 'A' ; CDA ; "<![CD", |
| CDA ; 'T' ; CDAT ; "<![CDA", |
| CDAT ; 'A' ; CDATA ; "<![CDAT"; |
| CDATA ; '[' ; "<![CDATA" ; self.move_to_with(State::Normal, Token::CDataStart) |
| ) |
| } |
| |
| /// Encountered '<!D' |
| fn doctype_started(&mut self, c: char, s: DoctypeStartedSubstate) -> Result { |
| use self::DoctypeStartedSubstate::{D, DO, DOC, DOCT, DOCTY, DOCTYP}; |
| dispatch_on_enum_state!(self, s, c, State::DoctypeStarted, |
| D ; 'O' ; DO ; "<!D", |
| DO ; 'C' ; DOC ; "<!DO", |
| DOC ; 'T' ; DOCT ; "<!DOC", |
| DOCT ; 'Y' ; DOCTY ; "<!DOCT", |
| DOCTY ; 'P' ; DOCTYP ; "<!DOCTY"; |
| DOCTYP ; 'E' ; "<!DOCTYP" ; self.move_to_with(State::DoctypeFinishing(1), Token::DoctypeStart) |
| ) |
| } |
| |
    /// Encountered `<!DOCTYPE`; awaits the matching closing `>` of the
    /// doctype declaration, tracking nested `<`/`>` pairs via a depth counter
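    ///
    /// For example, in `<!DOCTYPE a [<!ELEMENT b ANY>]>` the depth starts at 1,
    /// the inner `<` raises it to 2, the matching inner `>` brings it back to 1,
    /// and the final `>` at depth 1 emits `TagEnd`.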
| fn doctype_finishing(&mut self, c: char, d: u8) -> Result { |
| match c { |
| '<' => self.move_to(State::DoctypeFinishing(d + 1)), |
| '>' if d == 1 => self.move_to_with(State::Normal, Token::TagEnd), |
| '>' => self.move_to(State::DoctypeFinishing(d - 1)), |
| _ => Ok(None), |
| } |
| } |
| |
| /// Encountered '?' |
| fn processing_instruction_closing(&mut self, c: char) -> Result { |
| match c { |
| '>' => self.move_to_with(State::Normal, Token::ProcessingInstructionEnd), |
| _ => self.move_to_with_unread(State::Normal, &[c], Token::Character('?')), |
| } |
| } |
| |
| /// Encountered '/' |
| fn empty_element_closing(&mut self, c: char) -> Result { |
| match c { |
| '>' => self.move_to_with(State::Normal, Token::EmptyTagEnd), |
| _ => self.move_to_with_unread(State::Normal, &[c], Token::Character('/')), |
| } |
| } |
| |
| /// Encountered '-' |
| fn comment_closing(&mut self, c: char, s: ClosingSubstate) -> Result { |
| match s { |
| ClosingSubstate::First => match c { |
| '-' => self.move_to(State::CommentClosing(ClosingSubstate::Second)), |
| _ => self.move_to_with_unread(State::Normal, &[c], Token::Character('-')) |
| }, |
| ClosingSubstate::Second => match c { |
| '>' => self.move_to_with(State::Normal, Token::CommentEnd), |
| // double dash not followed by a greater-than is a hard error inside comment |
| _ if self.inside_comment => self.handle_error("--", c), |
| // nothing else except comment closing starts with a double dash, and comment |
| // closing can never be after another dash, and also we're outside of a comment, |
| // therefore it is safe to push only the last read character to the list of unread |
| // characters and pass the double dash directly to the output |
| _ => self.move_to_with_unread(State::Normal, &[c], Token::Chunk("--")) |
| } |
| } |
| } |
| |
| /// Encountered ']' |
| fn cdata_closing(&mut self, c: char, s: ClosingSubstate) -> Result { |
| match s { |
| ClosingSubstate::First => match c { |
| ']' => self.move_to(State::CDataClosing(ClosingSubstate::Second)), |
| _ => self.move_to_with_unread(State::Normal, &[c], Token::Character(']')) |
| }, |
| ClosingSubstate::Second => match c { |
| '>' => self.move_to_with(State::Normal, Token::CDataEnd), |
| _ => self.move_to_with_unread(State::Normal, &[']', c], Token::Character(']')) |
| } |
| } |
| } |
| } |
| |
| #[cfg(test)] |
| mod tests { |
| use common::{Position}; |
| use std::io::{BufReader, Cursor}; |
| |
| use super::{Lexer, Token}; |
| |
| macro_rules! assert_oks( |
| (for $lex:ident and $buf:ident ; $($e:expr)+) => ({ |
| $( |
| assert_eq!(Ok(Some($e)), $lex.next_token(&mut $buf)); |
| )+ |
| }) |
| ); |
| |
| macro_rules! assert_err( |
| (for $lex:ident and $buf:ident expect row $r:expr ; $c:expr, $s:expr) => ({ |
| let err = $lex.next_token(&mut $buf); |
| assert!(err.is_err()); |
| let err = err.unwrap_err(); |
| assert_eq!($r as u64, err.position().row); |
| assert_eq!($c as u64, err.position().column); |
| assert_eq!($s, err.msg()); |
| }) |
| ); |
| |
| macro_rules! assert_none( |
| (for $lex:ident and $buf:ident) => ( |
| assert_eq!(Ok(None), $lex.next_token(&mut $buf)); |
| ) |
| ); |
| |
| fn make_lex_and_buf(s: &str) -> (Lexer, BufReader<Cursor<Vec<u8>>>) { |
| (Lexer::new(), BufReader::new(Cursor::new(s.to_owned().into_bytes()))) |
| } |
| |
| #[test] |
| fn simple_lexer_test() { |
| let (mut lex, mut buf) = make_lex_and_buf( |
| r#"<a p='q'> x<b z="y">d </b></a><p/> <?nm ?> <!-- a c --> "# |
| ); |
| |
| assert_oks!(for lex and buf ; |
| Token::OpeningTagStart |
| Token::Character('a') |
| Token::Whitespace(' ') |
| Token::Character('p') |
| Token::EqualsSign |
| Token::SingleQuote |
| Token::Character('q') |
| Token::SingleQuote |
| Token::TagEnd |
| Token::Whitespace(' ') |
| Token::Character('x') |
| Token::OpeningTagStart |
| Token::Character('b') |
| Token::Whitespace(' ') |
| Token::Character('z') |
| Token::EqualsSign |
| Token::DoubleQuote |
| Token::Character('y') |
| Token::DoubleQuote |
| Token::TagEnd |
| Token::Character('d') |
| Token::Whitespace('\t') |
| Token::ClosingTagStart |
| Token::Character('b') |
| Token::TagEnd |
| Token::ClosingTagStart |
| Token::Character('a') |
| Token::TagEnd |
| Token::OpeningTagStart |
| Token::Character('p') |
| Token::EmptyTagEnd |
| Token::Whitespace(' ') |
| Token::ProcessingInstructionStart |
| Token::Character('n') |
| Token::Character('m') |
| Token::Whitespace(' ') |
| Token::ProcessingInstructionEnd |
| Token::Whitespace(' ') |
| Token::CommentStart |
| Token::Whitespace(' ') |
| Token::Character('a') |
| Token::Whitespace(' ') |
| Token::Character('c') |
| Token::Whitespace(' ') |
| Token::CommentEnd |
| Token::Whitespace(' ') |
| Token::ReferenceStart |
| Token::Character('n') |
| Token::Character('b') |
| Token::Character('s') |
| Token::Character('p') |
| Token::ReferenceEnd |
| ); |
| assert_none!(for lex and buf); |
| } |
| |
| #[test] |
| fn special_chars_test() { |
| let (mut lex, mut buf) = make_lex_and_buf( |
| r#"?x!+ // -| ]z]]"# |
| ); |
| |
| assert_oks!(for lex and buf ; |
| Token::Character('?') |
| Token::Character('x') |
| Token::Character('!') |
| Token::Character('+') |
| Token::Whitespace(' ') |
| Token::Character('/') |
| Token::Character('/') |
| Token::Whitespace(' ') |
| Token::Character('-') |
| Token::Character('|') |
| Token::Whitespace(' ') |
| Token::Character(']') |
| Token::Character('z') |
| Token::Chunk("]]") |
| ); |
| assert_none!(for lex and buf); |
| } |
| |
| #[test] |
| fn cdata_test() { |
| let (mut lex, mut buf) = make_lex_and_buf( |
| r#"<a><![CDATA[x y ?]]> </a>"# |
| ); |
| |
| assert_oks!(for lex and buf ; |
| Token::OpeningTagStart |
| Token::Character('a') |
| Token::TagEnd |
| Token::CDataStart |
| Token::Character('x') |
| Token::Whitespace(' ') |
| Token::Character('y') |
| Token::Whitespace(' ') |
| Token::Character('?') |
| Token::CDataEnd |
| Token::Whitespace(' ') |
| Token::ClosingTagStart |
| Token::Character('a') |
| Token::TagEnd |
| ); |
| assert_none!(for lex and buf); |
| } |
| |
| #[test] |
| fn doctype_test() { |
| let (mut lex, mut buf) = make_lex_and_buf( |
| r#"<a><!DOCTYPE ab xx z> "# |
| ); |
| assert_oks!(for lex and buf ; |
| Token::OpeningTagStart |
| Token::Character('a') |
| Token::TagEnd |
| Token::DoctypeStart |
| Token::TagEnd |
| Token::Whitespace(' ') |
| ); |
| assert_none!(for lex and buf) |
| } |
| |
| #[test] |
| fn doctype_with_internal_subset_test() { |
| let (mut lex, mut buf) = make_lex_and_buf( |
| r#"<a><!DOCTYPE ab[<!ELEMENT ba> ]> "# |
| ); |
| assert_oks!(for lex and buf ; |
| Token::OpeningTagStart |
| Token::Character('a') |
| Token::TagEnd |
| Token::DoctypeStart |
| Token::TagEnd |
| Token::Whitespace(' ') |
| ); |
| assert_none!(for lex and buf) |
| } |
| |
| #[test] |
| fn end_of_stream_handling_ok() { |
| macro_rules! eof_check( |
| ($data:expr ; $token:expr) => ({ |
| let (mut lex, mut buf) = make_lex_and_buf($data); |
| assert_oks!(for lex and buf ; $token); |
| assert_none!(for lex and buf); |
| }) |
| ); |
| eof_check!("?" ; Token::Character('?')); |
| eof_check!("/" ; Token::Character('/')); |
| eof_check!("-" ; Token::Character('-')); |
| eof_check!("]" ; Token::Character(']')); |
| eof_check!("]]" ; Token::Chunk("]]")); |
| } |
| |
| #[test] |
| fn end_of_stream_handling_error() { |
| macro_rules! eof_check( |
| ($data:expr; $r:expr, $c:expr) => ({ |
| let (mut lex, mut buf) = make_lex_and_buf($data); |
| assert_err!(for lex and buf expect row $r ; $c, "Unexpected end of stream"); |
| assert_none!(for lex and buf); |
| }) |
| ); |
| eof_check!("<" ; 0, 1); |
| eof_check!("<!" ; 0, 2); |
| eof_check!("<!-" ; 0, 3); |
| eof_check!("<![" ; 0, 3); |
| eof_check!("<![C" ; 0, 4); |
| eof_check!("<![CD" ; 0, 5); |
| eof_check!("<![CDA" ; 0, 6); |
| eof_check!("<![CDAT" ; 0, 7); |
| eof_check!("<![CDATA" ; 0, 8); |
| eof_check!("--" ; 0, 2); |
| } |
| |
| #[test] |
| fn error_in_comment_or_cdata_prefix() { |
| let (mut lex, mut buf) = make_lex_and_buf("<!x"); |
| assert_err!(for lex and buf expect row 0 ; 0, |
| "Unexpected token '<!' before 'x'" |
| ); |
| |
| let (mut lex, mut buf) = make_lex_and_buf("<!x"); |
| lex.disable_errors(); |
| assert_oks!(for lex and buf ; |
| Token::Chunk("<!") |
| Token::Character('x') |
| ); |
| assert_none!(for lex and buf); |
| } |
| |
| #[test] |
| fn error_in_comment_started() { |
| let (mut lex, mut buf) = make_lex_and_buf("<!-\t"); |
| assert_err!(for lex and buf expect row 0 ; 0, |
| "Unexpected token '<!-' before '\t'" |
| ); |
| |
| let (mut lex, mut buf) = make_lex_and_buf("<!-\t"); |
| lex.disable_errors(); |
| assert_oks!(for lex and buf ; |
| Token::Chunk("<!-") |
| Token::Whitespace('\t') |
| ); |
| assert_none!(for lex and buf); |
| } |
| |
| #[test] |
| fn error_in_comment_two_dashes_not_at_end() { |
| let (mut lex, mut buf) = make_lex_and_buf("--x"); |
| lex.inside_comment(); |
| assert_err!(for lex and buf expect row 0; 0, |
| "Unexpected token '--' before 'x'" |
| ); |
| |
| let (mut lex, mut buf) = make_lex_and_buf("--x"); |
| assert_oks!(for lex and buf ; |
| Token::Chunk("--") |
| Token::Character('x') |
| ); |
| } |
| |
| macro_rules! check_case( |
| ($chunk:expr, $app:expr; $data:expr; $r:expr, $c:expr, $s:expr) => ({ |
| let (mut lex, mut buf) = make_lex_and_buf($data); |
| assert_err!(for lex and buf expect row $r ; $c, $s); |
| |
| let (mut lex, mut buf) = make_lex_and_buf($data); |
| lex.disable_errors(); |
| assert_oks!(for lex and buf ; |
| Token::Chunk($chunk) |
| Token::Character($app) |
| ); |
| assert_none!(for lex and buf); |
| }) |
| ); |
| |
| #[test] |
| fn error_in_cdata_started() { |
| check_case!("<![", '['; "<![[" ; 0, 0, "Unexpected token '<![' before '['"); |
| check_case!("<![C", '['; "<![C[" ; 0, 0, "Unexpected token '<![C' before '['"); |
| check_case!("<![CD", '['; "<![CD[" ; 0, 0, "Unexpected token '<![CD' before '['"); |
| check_case!("<![CDA", '['; "<![CDA[" ; 0, 0, "Unexpected token '<![CDA' before '['"); |
| check_case!("<![CDAT", '['; "<![CDAT[" ; 0, 0, "Unexpected token '<![CDAT' before '['"); |
| check_case!("<![CDATA", '|'; "<![CDATA|" ; 0, 0, "Unexpected token '<![CDATA' before '|'"); |
| } |
| |
| #[test] |
| fn error_in_doctype_started() { |
| check_case!("<!D", 'a'; "<!Da" ; 0, 0, "Unexpected token '<!D' before 'a'"); |
| check_case!("<!DO", 'b'; "<!DOb" ; 0, 0, "Unexpected token '<!DO' before 'b'"); |
| check_case!("<!DOC", 'c'; "<!DOCc" ; 0, 0, "Unexpected token '<!DOC' before 'c'"); |
| check_case!("<!DOCT", 'd'; "<!DOCTd" ; 0, 0, "Unexpected token '<!DOCT' before 'd'"); |
| check_case!("<!DOCTY", 'e'; "<!DOCTYe" ; 0, 0, "Unexpected token '<!DOCTY' before 'e'"); |
| check_case!("<!DOCTYP", 'f'; "<!DOCTYPf" ; 0, 0, "Unexpected token '<!DOCTYP' before 'f'"); |
| } |
| |
| |
| |
| #[test] |
| fn issue_98_cdata_ending_with_right_bracket() { |
| let (mut lex, mut buf) = make_lex_and_buf( |
| r#"<![CDATA[Foo [Bar]]]>"# |
| ); |
| |
| assert_oks!(for lex and buf ; |
| Token::CDataStart |
| Token::Character('F') |
| Token::Character('o') |
| Token::Character('o') |
| Token::Whitespace(' ') |
| Token::Character('[') |
| Token::Character('B') |
| Token::Character('a') |
| Token::Character('r') |
| Token::Character(']') |
| Token::CDataEnd |
| ); |
| assert_none!(for lex and buf); |
| } |
| } |