| // Copyright 2014-2017 The html5ever Project Developers. See the |
| // COPYRIGHT file at the top-level directory of this distribution. |
| // |
| // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
| // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
| // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your |
| // option. This file may not be copied, modified, or distributed |
| // except according to those terms. |
| |
| mod foreach_html5lib_test; |
| |
| use foreach_html5lib_test::foreach_html5lib_test; |
| use html5ever::tendril::*; |
| use html5ever::tokenizer::states::{Plaintext, RawData, Rawtext, Rcdata}; |
| use html5ever::tokenizer::BufferQueue; |
| use html5ever::tokenizer::{CharacterTokens, EOFToken, NullCharacterToken, ParseError}; |
| use html5ever::tokenizer::{CommentToken, DoctypeToken, TagToken, Token}; |
| use html5ever::tokenizer::{Doctype, EndTag, StartTag, Tag}; |
| use html5ever::tokenizer::{TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts}; |
| use html5ever::{namespace_url, ns, Attribute, LocalName, QualName}; |
| use rustc_test::{DynTestFn, DynTestName, TestDesc, TestDescAndFn}; |
| use serde_json::{Map, Value}; |
| use std::borrow::Cow::Borrowed; |
| use std::default::Default; |
| use std::ffi::OsStr; |
| use std::io::Read; |
| use std::mem::replace; |
| use std::path::Path; |
| use std::{char, env}; |
| |
| // Return all ways of splitting the string into at most n |
| // possibly-empty pieces. |
| fn splits(s: &str, n: usize) -> Vec<Vec<StrTendril>> { |
| if n == 1 { |
| return vec![vec![s.to_tendril()]]; |
| } |
| |
    let mut points: Vec<usize> = s.char_indices().map(|(i, _)| i).collect();
| points.push(s.len()); |
| |
    // do this with iterators?
    let mut out = vec![];
    for p in points.into_iter() {
        // Append the suffix s[p..] as the final piece of every way of
        // splitting the prefix s[..p] into n - 1 pieces.
        let y = &s[p..];
        for mut x in splits(&s[..p], n - 1).into_iter() {
            x.push(y.to_tendril());
            out.push(x);
        }
    }

    // Also include every way of splitting into fewer than n pieces.
    out.extend(splits(s, n - 1).into_iter());
| out |
| } |
| |
// A TokenSink that logs the tokens it receives, buffering character
// data so that consecutive character tokens coalesce into one,
// regardless of how the input was split into chunks.
struct TokenLogger {
| tokens: Vec<Token>, |
| current_str: StrTendril, |
| exact_errors: bool, |
| } |
| |
| impl TokenLogger { |
| fn new(exact_errors: bool) -> TokenLogger { |
| TokenLogger { |
| tokens: vec![], |
| current_str: StrTendril::new(), |
            exact_errors,
| } |
| } |
| |
    // Push a token other than a character token, flushing any
    // buffered character data first.
| fn push(&mut self, token: Token) { |
| self.finish_str(); |
| self.tokens.push(token); |
| } |
| |
| fn finish_str(&mut self) { |
        if !self.current_str.is_empty() {
| let s = replace(&mut self.current_str, StrTendril::new()); |
| self.tokens.push(CharacterTokens(s)); |
| } |
| } |
| |
| fn get_tokens(mut self) -> Vec<Token> { |
| self.finish_str(); |
| self.tokens |
| } |
| } |
| |
| impl TokenSink for TokenLogger { |
| type Handle = (); |
| |
| fn process_token(&mut self, token: Token, _line_number: u64) -> TokenSinkResult<()> { |
| match token { |
| CharacterTokens(b) => { |
| self.current_str.push_slice(&b); |
| }, |
| |
| NullCharacterToken => { |
| self.current_str.push_char('\0'); |
| }, |
| |
| ParseError(_) => { |
| if self.exact_errors { |
| self.push(ParseError(Borrowed(""))); |
| } |
| }, |
| |
| TagToken(mut t) => { |
| // The spec seems to indicate that one can emit |
| // erroneous end tags with attrs, but the test |
| // cases don't contain them. |
| match t.kind { |
| EndTag => { |
| t.self_closing = false; |
| t.attrs = vec![]; |
| }, |
| _ => t.attrs.sort_by(|a1, a2| a1.name.cmp(&a2.name)), |
| } |
| self.push(TagToken(t)); |
| }, |
| |
| EOFToken => (), |
| |
| _ => self.push(token), |
| } |
| TokenSinkResult::Continue |
| } |
| } |
| |
| fn tokenize(input: Vec<StrTendril>, opts: TokenizerOpts) -> Vec<Token> { |
| let sink = TokenLogger::new(opts.exact_errors); |
| let mut tok = Tokenizer::new(sink, opts); |
| let mut buffer = BufferQueue::new(); |
| for chunk in input.into_iter() { |
| buffer.push_back(chunk); |
| let _ = tok.feed(&mut buffer); |
| } |
    // Feed anything still left in the queue, then signal end of input.
    let _ = tok.feed(&mut buffer);
| tok.end(); |
| tok.sink.get_tokens() |
| } |
| |
| trait JsonExt: Sized { |
| fn get_str(&self) -> String; |
| fn get_tendril(&self) -> StrTendril; |
| fn get_nullable_tendril(&self) -> Option<StrTendril>; |
| fn get_bool(&self) -> bool; |
| fn get_obj<'t>(&'t self) -> &'t Map<String, Self>; |
| fn get_list<'t>(&'t self) -> &'t Vec<Self>; |
| fn find<'t>(&'t self, key: &str) -> &'t Self; |
| } |
| |
| impl JsonExt for Value { |
| fn get_str(&self) -> String { |
| match *self { |
| Value::String(ref s) => s.to_string(), |
| _ => panic!("Value::get_str: not a String"), |
| } |
| } |
| |
| fn get_tendril(&self) -> StrTendril { |
| match *self { |
| Value::String(ref s) => s.to_tendril(), |
| _ => panic!("Value::get_tendril: not a String"), |
| } |
| } |
| |
| fn get_nullable_tendril(&self) -> Option<StrTendril> { |
| match *self { |
| Value::Null => None, |
| Value::String(ref s) => Some(s.to_tendril()), |
| _ => panic!("Value::get_nullable_tendril: not a String"), |
| } |
| } |
| |
| fn get_bool(&self) -> bool { |
| match *self { |
| Value::Bool(b) => b, |
| _ => panic!("Value::get_bool: not a Bool"), |
| } |
| } |
| |
| fn get_obj<'t>(&'t self) -> &'t Map<String, Value> { |
| match *self { |
| Value::Object(ref m) => &*m, |
| _ => panic!("Value::get_obj: not an Object"), |
| } |
| } |
| |
| fn get_list<'t>(&'t self) -> &'t Vec<Value> { |
| match *self { |
| Value::Array(ref m) => m, |
| _ => panic!("Value::get_list: not an Array"), |
| } |
| } |
| |
| fn find<'t>(&'t self, key: &str) -> &'t Value { |
        self.get_obj().get(key).unwrap()
| } |
| } |
| |
| // Parse a JSON object (other than "ParseError") to a token. |
| fn json_to_token(js: &Value) -> Token { |
| let parts = js.get_list(); |
| // Collect refs here so we don't have to use "ref" in all the patterns below. |
| let args: Vec<&Value> = parts[1..].iter().collect(); |
| match &*parts[0].get_str() { |
| "DOCTYPE" => DoctypeToken(Doctype { |
| name: args[0].get_nullable_tendril(), |
| public_id: args[1].get_nullable_tendril(), |
| system_id: args[2].get_nullable_tendril(), |
| force_quirks: !args[3].get_bool(), |
| }), |
| |
| "StartTag" => TagToken(Tag { |
| kind: StartTag, |
| name: LocalName::from(&*args[0].get_str()), |
| attrs: args[1] |
| .get_obj() |
| .iter() |
| .map(|(k, v)| Attribute { |
| name: QualName::new(None, ns!(), LocalName::from(&**k)), |
| value: v.get_tendril(), |
| }) |
| .collect(), |
| self_closing: match args.get(2) { |
| Some(b) => b.get_bool(), |
| None => false, |
| }, |
| }), |
| |
| "EndTag" => TagToken(Tag { |
| kind: EndTag, |
| name: LocalName::from(&*args[0].get_str()), |
| attrs: vec![], |
| self_closing: false, |
| }), |
| |
| "Comment" => CommentToken(args[0].get_tendril()), |
| |
| "Character" => CharacterTokens(args[0].get_tendril()), |
| |
        // We don't need to produce NullCharacterToken here because
        // the TokenLogger converts those into CharacterTokens.
| _ => panic!("don't understand token {:?}", parts), |
| } |
| } |
| |
| // Parse the "output" field of the test case into a vector of tokens. |
| fn json_to_tokens(js: &Value, exact_errors: bool) -> Vec<Token> { |
| // Use a TokenLogger so that we combine character tokens separated |
| // by an ignored error. |
| let mut sink = TokenLogger::new(exact_errors); |
| for tok in js.get_list().iter() { |
| assert_eq!( |
| match *tok { |
| Value::String(ref s) if &s[..] == "ParseError" => { |
| sink.process_token(ParseError(Borrowed("")), 0) |
| }, |
| _ => sink.process_token(json_to_token(tok), 0), |
| }, |
| TokenSinkResult::Continue |
| ); |
| } |
| sink.get_tokens() |
| } |
| |
| // Undo the escaping in "doubleEscaped" tests. |
| fn unescape(s: &str) -> Option<String> { |
| let mut out = String::with_capacity(s.len()); |
| let mut it = s.chars().peekable(); |
| loop { |
| match it.next() { |
| None => return Some(out), |
| Some('\\') => { |
| if it.peek() != Some(&'u') { |
| panic!("can't understand escape"); |
| } |
| drop(it.next()); |
| let hex: String = it.by_ref().take(4).collect(); |
| match u32::from_str_radix(&hex, 16).ok().and_then(char::from_u32) { |
| // Some of the tests use lone surrogates, but we have no |
| // way to represent them in the UTF-8 input to our parser. |
| // Since these can only come from script, we will catch |
| // them there. |
| None => return None, |
| Some(c) => out.push(c), |
| } |
| }, |
| Some(c) => out.push(c), |
| } |
| } |
| } |
| |
| fn unescape_json(js: &Value) -> Value { |
| match *js { |
| // unwrap is OK here because the spec'd *output* of the tokenizer never |
| // contains a lone surrogate. |
        Value::String(ref s) => Value::String(unescape(s).unwrap()),
| Value::Array(ref xs) => Value::Array(xs.iter().map(unescape_json).collect()), |
| Value::Object(ref obj) => { |
| let mut new_obj = Map::new(); |
| for (k, v) in obj.iter() { |
| new_obj.insert(k.clone(), unescape_json(v)); |
| } |
| Value::Object(new_obj) |
| }, |
| _ => js.clone(), |
| } |
| } |
| |
| fn mk_test(desc: String, input: String, expect: Value, opts: TokenizerOpts) -> TestDescAndFn { |
| TestDescAndFn { |
| desc: TestDesc::new(DynTestName(desc)), |
| testfn: DynTestFn(Box::new(move || { |
| // Split up the input at different points to test incremental tokenization. |
| let insplits = splits(&input, 3); |
| for input in insplits.into_iter() { |
| // Clone 'input' so we have it for the failure message. |
| // Also clone opts. If we don't, we get the wrong |
| // result but the compiler doesn't catch it! |
| // Possibly mozilla/rust#12223. |
| let output = tokenize(input.clone(), opts.clone()); |
| let expect_toks = json_to_tokens(&expect, opts.exact_errors); |
| if output != expect_toks { |
| panic!( |
| "\ninput: {:?}\ngot: {:?}\nexpected: {:?}", |
| input, output, expect |
| ); |
| } |
| } |
| })), |
| } |
| } |
| |
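// A single test case in the html5lib-tests JSON looks roughly like
//
//     {"description": "Simple DOCTYPE",
//      "input": "<!DOCTYPE html>",
//      "output": [["DOCTYPE", "html", null, null, true]]}
//
// with optional "doubleEscaped", "lastStartTag" and "initialStates"
// fields, all handled below.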
| fn mk_tests(tests: &mut Vec<TestDescAndFn>, filename: &str, js: &Value) { |
| let obj = js.get_obj(); |
| let mut input = js.find("input").get_str(); |
| let mut expect = js.find("output").clone(); |
| let desc = format!("tok: {}: {}", filename, js.find("description").get_str()); |
| |
| // "Double-escaped" tests require additional processing of |
| // the input and output. |
    if obj.get("doubleEscaped").map_or(false, |j| j.get_bool()) {
| match unescape(&input) { |
| None => return, |
| Some(i) => input = i, |
| } |
| expect = unescape_json(&expect); |
| } |
| |
| // Some tests have a last start tag name. |
    let start_tag = obj.get("lastStartTag").map(|s| s.get_str());
| |
| // Some tests want to start in a state other than Data. |
    let state_overrides = match obj.get("initialStates") {
| Some(&Value::Array(ref xs)) => xs |
| .iter() |
| .map(|s| { |
| Some(match &s.get_str()[..] { |
| "PLAINTEXT state" => Plaintext, |
| "RAWTEXT state" => RawData(Rawtext), |
| "RCDATA state" => RawData(Rcdata), |
| s => panic!("don't know state {}", s), |
| }) |
| }) |
| .collect(), |
| None => vec![None], |
| _ => panic!("don't understand initialStates value"), |
| }; |
| |
| // Build the tests. |
| for state in state_overrides.into_iter() { |
| for &exact_errors in [false, true].iter() { |
| let mut newdesc = desc.clone(); |
            if let Some(s) = state {
                newdesc = format!("{} (in state {:?})", newdesc, s);
            }
| if exact_errors { |
| newdesc = format!("{} (exact errors)", newdesc); |
| } |
| |
| tests.push(mk_test( |
| newdesc, |
| input.clone(), |
| expect.clone(), |
| TokenizerOpts { |
                    exact_errors,
| initial_state: state, |
| last_start_tag_name: start_tag.clone(), |
| |
| // Not discarding a BOM is what the test suite expects; see |
| // https://github.com/html5lib/html5lib-tests/issues/2 |
| discard_bom: false, |
| |
| ..Default::default() |
| }, |
| )); |
| } |
| } |
| } |
| |
| fn tests(src_dir: &Path) -> Vec<TestDescAndFn> { |
| let mut tests = vec![]; |
| |
| foreach_html5lib_test( |
| src_dir, |
| "tokenizer", |
| OsStr::new("test"), |
| |path, mut file| { |
            let mut s = String::new();
            file.read_to_string(&mut s).expect("file reading error");
            let js: Value = serde_json::from_str(&s).expect("json parse error");
| |
            match js.get_obj().get("tests") {
| Some(&Value::Array(ref lst)) => { |
| for test in lst.iter() { |
| mk_tests( |
| &mut tests, |
| path.file_name().unwrap().to_str().unwrap(), |
| test, |
| ); |
| } |
| }, |
| |
| // xmlViolation.test doesn't follow this format. |
| _ => (), |
| } |
| }, |
| ); |
| |
| tests |
| } |
| |
| fn main() { |
| let args: Vec<_> = env::args().collect(); |
| rustc_test::test_main(&args, tests(Path::new(env!("CARGO_MANIFEST_DIR")))); |
| } |