| #[macro_use] extern crate html5ever; |
| #[macro_use] extern crate lazy_static; |
| |
| use html5ever::{driver as html, QualName}; |
| use html5ever::rcdom::{Handle, NodeData, RcDom}; |
| use html5ever::serialize::{serialize, SerializeOpts}; |
| use pulldown_cmark::{Parser, Options}; |
| |
| use std::collections::HashSet; |
| use std::mem; |
| use std::rc::{Rc, Weak}; |
| use tendril::stream::TendrilSink; |
| use regex::Regex; |
| |
| mod suite; |
| |
| #[inline(never)] |
| pub fn test_markdown_html(input: &str, output: &str) { |
| let mut s = String::new(); |
| |
| let mut opts = Options::empty(); |
| opts.insert(Options::ENABLE_TABLES); |
| opts.insert(Options::ENABLE_FOOTNOTES); |
| opts.insert(Options::ENABLE_STRIKETHROUGH); |
| opts.insert(Options::ENABLE_TASKLISTS); |
| |
| let p = Parser::new_ext(input, opts); |
| pulldown_cmark::html::push_html(&mut s, p); |
| |
| assert_eq!(normalize_html(output), normalize_html(&s)); |
| } |
| |
| lazy_static! { |
| static ref WHITESPACE_RE: Regex = Regex::new(r"\s+").unwrap(); |
| static ref LEADING_WHITESPACE_RE: Regex = Regex::new(r"\A\s+").unwrap(); |
| static ref TRAILING_WHITESPACE_RE: Regex = Regex::new(r"\s+\z").unwrap(); |
| static ref BLOCK_TAGS: HashSet<&'static str> = [ |
| "article", "header", "aside", "hgroup", "blockquote", "hr", "iframe", "body", "li", |
| "map", "button", "object", "canvas", "ol", "caption", "output", "col", "p", "colgroup", |
| "pre", "dd", "progress", "div", "section", "dl", "table", "td", "dt", "tbody", "embed", |
| "textarea", "fieldset", "tfoot", "figcaption", "th", "figure", "thead", "footer", "tr", |
| "form", "ul", "h1", "h2", "h3", "h4", "h5", "h6", "video", "script", "style" |
| ].iter().cloned().collect(); |
| static ref PRE_TAGS: HashSet<&'static str> = [ |
| "pre", "code" |
| ].iter().cloned().collect(); |
| static ref TABLE_TAGS: HashSet<&'static str> = [ |
| "table", "thead", "tbody", "tr", "td" |
| ].iter().cloned().collect(); |
| } |
| |
| fn make_html_parser() -> html::Parser<RcDom> { |
| html::parse_fragment( |
| RcDom::default(), |
| html::ParseOpts::default(), |
| QualName::new(None, ns!(html), local_name!("div")), |
| vec![], |
| ) |
| } |
| |
| fn normalize_html(s: &str) -> String { |
| let parser = make_html_parser(); |
| let dom = parser.one(s); |
| let body = normalize_dom(&dom); |
| let opts = SerializeOpts::default(); |
| let mut ret_val = Vec::new(); |
| serialize(&mut ret_val, &body, opts) |
| .expect("Writing to a string shouldn't fail (expect on OOM)"); |
| String::from_utf8(ret_val) |
| .expect("html5ever should always produce UTF8") |
| } |
| |
| fn normalize_dom(dom: &RcDom) -> Handle { |
| let body = { |
| let children = dom.document.children.borrow(); |
| children[0].clone() |
| }; |
| let mut current_level = Vec::new(); |
| let mut next_level = Vec::new(); |
| current_level.extend( |
| body.children.borrow() |
| .iter() |
| .cloned() |
| .rev() |
| ); |
| loop { |
| while let Some(mut node) = current_level.pop() { |
| let parent = node.parent.replace(None); |
| node.parent.replace(parent.clone()); |
| let parent = parent |
| .expect("a node in the DOM will have a parent, except the root, which is not processed") |
| .upgrade().expect("a node's parent will be pointed to by its parent (or the root pointer), and will not be dropped"); |
| let retain = normalize_node(&parent, &mut node); |
| if !retain { |
| let mut siblings = parent.children.borrow_mut(); |
| siblings.retain(|s| !Rc::ptr_eq(&node, s)); |
| } else { |
| next_level.extend( |
| node.children.borrow() |
| .iter() |
| .cloned() |
| .rev(), |
| ); |
| } |
| } |
| if next_level.is_empty() { break }; |
| mem::swap(&mut next_level, &mut current_level); |
| } |
| body |
| } |
| |
| // Returns false if node is an empty text node or an empty tbody. |
| // Returns true otherwise. |
| fn normalize_node(parent: &Handle, node: &mut Handle) -> bool { |
| match node.data { |
| NodeData::Comment { .. } | |
| NodeData::Doctype { .. } | |
| NodeData::Document | |
| NodeData::ProcessingInstruction { .. } => true, |
| NodeData::Text { ref contents, .. } => { |
| let mut contents = contents.borrow_mut(); |
| let is_pre = { |
| let mut parent = parent.clone(); |
| loop { |
| let is_pre = if let NodeData::Element{ ref name, .. } = parent.data { |
| PRE_TAGS.contains(&&*name.local.to_ascii_lowercase()) |
| } else { |
| false |
| }; |
| if is_pre { break true }; |
| let parent_ = parent.parent.replace(None); |
| parent.parent.replace(parent_.clone()); |
| let parent_ = parent_.as_ref().and_then(Weak::upgrade); |
| if let Some(parent_) = parent_ { |
| parent = parent_ |
| } else { |
| break false |
| }; |
| } |
| }; |
| if !is_pre { |
| let (is_first_in_block, is_last_in_block) = { |
| let mut is_first_in_block = true; |
| let mut is_last_in_block = true; |
| let mut parent = parent.clone(); |
| let mut node = node.clone(); |
| loop { |
| let reached_block = if let NodeData::Element{ ref name, .. } = parent.data { |
| BLOCK_TAGS.contains(&&*name.local.to_ascii_lowercase()) |
| } else { |
| false |
| }; |
| let (is_first, is_last) = { |
| let siblings = parent.children.borrow(); |
| let n = &node; |
| (siblings.get(0).map(|s| Rc::ptr_eq(s, n)).unwrap_or(false), siblings.len() > 0 && siblings.get(siblings.len() - 1).map(|s| Rc::ptr_eq(s, n)).unwrap_or(false)) |
| }; |
| is_first_in_block = is_first_in_block && is_first; |
| is_last_in_block = is_last_in_block && is_last; |
| if (is_first_in_block || is_last_in_block) && !reached_block { |
| node = parent.clone(); |
| let parent_ = parent.parent.replace(None); |
| parent.parent.replace(parent_.clone()); |
| let parent_ = parent_.as_ref().and_then(Weak::upgrade); |
| if let Some(parent_) = parent_ { |
| parent = parent_; |
| } else { |
| break (is_first_in_block, is_last_in_block) |
| } |
| } else { |
| break (is_first_in_block, is_last_in_block) |
| } |
| } |
| }; |
| let is_preceeded_by_ws = { |
| let mut parent = parent.clone(); |
| let mut node = node.clone(); |
| 'ascent: loop { |
| let is_first = { |
| let siblings = parent.children.borrow(); |
| let n = &node; |
| siblings.get(0).map(|s| Rc::ptr_eq(s, n)).unwrap_or(false) |
| }; |
| if is_first { |
| node = parent.clone(); |
| let parent_ = parent.parent.replace(None); |
| parent.parent.replace(parent_.clone()); |
| let parent_ = parent_.as_ref().and_then(Weak::upgrade); |
| if let Some(parent_) = parent_ { |
| parent = parent_; |
| } else { |
| break 'ascent false |
| } |
| } else { |
| let siblings = parent.children.borrow(); |
| let n = &node; |
| let mut pos = !0; |
| 'search: for (i, s) in siblings.iter().enumerate() { |
| if Rc::ptr_eq(s, n) { |
| pos = i; |
| break 'search; |
| } |
| } |
| assert!(pos != !0, "The list of node's parent's children shall contain node"); |
| assert!(pos != 0, "If node is not first, then node's position shall not be zero"); |
| let mut preceeding = siblings[pos-1].clone(); |
| 'descent: loop { |
| if let NodeData::Text { .. } = preceeding.data { |
| break 'descent |
| } |
| preceeding = { |
| let ch = preceeding.children.borrow(); |
| if ch.len() == 0 { break 'descent } |
| if let Some(preceeding_) = ch.get(ch.len() - 1) { |
| preceeding_.clone() |
| } else { |
| break 'descent |
| } |
| }; |
| } |
| if let NodeData::Text { ref contents, .. } = preceeding.data { |
| break 'ascent TRAILING_WHITESPACE_RE.is_match(&*contents.borrow()) |
| } else { |
| break 'ascent false |
| } |
| } |
| } |
| }; |
| |
| let is_in_table = if let NodeData::Element{ ref name, .. } = parent.data { |
| TABLE_TAGS.contains(&&*name.local.to_ascii_lowercase()) |
| } else { |
| false |
| }; |
| let whitespace_replacement = if is_in_table { |
| "" |
| } else { |
| " " |
| }; |
| *contents = WHITESPACE_RE.replace_all(&*contents, whitespace_replacement).as_ref().into(); |
| |
| if is_first_in_block || is_preceeded_by_ws { |
| *contents = LEADING_WHITESPACE_RE.replace_all(&*contents, "").as_ref().into(); |
| } |
| if is_last_in_block { |
| *contents = TRAILING_WHITESPACE_RE.replace_all(&*contents, "").as_ref().into(); |
| } |
| // TODO: collapse whitespace when adjacent to whitespace. |
| // For example, the whitespace in the span should be collapsed in all of these cases: |
| // |
| // " <span> q </span> " |
| // "<b>q </b><span> q</span>" |
| // "<b>q <i></i></b><span> q</span>" |
| // "<b>q <i></i></b><span> q</span>" |
| // "q <b></b><span> q</span>" |
| } |
| &**contents != "" |
| } |
| NodeData::Element { ref attrs, ref name, .. } => { |
| let mut attrs = attrs.borrow_mut(); |
| for a in attrs.iter_mut() { |
| a.name.local = a.name.local.to_ascii_lowercase().into(); |
| } |
| attrs.sort_by(|a: &html5ever::Attribute, b: &html5ever::Attribute| { |
| (&*a.name.local).cmp(&*b.name.local) |
| }); |
| let ascii_name = &*name.local.to_ascii_lowercase(); |
| // drop empty tbody's |
| ascii_name != "tbody" || |
| node.children.borrow().len() > 1 || |
| node.children.borrow().iter().next().map(|only_child| match only_child.data { |
| NodeData::Text { ref contents, .. } => { |
| !contents.borrow().chars().all(|c| c.is_whitespace()) |
| } |
| _ => { |
| true |
| } |
| }).unwrap_or(false) |
| } |
| } |
| } |
| |
| |
/// A newline-only text node inside a `<div>` is removed.
#[test]
fn strip_div_newline() {
    let normalized = normalize_html("<div>\n</div>");
    assert_eq!("<div></div>", normalized);
}
| |
/// A trailing newline after the last piece of text is removed.
#[test]
fn strip_end_newline() {
    let normalized = normalize_html("test\n");
    assert_eq!("test", normalized);
}
| |
/// A run of several spaces between words collapses to a single space.
#[test]
fn strip_double_space() {
    // The input needs two consecutive spaces; with a single space the assertion would
    // hold even if whitespace runs were never collapsed.
    assert_eq!("test mess", normalize_html("test  mess"));
}
| |
/// Whitespace at the edges of inline elements is trimmed where it borders a block edge
/// or other whitespace.
#[test]
fn strip_inline_internal_text() {
    let normalized = normalize_html("<u> a </u> b <u> c </u>");
    assert_eq!("<u>a </u>b <u>c</u>", normalized);
}
| |
/// Same as `strip_inline_internal_text`, with extra whitespace around the whole fragment.
#[test]
fn strip_inline_block_internal_text() {
    let normalized = normalize_html(" <u> a </u> b <u> c </u> ");
    assert_eq!("<u>a </u>b <u>c</u>", normalized);
}
| |
/// Single spaces separating inline elements from text are kept as they are.
#[test]
fn leaves_necessary_whitespace_alone() {
    let normalized = normalize_html("<u>a</u> b <u>c</u>");
    assert_eq!("<u>a</u> b <u>c</u>", normalized);
}
| |
/// Leading whitespace of the fragment is dropped while inner separating spaces survive.
#[test]
fn leaves_necessary_whitespace_alone_weird() {
    let normalized = normalize_html(" <u>a </u>b <u>c</u>");
    assert_eq!("<u>a </u>b <u>c</u>", normalized);
}
| |
/// Whitespace-only text nested inside consecutive inline elements is removed everywhere.
#[test]
fn leaves_necessary_whitespace_all_nested() {
    let normalized = normalize_html("<u> </u><u> </u><u> </u><u> </u>");
    assert_eq!("<u></u><u></u><u></u><u></u>", normalized);
}
| |
/// A `tbody` that contains only whitespace is removed from the table.
#[test]
fn drops_empty_tbody() {
    let input = "<table><thead><tr><td>hi</td></tr></thead><tbody> </tbody></table>";
    let expected = "<table><thead><tr><td>hi</td></tr></thead></table>";
    assert_eq!(expected, normalize_html(input));
}
| |
/// A `tbody` that contains a row is kept, so the markup round-trips unchanged.
#[test]
fn leaves_nonempty_tbody() {
    let html = "<table><thead><tr><td>hi</td></tr></thead><tbody><tr></tr></tbody></table>";
    let normalized = normalize_html(html);
    assert_eq!(html, normalized);
}