| use crate::Result; |
| use pest::{iterators::Pair, iterators::Pairs, Parser}; |
| use serde::Serialize; |
| use std::default::Default; |
| |
| use crate::error::Error; |
| use crate::grammar::Grammar; |
| use crate::Rule; |
| |
| pub mod element; |
| pub mod formatting; |
| pub mod node; |
| pub mod span; |
| |
| use crate::dom::span::SourceSpan; |
| use element::{Element, ElementVariant}; |
| use node::Node; |
| |
| /// Document, DocumentFragment or Empty |
| #[derive(Debug, Clone, PartialEq, Serialize)] |
| #[serde(rename_all = "camelCase")] |
| pub enum DomVariant { |
| /// This means that the parsed html had the representation of an html document. The doctype is optional but a document should only have one root node with the name of html. |
| /// Example: |
| /// ```text |
| /// <!doctype html> |
| /// <html> |
| /// <head></head> |
| /// <body> |
| /// <h1>Hello world</h1> |
| /// </body> |
| /// </html> |
| /// ``` |
| Document, |
| /// A document fragment means that the parsed html did not have the representation of a document. A fragment can have multiple root children of any name except html, body or head. |
| /// Example: |
| /// ```text |
| /// <h1>Hello world</h1> |
| /// ``` |
| DocumentFragment, |
| /// An empty dom means that the input was empty |
| Empty, |
| } |
| |
| /// **The main struct** & the result of the parsed html |
| #[derive(Debug, Clone, Serialize, PartialEq)] |
| #[serde(rename_all = "camelCase")] |
| pub struct Dom { |
| /// The type of the tree that was parsed |
| pub tree_type: DomVariant, |
| |
| /// All of the root children in the tree |
| #[serde(skip_serializing_if = "Vec::is_empty")] |
| pub children: Vec<Node>, |
| |
| /// A collection of all errors during parsing |
| #[serde(skip_serializing)] |
| pub errors: Vec<String>, |
| } |
| |
| impl Default for Dom { |
| fn default() -> Self { |
| Self { |
| tree_type: DomVariant::Empty, |
| children: vec![], |
| errors: vec![], |
| } |
| } |
| } |
| |
| impl Dom { |
| pub fn parse(input: &str) -> Result<Self> { |
| let pairs = match Grammar::parse(Rule::html, input) { |
| Ok(pairs) => pairs, |
| Err(error) => return formatting::error_msg(error), |
| }; |
| Self::build_dom(pairs) |
| } |
| |
| pub fn to_json(&self) -> Result<String> { |
| Ok(serde_json::to_string(self)?) |
| } |
| |
| pub fn to_json_pretty(&self) -> Result<String> { |
| Ok(serde_json::to_string_pretty(self)?) |
| } |
| |
| fn build_dom(pairs: Pairs<Rule>) -> Result<Self> { |
| let mut dom = Self::default(); |
| |
| // NOTE: The logic is roughly as follows: |
| // 1) A document containing nothing but comments is DomVariant::Empty even though it will have |
| // children in this first pass. We fix this in the next section. This allows us to use |
| // DomVariant::Empty to indicate "we haven't decided the type yet". |
| // 2) If the type is DomVariant::Empty _so far_, then it can be changed to DomVariant::Document |
| // or DomVariant::DocumentFragment. DomVariant is only selected in this stage if we see a |
| // DOCTYPE tag. Comments do not change the type. |
| // 3) If the type is non-empty, we don't re-set the type. We do look for conflicts between |
| // the type and the tokens in the next stage. |
| for pair in pairs { |
| match pair.as_rule() { |
| // A <!DOCTYPE> tag means a full-fledged document. Note that because of the way |
| // the grammar is written, we will only get this token if the <!DOCTYPE> occurs |
| // before any other tag; otherwise it will be parsed as a custom tag. |
| Rule::doctype => { |
| if dom.tree_type == DomVariant::Empty { |
| dom.tree_type = DomVariant::Document; |
| } |
| } |
| |
| // If we see an element, build the sub-tree and add it as a child. If we don't |
| // have a document type yet (i.e. "empty"), select DocumentFragment |
| Rule::node_element => match Self::build_node_element(pair, &mut dom) { |
| Ok(el) => { |
| if let Some(node) = el { |
| if dom.tree_type == DomVariant::Empty { |
| dom.tree_type = DomVariant::DocumentFragment; |
| }; |
| dom.children.push(node); |
| } |
| } |
| Err(error) => { |
| dom.errors.push(format!("{}", error)); |
| } |
| }, |
| |
| // Similar to an element, we add it as a child and select DocumentFragment if we |
| // don't already have a document type. |
| Rule::node_text => { |
| if dom.tree_type == DomVariant::Empty { |
| dom.tree_type = DomVariant::DocumentFragment; |
| } |
| let text = pair.as_str().to_string(); |
| if !text.trim().is_empty() { |
| dom.children.push(Node::Text(text)); |
| } |
| } |
| |
| // Store comments as a child, but it doesn't affect the document type selection |
| // until the next phase (validation). |
| Rule::node_comment => { |
| dom.children |
| .push(Node::Comment(pair.into_inner().as_str().to_string())); |
| } |
| |
| // Ignore 'end of input', which then allows the catch-all unreachable!() arm to |
| // function properly. |
| Rule::EOI => (), |
| |
| // This should be unreachable, due to the way the grammar is written |
| _ => unreachable!("[build dom] unknown rule: {:?}", pair.as_rule()), |
| }; |
| } |
| |
| // Implement some checks on the generated dom's data and initial type. The type may be |
| // modified in this section. |
| match dom.tree_type { |
| // A DomVariant::Empty can only have comments. Anything else is an error. |
| DomVariant::Empty => { |
| for node in &dom.children { |
| if let Node::Comment(_) = node { |
| // An "empty" document, but it has comments - this is where we cleanup the |
| // earlier assumption that a document with only comments is "empty". |
| // Really, it is a "fragment". |
| dom.tree_type = DomVariant::DocumentFragment |
| } else { |
| // Anything else (i.e. Text() or Element() ) can't happen at the top level; |
| // if we had seen one, we would have set the document type above |
| unreachable!("[build dom] empty document with an Element {:?}", node) |
| } |
| } |
| } |
| |
| // A DomVariant::Document can only have comments and an <HTML> node at the top level. |
| // Only one <HTML> tag is permitted. |
| DomVariant::Document => { |
| if dom |
| .children |
| .iter() |
| .filter(|x| match x { |
| Node::Element(el) if el.name.to_lowercase() == "html" => true, |
| _ => false, |
| }) |
| .count() |
| > 1 |
| { |
| return Err(Error::Parsing(format!("Document with multiple HTML tags",))); |
| } |
| } |
| |
| // A DomVariant::DocumentFragment should not have <HEAD>, or <BODY> tags at the |
| // top-level. If we find an <HTML> tag, then we consider this a Document instead (if |
| // it comes before any other elements, and if there is only one <HTML> tag). |
| DomVariant::DocumentFragment => { |
| let mut seen_html = false; |
| let mut seen_elements = false; |
| |
| for node in &dom.children { |
| match node { |
| // Nodes other than <HTML> - reject <HEAD> and <BODY> |
| Node::Element(ref el) if el.name.clone().to_lowercase() != "html" => { |
| if el.name == "head" || el.name == "body" { |
| return Err(Error::Parsing(format!( |
| "A document fragment should not include {}", |
| el.name |
| ))); |
| } |
| seen_elements = true; |
| } |
| // <HTML> Nodes - one (before any other elements) is okay |
| Node::Element(ref el) if el.name.clone().to_lowercase() == "html" => { |
| if seen_html || seen_elements { |
| return Err(Error::Parsing(format!( |
| "A document fragment should not include {}", |
| el.name |
| ))); |
| }; |
| |
| // A fragment with just an <HTML> tag is a document |
| dom.tree_type = DomVariant::Document; |
| seen_html = true; |
| } |
| // Comment() and Text() nodes are permitted at the top-level of a |
| // DocumentFragment |
| _ => (), |
| } |
| } |
| } |
| } |
| |
| // The result is the validated tree |
| Ok(dom) |
| } |
| |
| fn build_node_element(pair: Pair<Rule>, dom: &mut Dom) -> Result<Option<Node>> { |
| let source_span = { |
| let pair_span = pair.as_span(); |
| let (start_line, start_column) = pair_span.start_pos().line_col(); |
| let (end_line, end_column) = pair_span.end_pos().line_col(); |
| |
| SourceSpan::new( |
| String::from(pair_span.as_str()), |
| start_line, |
| end_line, |
| start_column, |
| end_column, |
| ) |
| }; |
| |
| let mut element = Element { |
| source_span, |
| ..Element::default() |
| }; |
| |
| for pair in pair.into_inner() { |
| match pair.as_rule() { |
| Rule::node_element | Rule::el_raw_text => { |
| match Self::build_node_element(pair, dom) { |
| Ok(el) => { |
| if let Some(child_element) = el { |
| element.children.push(child_element) |
| } |
| } |
| Err(error) => { |
| dom.errors.push(format!("{}", error)); |
| } |
| } |
| } |
| Rule::node_text | Rule::el_raw_text_content => { |
| let text = pair.as_str().to_string(); |
| if !text.trim().is_empty() { |
| element.children.push(Node::Text(text)); |
| } |
| } |
| Rule::node_comment => { |
| element |
| .children |
| .push(Node::Comment(pair.into_inner().as_str().to_string())); |
| } |
| // TODO: To enable some kind of validation we should probably align this with |
| // https://html.spec.whatwg.org/multipage/syntax.html#elements-2 |
| // Also see element variants |
| Rule::el_name | Rule::el_void_name | Rule::el_raw_text_name => { |
| element.name = pair.as_str().to_string(); |
| } |
| Rule::attr => match Self::build_attribute(pair.into_inner()) { |
| Ok((attr_key, attr_value)) => { |
| match attr_key.as_str() { |
| "id" => element.id = attr_value, |
| "class" => { |
| if let Some(classes) = attr_value { |
| let classes = classes.split_whitespace().collect::<Vec<_>>(); |
| for class in classes { |
| element.classes.push(class.to_string()); |
| } |
| } |
| } |
| _ => { |
| element.attributes.insert(attr_key, attr_value); |
| } |
| }; |
| } |
| Err(error) => { |
| dom.errors.push(format!("{}", error)); |
| } |
| }, |
| Rule::el_normal_end | Rule::el_raw_text_end => { |
| element.variant = ElementVariant::Normal; |
| break; |
| } |
| Rule::el_dangling => (), |
| Rule::EOI => (), |
| _ => { |
| return Err(Error::Parsing(format!( |
| "Failed to create element at rule: {:?}", |
| pair.as_rule() |
| ))) |
| } |
| } |
| } |
| if element.name != "" { |
| Ok(Some(Node::Element(element))) |
| } else { |
| Ok(None) |
| } |
| } |
| |
| fn build_attribute(pairs: Pairs<Rule>) -> Result<(String, Option<String>)> { |
| let mut attribute = ("".to_string(), None); |
| for pair in pairs { |
| match pair.as_rule() { |
| Rule::attr_key => { |
| attribute.0 = pair.as_str().trim().to_string(); |
| } |
| Rule::attr_non_quoted => { |
| attribute.1 = Some(pair.as_str().trim().to_string()); |
| } |
| Rule::attr_quoted => { |
| let inner_pair = pair |
| .into_inner() |
| .into_iter() |
| .next() |
| .expect("attribute value"); |
| |
| match inner_pair.as_rule() { |
| Rule::attr_value => attribute.1 = Some(inner_pair.as_str().to_string()), |
| _ => { |
| return Err(Error::Parsing(format!( |
| "Failed to parse attr value: {:?}", |
| inner_pair.as_rule() |
| ))) |
| } |
| } |
| } |
| _ => { |
| return Err(Error::Parsing(format!( |
| "Failed to parse attr: {:?}", |
| pair.as_rule() |
| ))) |
| } |
| } |
| } |
| Ok(attribute) |
| } |
| } |