vendor/html_parser-0.7.0/src/dom/mod.rs - toolchain/rustc - Git at Google

 use crate::Result;
 use pest::{iterators::Pair, iterators::Pairs, Parser};
 use serde::Serialize;
 use std::default::Default;

 use crate::error::Error;
 use crate::grammar::Grammar;
 use crate::Rule;

 pub mod element;
 pub mod formatting;
 pub mod node;
 pub mod span;

 use crate::dom::span::SourceSpan;
 use element::{Element, ElementVariant};
 use node::Node;

 /// The kind of tree a parse produced: Document, DocumentFragment or Empty.
 ///
 /// Variant names are serialized in camelCase.
 #[derive(Debug, Clone, PartialEq, Serialize)]
 #[serde(rename_all = "camelCase")]
 pub enum DomVariant {
     /// The parsed html had the representation of an html document. The doctype is optional,
     /// but a document should only have one root node with the name of html.
     ///
     /// Example:
     /// ```text
     /// <!doctype html>
     /// <html>
     ///     <head></head>
     ///     <body>
     ///         <h1>Hello world</h1>
     ///     </body>
     /// </html>
     /// ```
     Document,
     /// The parsed html did not have the representation of a document. A fragment can have
     /// multiple root children of any name except html, body or head.
     ///
     /// Example:
     /// ```text
     /// <h1>Hello world</h1>
     /// ```
     DocumentFragment,
     /// The input was empty.
     Empty,
 }

 /// **The main struct** & the result of the parsed html.
 ///
 /// Field names are serialized in camelCase; see the individual fields for what is left
 /// out of the serialized output.
 #[derive(Debug, Clone, Serialize, PartialEq)]
 #[serde(rename_all = "camelCase")]
 pub struct Dom {
     /// The type of the tree that was parsed
     pub tree_type: DomVariant,

     /// All of the root children in the tree. Omitted from the serialized output when empty.
     #[serde(skip_serializing_if = "Vec::is_empty")]
     pub children: Vec<Node>,

     /// A collection of all errors during parsing. Never part of the serialized output.
     #[serde(skip_serializing)]
     pub errors: Vec<String>,
 }

 impl Default for Dom {
     fn default() -> Self {
         Self {
             tree_type: DomVariant::Empty,
             children: vec![],
             errors: vec![],
         }
     }
 }

 impl Dom {
     pub fn parse(input: &str) -> Result<Self> {
         let pairs = match Grammar::parse(Rule::html, input) {
             Ok(pairs) => pairs,
             Err(error) => return formatting::error_msg(error),
         };
         Self::build_dom(pairs)
     }

     pub fn to_json(&self) -> Result<String> {
         Ok(serde_json::to_string(self)?)
     }

     pub fn to_json_pretty(&self) -> Result<String> {
         Ok(serde_json::to_string_pretty(self)?)
     }

     fn build_dom(pairs: Pairs<Rule>) -> Result<Self> {
         let mut dom = Self::default();

         // NOTE: The logic is roughly as follows:
         // 1) A document containing nothing but comments is DomVariant::Empty even though it will have
         //    children in this first pass.  We fix this in the next section.  This allows us to use
         //    DomVariant::Empty to indicate "we haven't decided the type yet".
         // 2) If the type is DomVariant::Empty _so far_, then it can be changed to DomVariant::Document
         //    or DomVariant::DocumentFragment.  DomVariant is only selected in this stage if we see a
         //    DOCTYPE tag.  Comments do not change the type.
         // 3) If the type is non-empty, we don't re-set the type.  We do look for conflicts between
         //    the type and the tokens in the next stage.
         for pair in pairs {
             match pair.as_rule() {
                 // A <!DOCTYPE> tag means a full-fledged document.  Note that because of the way
                 // the grammar is written, we will only get this token if the <!DOCTYPE> occurs
                 // before any other tag; otherwise it will be parsed as a custom tag.
                 Rule::doctype => {
                     if dom.tree_type == DomVariant::Empty {
                         dom.tree_type = DomVariant::Document;
                     }
                 }

                 // If we see an element, build the sub-tree and add it as a child.  If we don't
                 // have a document type yet (i.e. "empty"), select DocumentFragment
                 Rule::node_element => match Self::build_node_element(pair, &mut dom) {
                     Ok(el) => {
                         if let Some(node) = el {
                             if dom.tree_type == DomVariant::Empty {
                                 dom.tree_type = DomVariant::DocumentFragment;
                             };
                             dom.children.push(node);
                         }
                     }
                     Err(error) => {
                         dom.errors.push(format!("{}", error));
                     }
                 },

                 // Similar to an element, we add it as a child and select DocumentFragment if we
                 // don't already have a document type.
                 Rule::node_text => {
                     if dom.tree_type == DomVariant::Empty {
                         dom.tree_type = DomVariant::DocumentFragment;
                     }
                     let text = pair.as_str().to_string();
                     if !text.trim().is_empty() {
                         dom.children.push(Node::Text(text));
                     }
                 }

                 // Store comments as a child, but it doesn't affect the document type selection
                 // until the next phase (validation).
                 Rule::node_comment => {
                     dom.children
                         .push(Node::Comment(pair.into_inner().as_str().to_string()));
                 }

                 // Ignore 'end of input', which then allows the catch-all unreachable!() arm to
                 // function properly.
                 Rule::EOI => (),

                 // This should be unreachable, due to the way the grammar is written
                 _ => unreachable!("[build dom] unknown rule: {:?}", pair.as_rule()),
             };
         }

         // Implement some checks on the generated dom's data and initial type.  The type may be
         // modified in this section.
         match dom.tree_type {
             // A DomVariant::Empty can only have comments. Anything else is an error.
             DomVariant::Empty => {
                 for node in &dom.children {
                     if let Node::Comment(_) = node {
                         // An "empty" document, but it has comments - this is where we cleanup the
                         // earlier assumption that a document with only comments is "empty".
                         // Really, it is a "fragment".
                         dom.tree_type = DomVariant::DocumentFragment
                     } else {
                         // Anything else (i.e. Text() or Element() ) can't happen at the top level;
                         // if we had seen one, we would have set the document type above
                         unreachable!("[build dom] empty document with an Element {:?}", node)
                     }
                 }
             }

             // A DomVariant::Document can only have comments and an <HTML> node at the top level.
             // Only one <HTML> tag is permitted.
             DomVariant::Document => {
                 if dom
                     .children
                     .iter()
                     .filter(|x| match x {
                         Node::Element(el) if el.name.to_lowercase() == "html" => true,
                         _ => false,
                     })
                     .count()
                     > 1
                 {
                     return Err(Error::Parsing(format!("Document with multiple HTML tags",)));
                 }
             }

             // A DomVariant::DocumentFragment should not have <HEAD>, or <BODY> tags at the
             // top-level.  If we find an <HTML> tag, then we consider this a Document instead (if
             // it comes before any other elements, and if there is only one <HTML> tag).
             DomVariant::DocumentFragment => {
                 let mut seen_html = false;
                 let mut seen_elements = false;

                 for node in &dom.children {
                     match node {
                         // Nodes other than <HTML> - reject <HEAD> and <BODY>
                         Node::Element(ref el) if el.name.clone().to_lowercase() != "html" => {
                             if el.name == "head" || el.name == "body" {
                                 return Err(Error::Parsing(format!(
                                     "A document fragment should not include {}",
                                     el.name
                                 )));
                             }
                             seen_elements = true;
                         }
                         // <HTML> Nodes - one (before any other elements) is okay
                         Node::Element(ref el) if el.name.clone().to_lowercase() == "html" => {
                             if seen_html || seen_elements {
                                 return Err(Error::Parsing(format!(
                                     "A document fragment should not include {}",
                                     el.name
                                 )));
                             };

                             // A fragment with just an <HTML> tag is a document
                             dom.tree_type = DomVariant::Document;
                             seen_html = true;
                         }
                         // Comment() and Text() nodes are permitted at the top-level of a
                         // DocumentFragment
                         _ => (),
                     }
                 }
             }
         }

         // The result is the validated tree
         Ok(dom)
     }

     /// Builds one element node (and, recursively, its children) from `pair`.
     ///
     /// The element records the source text and the line/column span of `pair`.  Its inner
     /// pairs supply the tag name, attributes (`id` and `class` are stored in dedicated
     /// fields, everything else in `attributes`), child elements, non-blank text and
     /// comments.  Reaching an end-tag rule marks the element as `ElementVariant::Normal`
     /// and stops processing further inner pairs.
     ///
     /// Errors from nested elements and from attribute parsing are appended to
     /// `dom.errors` rather than returned.
     ///
     /// Returns `Ok(None)` when no tag name was found.
     ///
     /// # Errors
     ///
     /// Returns `Error::Parsing` when an inner pair has a rule this function does not handle.
     fn build_node_element(pair: Pair<Rule>, dom: &mut Dom) -> Result<Option<Node>> {
         let source_span = {
             let pair_span = pair.as_span();
             let (start_line, start_column) = pair_span.start_pos().line_col();
             let (end_line, end_column) = pair_span.end_pos().line_col();

             SourceSpan::new(
                 String::from(pair_span.as_str()),
                 start_line,
                 end_line,
                 start_column,
                 end_column,
             )
         };

         let mut element = Element {
             source_span,
             ..Element::default()
         };

         for pair in pair.into_inner() {
             match pair.as_rule() {
                 Rule::node_element | Rule::el_raw_text => {
                     match Self::build_node_element(pair, dom) {
                         Ok(Some(child_element)) => element.children.push(child_element),
                         Ok(None) => (),
                         Err(error) => dom.errors.push(error.to_string()),
                     }
                 }
                 Rule::node_text | Rule::el_raw_text_content => {
                     // Only allocate once we know the text is not just whitespace.
                     let text = pair.as_str();
                     if !text.trim().is_empty() {
                         element.children.push(Node::Text(text.to_string()));
                     }
                 }
                 Rule::node_comment => {
                     element
                         .children
                         .push(Node::Comment(pair.into_inner().as_str().to_string()));
                 }
                 // TODO: To enable some kind of validation we should probably align this with
                 // https://html.spec.whatwg.org/multipage/syntax.html#elements-2
                 // Also see element variants
                 Rule::el_name | Rule::el_void_name | Rule::el_raw_text_name => {
                     element.name = pair.as_str().to_string();
                 }
                 Rule::attr => match Self::build_attribute(pair.into_inner()) {
                     Ok((attr_key, attr_value)) => match attr_key.as_str() {
                         "id" => element.id = attr_value,
                         "class" => {
                             // A valueless `class` attribute contributes no classes.
                             if let Some(classes) = attr_value {
                                 element
                                     .classes
                                     .extend(classes.split_whitespace().map(str::to_string));
                             }
                         }
                         _ => {
                             element.attributes.insert(attr_key, attr_value);
                         }
                     },
                     Err(error) => dom.errors.push(error.to_string()),
                 },
                 Rule::el_normal_end | Rule::el_raw_text_end => {
                     element.variant = ElementVariant::Normal;
                     break;
                 }
                 Rule::el_dangling => (),
                 Rule::EOI => (),
                 _ => {
                     return Err(Error::Parsing(format!(
                         "Failed to create element at rule: {:?}",
                         pair.as_rule()
                     )))
                 }
             }
         }

         if element.name.is_empty() {
             Ok(None)
         } else {
             Ok(Some(Node::Element(element)))
         }
     }

     /// Extracts an attribute's key and optional value from the inner pairs of an `attr` rule.
     ///
     /// The key and an unquoted value are trimmed; a quoted value is taken verbatim from
     /// the `attr_value` pair inside the quotes.  The key stays empty and the value stays
     /// `None` when the corresponding pair never appears.
     ///
     /// # Errors
     ///
     /// Returns `Error::Parsing` for any pair that is not a key, an unquoted value or a
     /// quoted value, and for a quoted value whose first inner pair is not `attr_value`.
     ///
     /// # Panics
     ///
     /// Panics if a quoted value has no inner pair at all.
     fn build_attribute(pairs: Pairs<Rule>) -> Result<(String, Option<String>)> {
         let mut key = String::new();
         let mut value = None;

         for pair in pairs {
             match pair.as_rule() {
                 Rule::attr_key => key = pair.as_str().trim().to_string(),
                 Rule::attr_non_quoted => value = Some(pair.as_str().trim().to_string()),
                 Rule::attr_quoted => {
                     let quoted = pair.into_inner().next().expect("attribute value");

                     match quoted.as_rule() {
                         Rule::attr_value => value = Some(quoted.as_str().to_string()),
                         other => {
                             return Err(Error::Parsing(format!(
                                 "Failed to parse attr value: {:?}",
                                 other
                             )))
                         }
                     }
                 }
                 unexpected => {
                     return Err(Error::Parsing(format!(
                         "Failed to parse attr: {:?}",
                         unexpected
                     )))
                 }
             }
         }

         Ok((key, value))
     }
 }
	use crate::Result;
	use pest::{iterators::Pair, iterators::Pairs, Parser};
	use serde::Serialize;
	use std::default::Default;

	use crate::error::Error;
	use crate::grammar::Grammar;
	use crate::Rule;

	pub mod element;
	pub mod formatting;
	pub mod node;
	pub mod span;

	use crate::dom::span::SourceSpan;
	use element::{Element, ElementVariant};
	use node::Node;

	/// The kind of tree a parse produced: Document, DocumentFragment or Empty.
	///
	/// Variant names are serialized in camelCase.
	#[derive(Debug, Clone, PartialEq, Serialize)]
	#[serde(rename_all = "camelCase")]
	pub enum DomVariant {
	/// The parsed html had the representation of an html document. The doctype is optional,
	/// but a document should only have one root node with the name of html.
	///
	/// Example:
	/// ```text
	/// <!doctype html>
	/// <html>
	///     <head></head>
	///     <body>
	///         <h1>Hello world</h1>
	///     </body>
	/// </html>
	/// ```
	Document,
	/// The parsed html did not have the representation of a document. A fragment can have
	/// multiple root children of any name except html, body or head.
	///
	/// Example:
	/// ```text
	/// <h1>Hello world</h1>
	/// ```
	DocumentFragment,
	/// The input was empty.
	Empty,
	}

	/// The main struct & the result of the parsed html.
	///
	/// Field names are serialized in camelCase; see the individual fields for what is left
	/// out of the serialized output.
	#[derive(Debug, Clone, Serialize, PartialEq)]
	#[serde(rename_all = "camelCase")]
	pub struct Dom {
	/// The type of the tree that was parsed
	pub tree_type: DomVariant,

	/// All of the root children in the tree. Omitted from the serialized output when empty.
	#[serde(skip_serializing_if = "Vec::is_empty")]
	pub children: Vec<Node>,

	/// A collection of all errors during parsing. Never part of the serialized output.
	#[serde(skip_serializing)]
	pub errors: Vec<String>,
	}

	impl Default for Dom {
	fn default() -> Self {
	Self {
	tree_type: DomVariant::Empty,
	children: vec![],
	errors: vec![],
	}
	}
	}

	impl Dom {
	pub fn parse(input: &str) -> Result<Self> {
	let pairs = match Grammar::parse(Rule::html, input) {
	Ok(pairs) => pairs,
	Err(error) => return formatting::error_msg(error),
	};
	Self::build_dom(pairs)
	}

	pub fn to_json(&self) -> Result<String> {
	Ok(serde_json::to_string(self)?)
	}

	pub fn to_json_pretty(&self) -> Result<String> {
	Ok(serde_json::to_string_pretty(self)?)
	}

	fn build_dom(pairs: Pairs<Rule>) -> Result<Self> {
	let mut dom = Self::default();

	// NOTE: The logic is roughly as follows:
	// 1) A document containing nothing but comments is DomVariant::Empty even though it will have
	// children in this first pass. We fix this in the next section. This allows us to use
	// DomVariant::Empty to indicate "we haven't decided the type yet".
	// 2) If the type is DomVariant::Empty _so far_, then it can be changed to DomVariant::Document
	// or DomVariant::DocumentFragment. DomVariant is only selected in this stage if we see a
	// DOCTYPE tag. Comments do not change the type.
	// 3) If the type is non-empty, we don't re-set the type. We do look for conflicts between
	// the type and the tokens in the next stage.
	for pair in pairs {
	match pair.as_rule() {
	// A <!DOCTYPE> tag means a full-fledged document. Note that because of the way
	// the grammar is written, we will only get this token if the <!DOCTYPE> occurs
	// before any other tag; otherwise it will be parsed as a custom tag.
	Rule::doctype => {
	if dom.tree_type == DomVariant::Empty {
	dom.tree_type = DomVariant::Document;
	}
	}

	// If we see an element, build the sub-tree and add it as a child. If we don't
	// have a document type yet (i.e. "empty"), select DocumentFragment
	Rule::node_element => match Self::build_node_element(pair, &mut dom) {
	Ok(el) => {
	if let Some(node) = el {
	if dom.tree_type == DomVariant::Empty {
	dom.tree_type = DomVariant::DocumentFragment;
	};
	dom.children.push(node);
	}
	}
	Err(error) => {
	dom.errors.push(format!("{}", error));
	}
	},

	// Similar to an element, we add it as a child and select DocumentFragment if we
	// don't already have a document type.
	Rule::node_text => {
	if dom.tree_type == DomVariant::Empty {
	dom.tree_type = DomVariant::DocumentFragment;
	}
	let text = pair.as_str().to_string();
	if !text.trim().is_empty() {
	dom.children.push(Node::Text(text));
	}
	}

	// Store comments as a child, but it doesn't affect the document type selection
	// until the next phase (validation).
	Rule::node_comment => {
	dom.children
	.push(Node::Comment(pair.into_inner().as_str().to_string()));
	}

	// Ignore 'end of input', which then allows the catch-all unreachable!() arm to
	// function properly.
	Rule::EOI => (),

	// This should be unreachable, due to the way the grammar is written
	_ => unreachable!("[build dom] unknown rule: {:?}", pair.as_rule()),
	};
	}

	// Implement some checks on the generated dom's data and initial type. The type may be
	// modified in this section.
	match dom.tree_type {
	// A DomVariant::Empty can only have comments. Anything else is an error.
	DomVariant::Empty => {
	for node in &dom.children {
	if let Node::Comment(_) = node {
	// An "empty" document, but it has comments - this is where we cleanup the
	// earlier assumption that a document with only comments is "empty".
	// Really, it is a "fragment".
	dom.tree_type = DomVariant::DocumentFragment
	} else {
	// Anything else (i.e. Text() or Element() ) can't happen at the top level;
	// if we had seen one, we would have set the document type above
	unreachable!("[build dom] empty document with an Element {:?}", node)
	}
	}
	}

	// A DomVariant::Document can only have comments and an <HTML> node at the top level.
	// Only one <HTML> tag is permitted.
	DomVariant::Document => {
	if dom
	.children
	.iter()
	.filter(\|x\| match x {
	Node::Element(el) if el.name.to_lowercase() == "html" => true,
	_ => false,
	})
	.count()
	> 1
	{
	return Err(Error::Parsing(format!("Document with multiple HTML tags",)));
	}
	}

	// A DomVariant::DocumentFragment should not have <HEAD>, or <BODY> tags at the
	// top-level. If we find an <HTML> tag, then we consider this a Document instead (if
	// it comes before any other elements, and if there is only one <HTML> tag).
	DomVariant::DocumentFragment => {
	let mut seen_html = false;
	let mut seen_elements = false;

	for node in &dom.children {
	match node {
	// Nodes other than <HTML> - reject <HEAD> and <BODY>
	Node::Element(ref el) if el.name.clone().to_lowercase() != "html" => {
	if el.name == "head" \|\| el.name == "body" {
	return Err(Error::Parsing(format!(
	"A document fragment should not include {}",
	el.name
	)));
	}
	seen_elements = true;
	}
	// <HTML> Nodes - one (before any other elements) is okay
	Node::Element(ref el) if el.name.clone().to_lowercase() == "html" => {
	if seen_html \|\| seen_elements {
	return Err(Error::Parsing(format!(
	"A document fragment should not include {}",
	el.name
	)));
	};

	// A fragment with just an <HTML> tag is a document
	dom.tree_type = DomVariant::Document;
	seen_html = true;
	}
	// Comment() and Text() nodes are permitted at the top-level of a
	// DocumentFragment
	_ => (),
	}
	}
	}
	}

	// The result is the validated tree
	Ok(dom)
	}

	/// Builds one element node (and, recursively, its children) from `pair`.
	///
	/// The element records the source text and the line/column span of `pair`.  Its inner
	/// pairs supply the tag name, attributes (`id` and `class` are stored in dedicated
	/// fields, everything else in `attributes`), child elements, non-blank text and
	/// comments.  Reaching an end-tag rule marks the element as `ElementVariant::Normal`
	/// and stops processing further inner pairs.
	///
	/// Errors from nested elements and from attribute parsing are appended to
	/// `dom.errors` rather than returned.
	///
	/// Returns `Ok(None)` when no tag name was found.
	///
	/// # Errors
	///
	/// Returns `Error::Parsing` when an inner pair has a rule this function does not handle.
	fn build_node_element(pair: Pair<Rule>, dom: &mut Dom) -> Result<Option<Node>> {
	    let source_span = {
	        let pair_span = pair.as_span();
	        let (start_line, start_column) = pair_span.start_pos().line_col();
	        let (end_line, end_column) = pair_span.end_pos().line_col();

	        SourceSpan::new(
	            String::from(pair_span.as_str()),
	            start_line,
	            end_line,
	            start_column,
	            end_column,
	        )
	    };

	    let mut element = Element {
	        source_span,
	        ..Element::default()
	    };

	    for pair in pair.into_inner() {
	        match pair.as_rule() {
	            Rule::node_element | Rule::el_raw_text => {
	                match Self::build_node_element(pair, dom) {
	                    Ok(Some(child_element)) => element.children.push(child_element),
	                    Ok(None) => (),
	                    Err(error) => dom.errors.push(error.to_string()),
	                }
	            }
	            Rule::node_text | Rule::el_raw_text_content => {
	                // Only allocate once we know the text is not just whitespace.
	                let text = pair.as_str();
	                if !text.trim().is_empty() {
	                    element.children.push(Node::Text(text.to_string()));
	                }
	            }
	            Rule::node_comment => {
	                element
	                    .children
	                    .push(Node::Comment(pair.into_inner().as_str().to_string()));
	            }
	            // TODO: To enable some kind of validation we should probably align this with
	            // https://html.spec.whatwg.org/multipage/syntax.html#elements-2
	            // Also see element variants
	            Rule::el_name | Rule::el_void_name | Rule::el_raw_text_name => {
	                element.name = pair.as_str().to_string();
	            }
	            Rule::attr => match Self::build_attribute(pair.into_inner()) {
	                Ok((attr_key, attr_value)) => match attr_key.as_str() {
	                    "id" => element.id = attr_value,
	                    "class" => {
	                        // A valueless `class` attribute contributes no classes.
	                        if let Some(classes) = attr_value {
	                            element
	                                .classes
	                                .extend(classes.split_whitespace().map(str::to_string));
	                        }
	                    }
	                    _ => {
	                        element.attributes.insert(attr_key, attr_value);
	                    }
	                },
	                Err(error) => dom.errors.push(error.to_string()),
	            },
	            Rule::el_normal_end | Rule::el_raw_text_end => {
	                element.variant = ElementVariant::Normal;
	                break;
	            }
	            Rule::el_dangling => (),
	            Rule::EOI => (),
	            _ => {
	                return Err(Error::Parsing(format!(
	                    "Failed to create element at rule: {:?}",
	                    pair.as_rule()
	                )))
	            }
	        }
	    }

	    if element.name.is_empty() {
	        Ok(None)
	    } else {
	        Ok(Some(Node::Element(element)))
	    }
	}

	/// Extracts an attribute's key and optional value from the inner pairs of an `attr` rule.
	///
	/// The key and an unquoted value are trimmed; a quoted value is taken verbatim from
	/// the `attr_value` pair inside the quotes.  The key stays empty and the value stays
	/// `None` when the corresponding pair never appears.
	///
	/// # Errors
	///
	/// Returns `Error::Parsing` for any pair that is not a key, an unquoted value or a
	/// quoted value, and for a quoted value whose first inner pair is not `attr_value`.
	///
	/// # Panics
	///
	/// Panics if a quoted value has no inner pair at all.
	fn build_attribute(pairs: Pairs<Rule>) -> Result<(String, Option<String>)> {
	    let mut key = String::new();
	    let mut value = None;

	    for pair in pairs {
	        match pair.as_rule() {
	            Rule::attr_key => key = pair.as_str().trim().to_string(),
	            Rule::attr_non_quoted => value = Some(pair.as_str().trim().to_string()),
	            Rule::attr_quoted => {
	                let quoted = pair.into_inner().next().expect("attribute value");

	                match quoted.as_rule() {
	                    Rule::attr_value => value = Some(quoted.as_str().to_string()),
	                    other => {
	                        return Err(Error::Parsing(format!(
	                            "Failed to parse attr value: {:?}",
	                            other
	                        )))
	                    }
	                }
	            }
	            unexpected => {
	                return Err(Error::Parsing(format!(
	                    "Failed to parse attr: {:?}",
	                    unexpected
	                )))
	            }
	        }
	    }

	    Ok((key, value))
	}
	}