blob: 686d0d488619707fe4d83e9664271d662d6737f7 [file] [log] [blame]
//! Contains parser configuration structure.
use std::collections::HashMap;
use std::io::Read;
use crate::reader::EventReader;
use crate::util::Encoding;
/// Default cap on the length of the string that custom entities may expand into.
///
/// Together with the depth limit below, this defends against the "billion laughs" attack.
const DEFAULT_MAX_ENTITY_EXPANSION_LENGTH: usize = 1_000_000;
/// Default cap on how many times entities may expand into other entities.
const DEFAULT_MAX_ENTITY_EXPANSION_DEPTH: u8 = 10;
/// Parser configuration structure. **There are more config methods than public fields — see methods below**.
///
/// This structure contains various configuration options which affect
/// the behavior of the parser.
#[derive(Clone, PartialEq, Eq, Debug)]
pub struct ParserConfig {
/// Whether whitespace in textual events should be removed. Default is false.
///
/// When true, all standalone whitespace will be removed (this means no
/// `Whitespace` events will be emitted), and leading and trailing whitespace
/// from `Characters` events will be deleted. If a `Characters` event is empty
/// after trimming, it will also be omitted from the output stream. This is
/// possible, however, only if the `whitespace_to_characters` or
/// `cdata_to_characters` options are set.
///
/// This option does not affect CDATA events, unless the `cdata_to_characters`
/// option is also set. In that case CDATA content will also be trimmed.
pub trim_whitespace: bool,
/// Whether whitespace should be converted to characters.
/// Default is false.
///
/// If true, `Characters` events with the same content will be emitted
/// instead of `Whitespace` events. If `trim_whitespace` is also true, these
/// events will be trimmed to nothing and, consequently, not emitted.
pub whitespace_to_characters: bool,
/// Whether CDATA should be converted to characters.
/// Default is false.
///
/// If true, `Characters` events with the same content will be emitted
/// instead of `CData` events. If `trim_whitespace` is also true, these events
/// will be trimmed. If the corresponding CDATA contained nothing but whitespace,
/// the event will be omitted from the stream.
pub cdata_to_characters: bool,
/// Whether comments should be omitted. Default is true.
///
/// If true, `Comment` events will not be emitted at all.
pub ignore_comments: bool,
/// Whether sequential `Characters` events should be merged.
/// Default is true.
///
/// If true, multiple sequential `Characters` events will be merged into
/// a single event, that is, their data will be concatenated.
///
/// Multiple sequential `Characters` events are only possible if either
/// `cdata_to_characters` or `ignore_comments` is set. Otherwise character
/// events will always be separated by other events.
pub coalesce_characters: bool,
/// A map of extra entities recognized by the parser. Default is an empty map.
///
/// By default the XML parser recognizes the entities defined in the XML spec. Sometimes,
/// however, it is convenient to make the parser recognize additional entities which
/// are also not available through the DTD definitions (especially given that at the moment
/// DTD parsing is not supported).
pub extra_entities: HashMap<String, String>,
/// Whether the parser should ignore the end of stream. Default is false.
///
/// By default the parser will either error out when it encounters a premature end of
/// stream or complete normally if the end of stream was expected. If you want to continue
/// reading from a stream whose input is supplied progressively, you can set this option to true.
/// In this case the parser will allow you to invoke the `next()` method even if a supposed end
/// of stream has happened.
///
/// Note that support for this functionality is incomplete; for example, the parser will fail if
/// the premature end of stream happens inside PCDATA. Therefore, use this option at your own risk.
pub ignore_end_of_stream: bool,
/// Whether non-unicode entity references get replaced with the replacement character.
/// Default is false.
///
/// When true, any decimal or hexadecimal character reference that cannot be converted from a
/// u32 to a char using [std::char::from_u32](https://doc.rust-lang.org/std/char/fn.from_u32.html)
/// will be converted into the unicode REPLACEMENT CHARACTER (U+FFFD).
pub replace_unknown_entity_references: bool,
/// Whether whitespace at the root level of the document is ignored. Default is true.
///
/// By default any whitespace that is not enclosed within at least one level of elements will be
/// ignored. Setting this value to false will cause root level whitespace events to be emitted.
///
/// **There are more configuration options – see methods below**
pub ignore_root_level_whitespace: bool,
}
impl ParserConfig {
    /// Builds a configuration in which every option holds its default value.
    ///
    /// The result can be adjusted by chaining the builder-style setters:
    ///
    /// ```rust
    /// use xml::reader::ParserConfig;
    ///
    /// let config = ParserConfig::new()
    ///     .trim_whitespace(true)
    ///     .ignore_comments(true)
    ///     .coalesce_characters(false);
    /// ```
    #[must_use]
    #[inline]
    pub fn new() -> ParserConfig {
        Self {
            // Options that are switched on out of the box.
            ignore_comments: true,
            coalesce_characters: true,
            ignore_root_level_whitespace: true,
            // Options that are switched off out of the box.
            trim_whitespace: false,
            whitespace_to_characters: false,
            cdata_to_characters: false,
            ignore_end_of_stream: false,
            replace_unknown_entity_references: false,
            // No user-supplied entities until `add_entity` is called.
            extra_entities: HashMap::new(),
        }
    }

    /// Consumes the configuration and wraps `source` in an XML reader that uses it.
    ///
    /// Handy for configuring and constructing a reader in a single expression:
    ///
    /// ```rust
    /// use xml::reader::ParserConfig;
    ///
    /// let mut source: &[u8] = b"...";
    ///
    /// let reader = ParserConfig::new()
    ///     .trim_whitespace(true)
    ///     .ignore_comments(true)
    ///     .coalesce_characters(false)
    ///     .create_reader(&mut source);
    /// ```
    ///
    /// Calling this is the same as passing the source and this configuration
    /// to `EventReader::new_with_config()`.
    #[inline]
    pub fn create_reader<R: Read>(self, source: R) -> EventReader<R> {
        let config = self;
        EventReader::new_with_config(source, config)
    }

    /// Registers an extra entity and hands the updated configuration back.
    ///
    /// `entity` is the entity name and `value` is the text stored for it in
    /// `extra_entities`. Registering the same name again replaces the earlier value.
    ///
    /// ```rust
    /// use xml::reader::ParserConfig;
    ///
    /// let mut source: &[u8] = b"...";
    ///
    /// let reader = ParserConfig::new()
    ///     .add_entity("nbsp", "\u{a0}")
    ///     .add_entity("copy", "©")
    ///     .add_entity("reg", "®")
    ///     .create_reader(&mut source);
    /// ```
    pub fn add_entity<S: Into<String>, T: Into<String>>(mut self, entity: S, value: T) -> ParserConfig {
        let name: String = entity.into();
        let replacement: String = value.into();
        self.extra_entities.insert(name, replacement);
        self
    }
}
impl Default for ParserConfig {
#[inline]
fn default() -> ParserConfig {
ParserConfig::new()
}
}
// Generates the chainable setter methods on `ParserConfig` used in the builder
// examples above, e.g. `ParserConfig::new().trim_whitespace(true)`.
// Each entry is `field: val Type`; the `gen_setters!` macro is defined elsewhere,
// so the exact expansion of the `val` kind is not visible here — presumably it
// stores the argument in the field of the same name and returns the config.
gen_setters! { ParserConfig,
trim_whitespace: val bool,
whitespace_to_characters: val bool,
cdata_to_characters: val bool,
ignore_comments: val bool,
coalesce_characters: val bool,
ignore_end_of_stream: val bool,
replace_unknown_entity_references: val bool,
ignore_root_level_whitespace: val bool
}
/// Backwards-compatible extension of `ParserConfig`, which will eventually be merged into the original `ParserConfig` struct.
///
/// It wraps a `ParserConfig` and adds encoding overrides and resource limits on top of it.
#[derive(Clone, PartialEq, Eq, Debug)]
#[non_exhaustive]
pub struct ParserConfig2 {
/// The original `ParserConfig` options wrapped by this extension.
pub(crate) c: ParserConfig,
/// Use this encoding as the default. Necessary for UTF-16 files without BOM.
/// Default is `None`.
pub override_encoding: Option<Encoding>,
/// Allow `<?xml encoding="…">` to contain unsupported encoding names,
/// and interpret them as Latin1 instead. This will mangle non-ASCII characters, but usually it won't fail parsing.
/// Default is false.
pub ignore_invalid_encoding_declarations: bool,
/// Whether documents with multiple root elements are accepted.
/// Such documents are ill-formed XML. Default is true.
pub allow_multiple_root_elements: bool,
/// Abort if custom entities create a string longer than this.
/// Default is 1,000,000.
pub max_entity_expansion_length: usize,
/// Entities can expand into other entities this many times (be careful about exponential cost!).
/// Default is 10.
pub max_entity_expansion_depth: u8,
/// Maximum length of tag name or attribute name. Default is `1 << 18`.
pub max_name_length: usize,
/// Max number of attributes per element. Default is `1 << 16`.
pub max_attributes: usize,
/// Max number of bytes in each attribute. Default is `1 << 30`.
pub max_attribute_length: usize,
/// Maximum length of strings representing characters, comments, and processing instructions.
/// Default is `1 << 30`.
pub max_data_length: usize,
}
impl Default for ParserConfig2 {
    /// Builds the extended configuration with default base options, no encoding
    /// override, and the default resource limits.
    fn default() -> Self {
        Self {
            c: Default::default(),

            // Encoding handling: trust the document, reject unknown encoding names.
            override_encoding: None,
            ignore_invalid_encoding_declarations: false,

            // Multiple root elements are accepted by default.
            allow_multiple_root_elements: true,

            // Entity expansion limits.
            max_entity_expansion_length: DEFAULT_MAX_ENTITY_EXPANSION_LENGTH,
            max_entity_expansion_depth: DEFAULT_MAX_ENTITY_EXPANSION_DEPTH,

            // Size limits for names, attributes and text data.
            max_name_length: 1 << 18,
            max_attributes: 1 << 16,
            max_attribute_length: 1 << 30,
            max_data_length: 1 << 30,
        }
    }
}
impl ParserConfig2 {
    /// Returns a configuration with every option at its default value.
    ///
    /// Equivalent to `ParserConfig2::default()`.
    #[inline]
    #[must_use]
    pub fn new() -> Self {
        Self::default()
    }

    /// Read character encoding from `Content-Type` header.
    /// Set this when parsing XML documents fetched over HTTP.
    ///
    /// `mime_type` is the header value, e.g. `text/xml; charset=utf-8`. The `charset`
    /// parameter is located among the `;`-separated parameters that follow the media
    /// type. Its name is matched case-insensitively, surrounding whitespace and double
    /// quotes are stripped from its value, and any other parameters before or after it
    /// are ignored.
    ///
    /// If the header has no `charset` parameter, or its value is not a recognized
    /// encoding name, the configuration is returned unchanged.
    ///
    /// `text/*` MIME types do *not* imply latin1. UTF-8 is always the default fallback.
    #[must_use]
    pub fn content_type(mut self, mime_type: &str) -> Self {
        let charset = mime_type
            .split(';')
            // The first segment is the media type itself, not a parameter.
            .skip(1)
            .filter_map(|param| param.split_once('='))
            // Parameter names are case-insensitive; compare the whole name so that
            // unrelated parameters merely containing "charset" are not picked up.
            .find(|(name, _)| name.trim().eq_ignore_ascii_case("charset"))
            .map(|(_, value)| value.trim().trim_matches('"'));

        if let Some(name) = charset {
            // Unknown encoding names are ignored on purpose: the header is only a hint.
            if let Ok(enc) = name.parse() {
                self.override_encoding = Some(enc);
            }
        }
        self
    }

    /// Creates an XML reader with this configuration.
    ///
    /// This is a convenience method for configuring and creating a reader at the same time:
    ///
    /// ```rust
    /// use xml::reader::ParserConfig;
    ///
    /// let mut source: &[u8] = b"...";
    ///
    /// let reader = ParserConfig::new()
    ///     .trim_whitespace(true)
    ///     .ignore_comments(true)
    ///     .coalesce_characters(false)
    ///     .create_reader(&mut source);
    /// ```
    ///
    /// This method is exactly equivalent to calling `EventReader::new_with_config()` with
    /// this configuration object.
    #[inline]
    pub fn create_reader<R: Read>(self, source: R) -> EventReader<R> {
        EventReader::new_with_config(source, self)
    }
}
impl From<ParserConfig> for ParserConfig2 {
    /// Wraps a plain `ParserConfig`, leaving every extended option at its default.
    #[inline]
    fn from(c: ParserConfig) -> Self {
        // Only the wrapped base config comes from the caller; the rest is defaulted.
        Self { c, ..Self::default() }
    }
}
// Generates chainable setter methods on `ParserConfig2` for its own fields
// (used e.g. as `ParserConfig2::new().max_entity_expansion_length(1000)`).
// The doc comment on each entry is passed to the macro along with the entry.
gen_setters! { ParserConfig2,
/// Set if you got one in the HTTP header
override_encoding: val Option<Encoding>,
/// Allows invalid documents. There should be only a single root element in XML.
allow_multiple_root_elements: val bool,
/// Abort if custom entities create a string longer than this
max_entity_expansion_length: val usize,
/// Entities can expand into other entities this many times (be careful about exponential cost!)
max_entity_expansion_depth: val u8,
/// Max number of attributes per element
max_attributes: val usize,
/// Maximum length of tag name or attribute name
max_name_length: val usize,
/// Max number of bytes in each attribute
max_attribute_length: val usize,
/// Maximum length of strings representing characters, comments, and processing instructions
max_data_length: val usize,
/// Allow `<?xml encoding="bogus"?>`
ignore_invalid_encoding_declarations: val bool
}
// Generates methods on the original `ParserConfig` for the options that live in
// `ParserConfig2`. The `c2` kind is defined by the `gen_setters!` macro elsewhere;
// presumably each method converts the config into a `ParserConfig2` and forwards
// to the method of the same name there — TODO confirm against the macro definition.
gen_setters! { ParserConfig,
/// Set if you got one in the HTTP header (see `content_type`)
override_encoding: c2 Option<Encoding>,
/// Allow `<?xml encoding="bogus"?>`
ignore_invalid_encoding_declarations: c2 bool,
/// Allows invalid documents. There should be only a single root element in XML.
allow_multiple_root_elements: c2 bool,
/// Abort if custom entities create a string longer than this
max_entity_expansion_length: c2 usize,
/// Entities can expand into other entities this many times (be careful about exponential cost!)
max_entity_expansion_depth: c2 u8,
/// Max number of attributes per element
max_attributes: c2 usize,
/// Maximum length of tag name or attribute name
max_name_length: c2 usize,
/// Max number of bytes in each attribute
max_attribute_length: c2 usize,
/// Maximum length of strings representing characters, comments, and processing instructions
max_data_length: c2 usize,
/// Set encoding from the MIME type. Important for HTTP compatibility.
content_type: c2 &str
}
// Generates methods on `ParserConfig2` named after the options of the original
// `ParserConfig`. The `delegate` kind is defined by the `gen_setters!` macro
// elsewhere; presumably each method updates the corresponding field of the wrapped
// `ParserConfig` (the `c` field) — TODO confirm against the macro definition.
gen_setters! { ParserConfig2,
trim_whitespace: delegate bool,
whitespace_to_characters: delegate bool,
cdata_to_characters: delegate bool,
ignore_comments: delegate bool,
coalesce_characters: delegate bool,
ignore_end_of_stream: delegate bool,
replace_unknown_entity_references: delegate bool,
/// Whether or not whitespace at the root level of the document is ignored. Default is true.
ignore_root_level_whitespace: delegate bool
}
// Verifies that `content_type` picks the charset out of a MIME type string,
// regardless of letter case, surrounding spaces, quoting, or where in the
// setter chain it is called.
#[test]
fn mime_parse() {
    // Mixed-case charset name, `content_type` called before another setter.
    let ascii_config = ParserConfig2::new()
        .content_type("text/xml;charset=Us-AScii")
        .max_entity_expansion_length(1000);
    assert_eq!(ascii_config.override_encoding, Some(Encoding::Ascii));

    // Spaces around `=` and a quoted value, `content_type` called after another setter.
    let utf16_config = ParserConfig2::new()
        .max_entity_expansion_depth(3)
        .content_type("text/xml;charset = \"UTF-16\"");
    assert_eq!(utf16_config.override_encoding, Some(Encoding::Utf16));
}