blob: 0e285499c336ede40372314cdcaf53fbd0e10c97 [file] [log] [blame] [edit]
//! Parsers for each part of TOML - keys, values, and arrays.
//!
//! Parser rules:
//! 1. Each parser is only responsible for the length of the data it parses. Extraneous whitespace,
//! comments, or invalid characters fall outside the scope of the parsers.
//! 2. Parsers assume that the current index in the [`Text`] is the first character of what they
//! should parse - i.e., the first letter of a key, opening quote of a quoted key, opening bracket
//! of a table, etc.
//! 3. Each parser should leave `text.idx` at the last byte it parsed.
use {crate::crate_prelude::*, std::num::IntErrorKind};
/// Parses a `<key> = <value>` assignment.
///
/// Expects `text.idx` to be on the first byte of the key. The key is parsed with [`parse_key`]
/// and the value with [`parse_value`]; whitespace around the `=` is skipped.
///
/// # Errors
/// - `ErrorKind::NoEqualsInAssignment` if the key is not followed by `=`.
/// - `ErrorKind::NoValueInAssignment` if the text ends after the `=`.
/// - Any error returned by [`parse_key`] or [`parse_value`].
pub fn parse_assignment<'a>(text: &mut Text<'a>) -> Result<(Key<'a>, TomlValue<'a>), Error> {
    let key = parse_key(text)?;

    // `parse_key` leaves the index on the key's last byte; step past it to look for the `=`.
    text.idx += 1;
    text.skip_whitespace();
    if text.current_byte() != Some(b'=') {
        return Err(Error {
            start: key.text.span().start,
            end: text.idx,
            kind: ErrorKind::NoEqualsInAssignment,
        });
    }

    text.idx += 1;
    text.skip_whitespace();
    // `parse_value` only needs a byte to exist at the current index. Checking for that directly
    // (rather than comparing the index against `text.end()`) keeps a one-byte value in the very
    // last position of the text, such as `a=1` without a trailing newline, parseable.
    if text.current_byte().is_none() {
        return Err(Error {
            start: key.text.span().start,
            end: text.idx,
            kind: ErrorKind::NoValueInAssignment,
        });
    }

    let value = parse_value(text)?;
    Ok((key, value))
}
/// Parses a key. Supports quoted, dotted, and bare keys.
///
/// Expects `text.idx` to be on the first byte of the key and leaves it on the last byte of the
/// (possibly dotted) key. Each segment of a dotted key becomes the `child` of the previous one.
///
/// # Errors
/// - `ErrorKind::NoValueInAssignment` if the text ends where a key segment should start, or ends
///   in the middle of a bare key.
/// - `ErrorKind::InvalidBareKey` if a bare key segment is empty.
/// - Any error returned by [`parse_string`] for quoted keys.
pub fn parse_key<'a>(text: &mut Text<'a>) -> Result<Key<'a>, Error> {
    // The recursive call for dotted keys can start past the end of the text (e.g. `a.` at the end
    // of the input), so report an error here rather than unwrapping.
    let Some(first_byte) = text.current_byte() else {
        return Err(Error {
            start: text.idx,
            end: text.idx,
            kind: ErrorKind::NoValueInAssignment,
        });
    };

    let maybe_key = match first_byte {
        b'\'' | b'"' => parse_string(text)?,
        _ => {
            let start = text.idx;
            // One past the last byte that is valid in a bare key.
            let mut current = text.idx;
            while let Some(byte) = text.byte(current) {
                if !byte.is_ascii_alphanumeric() && byte != b'-' && byte != b'_' {
                    break;
                }
                current += 1;
            }

            if text.byte(current).is_none() {
                // Text shouldn't end on a key definition
                return Err(Error {
                    start,
                    end: current,
                    kind: ErrorKind::NoValueInAssignment,
                });
            }
            if start == current {
                // Empty bare keys are not allowed
                return Err(Error {
                    start,
                    end: current,
                    kind: ErrorKind::InvalidBareKey,
                });
            }

            let span = text.excerpt(start..current);
            text.idx = current - 1;
            CowSpan::Raw(span)
        }
    };

    // Check for dotted key: look past the segment (and any whitespace) for a `.`.
    let key_end = text.idx;
    text.idx += 1;
    text.skip_whitespace();

    if text.current_byte() == Some(b'.') {
        text.idx += 1;
        text.skip_whitespace();
        Ok(Key {
            text: maybe_key,
            child: Some(Box::new(parse_key(text)?)),
        })
    } else {
        // Not dotted: rewind to the last byte of this segment.
        text.idx = key_end;
        Ok(Key {
            text: maybe_key,
            child: None,
        })
    }
}
/// Parses a value. Supports all of the non-time-related value types.
pub fn parse_value<'a>(text: &mut Text<'a>) -> Result<TomlValue<'a>, Error> {
match text.current_byte().unwrap() {
// Integer, time, or float
b'0'..=b'9' | b'i' | b'n' => parse_num(text, false),
// Integer or float with +/- modifier
b'+' if text.remaining_bytes() > 0 => {
text.idx += 1;
parse_num(text, false)
}
b'-' if text.remaining_bytes() > 0 => {
text.idx += 1;
parse_num(text, true)
}
// String
b'\'' | b'"' => parse_string(text).map(TomlValue::String),
// Bool
b't' | b'f' if text.remaining_bytes() >= 3 => {
let span = text.excerpt(text.idx..text.idx + 4);
if span.as_str() == "true" {
text.idx = span.end;
return Ok(TomlValue::Boolean(true));
} else if span.as_str() == "fals" && text.byte(text.idx + 4) == Some(b'e') {
text.idx = span.end + 1;
return Ok(TomlValue::Boolean(false));
}
let span = text.excerpt(text.idx..);
Err(Error {
start: span.start,
end: span.find_next_whitespace_or_newline().unwrap_or(text.end()),
kind: ErrorKind::UnrecognisedValue,
})
}
// Array
b'[' => {
if text.remaining_bytes() == 0 {
return Err(Error {
start: text.idx,
end: text.idx,
kind: ErrorKind::UnclosedBracket,
});
}
let mut array = Vec::new();
let mut span = text.excerpt(text.idx..);
let mut seen_comma = true;
text.idx += 1;
loop {
text.skip_whitespace_and_newlines();
match text.current_byte() {
Some(b']') => break,
Some(b',') => {
text.idx += 1;
text.skip_whitespace_and_newlines();
if text.remaining_bytes() == 0 {
return Err(Error {
start: span.start,
end: text.idx,
kind: ErrorKind::UnclosedBracket,
});
}
seen_comma = true;
continue;
}
Some(b'#') => {
text.idx = text.excerpt(text.idx..).find(b'\n').unwrap_or(text.end());
text.skip_whitespace_and_newlines();
continue;
}
Some(_) if !seen_comma => {
return Err(Error {
start: text.idx,
end: text.idx,
kind: ErrorKind::NoCommaDelimeter,
})
}
Some(_) => {}
None => {
return Err(Error {
start: span.start,
end: text.idx,
kind: ErrorKind::UnclosedBracket,
})
}
}
let value = parse_value(text)?;
array.push(value);
span.end = text.idx;
text.idx += 1;
seen_comma = false;
}
Ok(TomlValue::Array(array))
}
// Inline table
b'{' => {
if text.remaining_bytes() == 0 {
return Err(Error {
start: text.idx,
end: text.idx,
kind: ErrorKind::UnclosedBracket,
});
}
let mut table = Table::default();
let mut span = text.excerpt(text.idx..);
text.idx += 1;
loop {
text.skip_whitespace();
// Empty table
if text.current_byte() == Some(b'}') {
break;
}
let (key, value) = parse_assignment(text)?;
let start = key.text.span().start;
let end = key.text.span().end;
let old_value = table.insert(key, value);
if old_value {
return Err(Error {
start,
end,
kind: ErrorKind::ReusedKey,
});
}
span.end = text.idx;
text.idx += 1;
text.skip_whitespace();
match text.current_byte() {
Some(b'}') => break,
Some(b',') => {}
Some(_) => {
return Err(Error {
start: text.idx,
end: text.idx,
kind: ErrorKind::NoCommaDelimeter,
})
}
None => {
return Err(Error {
start: span.start,
end: span.end,
kind: ErrorKind::UnclosedBracket,
})
}
}
text.idx += 1;
}
Ok(TomlValue::Table(table))
}
// ¯\_(ツ)_/¯
_ => {
let span = text.excerpt(text.idx..);
Err(Error {
start: span.start,
end: span.find_next_whitespace_or_newline().unwrap_or(text.end()),
kind: ErrorKind::UnrecognisedValue,
})
}
}
}
fn parse_num<'a>(text: &mut Text<'a>, negative: bool) -> Result<TomlValue<'a>, Error> {
let mut span = Span {
start: text.idx,
end: text.idx,
source: text.text,
};
// inf or nan
let current_byte = text.current_byte().unwrap();
if (current_byte == b'i' || current_byte == b'n') && text.remaining_bytes() >= 2 {
span.end += 2;
if span.as_str() == "inf" {
text.idx = span.end;
if negative {
return Ok(TomlValue::Float(-f64::INFINITY));
} else {
return Ok(TomlValue::Float(f64::INFINITY));
}
} else if span.as_str() == "nan" {
text.idx = span.end;
if negative {
return Ok(TomlValue::Float(-f64::NAN));
} else {
return Ok(TomlValue::Float(f64::NAN));
}
}
}
let mut has_underscores = false;
let mut is_float = false;
let mut is_time = false;
// Custom radix
let radix = if current_byte == b'0' {
match text.byte(span.end + 1) {
Some(b'b') => {
span.end += 1;
while let Some(byte) = text.byte(span.end + 1) {
if byte == b'0' || byte == b'1' {
span.end += 1;
} else if byte == b'_' {
has_underscores = true;
span.end += 1;
} else {
break;
}
}
Some(2)
}
Some(b'o') => {
span.end += 1;
while let Some(byte) = text.byte(span.end + 1) {
match byte {
b'0'..=b'7' => span.end += 1,
b'_' => {
has_underscores = true;
span.end += 1;
}
_ => break,
}
}
Some(8)
}
Some(b'x') => {
span.end += 1;
while let Some(byte) = text.byte(span.end + 1) {
match byte {
b'0'..=b'9' | b'A'..=b'F' | b'a'..=b'f' => span.end += 1,
b'_' => {
has_underscores = true;
span.end += 1;
}
_ => break,
}
}
Some(16)
}
_ => None,
}
} else {
None
};
if radix.is_none() {
let mut has_dash = false;
while let Some(byte) = text.byte(span.end + 1) {
match byte {
b'0'..=b'9' => {}
b'.' | b'e' | b'E' | b'+' => is_float = true,
b':' => is_time = true,
// Can be in floats (1e-4) and time (1974-12-03)
b'-' => has_dash = true,
b'_' => has_underscores = true,
_ => break,
}
span.end += 1;
}
if is_float && is_time {
return Err(Error {
start: span.start,
end: span.end,
kind: ErrorKind::InvalidNumber,
});
} else if !is_float && has_dash {
is_time = true;
}
}
if radix.is_some() {
span.start += 2;
}
text.idx = span.end;
let source = if has_underscores {
let mut string = String::with_capacity(span.len());
for char_ in span.as_str().chars() {
if char_ != '_' {
string.push(char_);
}
}
CowSpan::Modified(span, string)
} else {
CowSpan::Raw(span)
};
let span = source.span();
if is_float {
// Unfortunately, the f64 parser doesn't give detailed error information, so this is the best we can do.
if let Ok(num) = source.as_str().parse::<f64>() {
if negative {
return Ok(TomlValue::Float(-num));
} else {
return Ok(TomlValue::Float(num));
}
}
}
if is_time && !negative {
todo!("Time types")
}
match i64::from_str_radix(source.as_str(), radix.unwrap_or(10)) {
Ok(num) => {
if negative {
return Ok(TomlValue::Integer(-num));
} else {
return Ok(TomlValue::Integer(num));
}
}
Err(e) => match e.kind() {
IntErrorKind::PosOverflow => {
// i64::MIN, as a string, without the sign
if negative && source.as_str() == "9223372036854775808" {
return Ok(TomlValue::Integer(i64::MIN));
}
return Err(Error {
start: span.start,
end: span.end,
kind: ErrorKind::NumberTooLarge,
});
}
IntErrorKind::InvalidDigit => {}
IntErrorKind::Empty => {
return Err(Error {
start: span.start,
end: span.end,
kind: ErrorKind::InvalidNumber,
})
}
_ => unreachable!(),
},
}
Err(Error {
start: span.start,
end: span.find_next_whitespace_or_newline().unwrap_or(text.end()),
kind: ErrorKind::UnrecognisedValue,
})
}
/// Parses a string. Supports literal and basic strings. Handles basic string escapes
/// automatically.
///
/// Expects `text.idx` to be on the opening quote and leaves it on the last quote of the closing
/// delimiter. Both single-line and multi-line (`'''` / `"""`) forms are recognised; a newline
/// directly after a multi-line opening delimiter is dropped from the result.
///
/// # Errors
/// - `ErrorKind::UnclosedString` if no closing delimiter is found.
/// - Any error from escape processing in basic strings.
///
/// # Panics
/// Panics if there is no byte at `text.idx` or if that byte is not `'` or `"`.
pub fn parse_string<'a>(text: &mut Text<'a>) -> Result<CowSpan<'a>, Error> {
    let mut span = text.excerpt(text.idx..);

    match text.current_byte().unwrap() {
        b'\'' => {
            // The shortest multi-line string, `''''''`, has five bytes after its first quote, so
            // five remaining bytes are enough (assuming `remaining_bytes` counts the bytes after
            // `text.idx`, which is how its other callers in this file rely on it).
            let (end, offset) = if text.remaining_bytes() >= 5
                && text.excerpt(text.idx..text.idx + 3).to_str() == "'''"
            {
                // Multi-line string
                span.start += 3;
                if text.byte(span.start).unwrap() == b'\n' {
                    span.start += 1;
                }
                (
                    span.as_str().find("'''").map(|idx| {
                        // If the closing run has more than three quotes, the extra leading ones
                        // belong to the string's contents; close on the last three.
                        let mut idx = span.start + idx;
                        while text.byte(idx) == Some(b'\'') {
                            idx += 1;
                        }
                        idx - 3
                    }),
                    3,
                )
            } else {
                // Single-line string
                span.start += 1;
                (span.find(b'\''), 1)
            };

            let Some(end) = end else {
                return Err(Error {
                    start: text.idx,
                    end: span
                        .find_next_whitespace_or_newline()
                        .unwrap_or(text.end()),
                    kind: ErrorKind::UnclosedString,
                });
            };

            // `end` is the first quote of the closing delimiter.
            span.end = end - 1;
            text.idx = span.end + offset;
            Ok(CowSpan::Raw(span))
        }
        b'"' => {
            // Same length reasoning as for `'''` above.
            let multiline = text.remaining_bytes() >= 5
                && text.excerpt(text.idx..text.idx + 3).to_str() == "\"\"\"";
            let offset = if multiline { 3 } else { 1 };
            let start = span.start;

            let Some(end) = find_basic_string_end(&mut span, text, multiline) else {
                return Err(Error {
                    start: text.idx,
                    end: span
                        .find_next_whitespace_or_newline()
                        .unwrap_or(text.end()),
                    kind: ErrorKind::UnclosedString,
                });
            };

            // The search moves `span.start` around, so rebuild the span from the saved start.
            span.start = start + offset;
            span.end = end - 1;
            if multiline && text.byte(span.start).unwrap() == b'\n' {
                span.start += 1;
            }
            text.idx = span.end + offset;

            // Only allocate a new string when there is something to unescape.
            if span.find(b'\\').is_some() {
                handle_basic_string_escapes(text, span)
            } else {
                Ok(CowSpan::Raw(span))
            }
        }
        _ => unreachable!(),
    }
}
/// Finds the closing delimiter of a basic (`"`-quoted) string, skipping escaped quotes.
///
/// On entry `span.start` must be on the first byte of the opening delimiter. Returns the index
/// of the first quote of the closing delimiter, or `None` if the string is never closed.
/// `span.start` is advanced while searching, so callers must restore it afterwards.
fn find_basic_string_end(span: &mut Span<'_>, text: &Text<'_>, multiline: bool) -> Option<usize> {
    loop {
        let found = if multiline {
            // Multi-line string: skip the delimiter (or the escaped run found last time round).
            span.start += 3;
            span.as_str().find("\"\"\"").map(|idx| {
                // If the run has more than three quotes, close on the last three.
                let mut idx = span.start + idx;
                while text.byte(idx) == Some(b'"') {
                    idx += 1;
                }
                idx - 3
            })
        } else {
            // Single-line string: skip the opening quote (or the escaped quote found last time).
            span.start += 1;
            span.find(b'"')
        };
        let end = found?;

        // A quote is escaped only when preceded by an odd number of backslashes: in `\\"` the
        // backslash itself is escaped and the quote closes the string, while in `\\\"` the quote
        // is escaped. The opening delimiter is never a backslash, so this cannot run past it.
        let mut backslashes = 0;
        while backslashes < end && text.byte(end - 1 - backslashes) == Some(b'\\') {
            backslashes += 1;
        }
        if backslashes % 2 == 0 {
            return Some(end);
        }

        // Escaped quote: resume the search from it.
        span.start = end;
    }
}
/// Resolves the backslash escapes in the contents of a basic string.
///
/// `span` covers the string's contents (without its delimiters). Returns a
/// `CowSpan::Modified` holding the original span together with the unescaped text.
///
/// Supported escapes: `\b`, `\t`, `\n`, `\f`, `\r`, `\"`, `\\`, `\uXXXX`, `\UXXXXXXXX`, and a
/// backslash followed by whitespace, which swallows the whitespace run after it.
///
/// # Errors
/// - `ErrorKind::UnknownEscapeSequence` for a trailing backslash or an unsupported escape.
/// - `ErrorKind::UnknownUnicodeScalar` if a unicode escape is truncated or is not a valid scalar.
fn handle_basic_string_escapes<'a>(text: &Text<'a>, span: Span<'a>) -> Result<CowSpan<'a>, Error> {
    let mut string = String::with_capacity(span.len());
    let mut chars = span.as_str().char_indices().peekable();

    while let Some((idx, ch)) = chars.next() {
        if ch != '\\' {
            string.push(ch);
            continue;
        }

        // `char_indices` is relative to the span; errors are reported in absolute positions.
        let backslash_idx = span.start + idx;
        let Some((idx, escape)) = chars.next() else {
            return Err(Error {
                start: backslash_idx,
                end: backslash_idx,
                kind: ErrorKind::UnknownEscapeSequence,
            });
        };
        // Absolute index of the character that follows the backslash.
        let idx = span.start + idx;

        let to_push = match escape {
            'b' => '\u{0008}',
            't' => '\t',
            'n' => '\n',
            'f' => '\u{000C}',
            'r' => '\r',
            '"' => '"',
            '\\' => '\\',
            'u' => {
                if idx + 4 > text.end() {
                    return Err(Error {
                        start: idx,
                        end: idx + 4,
                        kind: ErrorKind::UnknownUnicodeScalar,
                    });
                }

                let source = text.excerpt(idx + 1..=idx + 4);
                let Some(scalar) = u32::from_str_radix(source.as_str(), 16)
                    .ok()
                    .and_then(char::from_u32)
                else {
                    return Err(Error {
                        start: idx,
                        end: idx + 5,
                        kind: ErrorKind::UnknownUnicodeScalar,
                    });
                };

                // Consume the four hex digits that were just decoded.
                chars.nth(3).unwrap();
                scalar
            }
            'U' => {
                if idx + 8 > text.end() {
                    return Err(Error {
                        start: idx,
                        end: idx + 8,
                        kind: ErrorKind::UnknownUnicodeScalar,
                    });
                }

                let source = text.excerpt(idx + 1..=idx + 8);
                let Some(scalar) = u32::from_str_radix(source.as_str(), 16)
                    .ok()
                    .and_then(char::from_u32)
                else {
                    return Err(Error {
                        start: idx,
                        end: idx + 8,
                        kind: ErrorKind::UnknownUnicodeScalar,
                    });
                };

                // Consume the eight hex digits that were just decoded.
                chars.nth(7).unwrap();
                scalar
            }
            ' ' | '\t' | '\n' | '\r' => {
                // Backslash followed by whitespace: drop the whole whitespace run.
                while let Some((_, next)) = chars.peek() {
                    let next = *next;
                    if next != ' ' && next != '\t' && next != '\n' && next != '\r' {
                        break;
                    }
                    chars.next();
                }
                continue;
            }
            _ => {
                // `idx` is already absolute here; adding `span.start` again would point the
                // error at the wrong location.
                return Err(Error {
                    start: idx,
                    end: idx + 1,
                    kind: ErrorKind::UnknownEscapeSequence,
                })
            }
        };

        string.push(to_push);
    }

    Ok(CowSpan::Modified(span, string))
}