| use std::borrow::Cow; |
| use std::char; |
| use std::ops::RangeInclusive; |
| |
| use winnow::combinator::alt; |
| use winnow::combinator::cut_err; |
| use winnow::combinator::delimited; |
| use winnow::combinator::fail; |
| use winnow::combinator::opt; |
| use winnow::combinator::peek; |
| use winnow::combinator::preceded; |
| use winnow::combinator::repeat; |
| use winnow::combinator::success; |
| use winnow::combinator::terminated; |
| use winnow::prelude::*; |
| use winnow::stream::Stream; |
| use winnow::token::any; |
| use winnow::token::none_of; |
| use winnow::token::one_of; |
| use winnow::token::tag; |
| use winnow::token::take_while; |
| use winnow::trace::trace; |
| |
| use crate::parser::error::CustomError; |
| use crate::parser::numbers::HEXDIG; |
| use crate::parser::prelude::*; |
| use crate::parser::trivia::{from_utf8_unchecked, newline, ws, ws_newlines, NON_ASCII, WSCHAR}; |
| |
| // ;; String |
| |
| // string = ml-basic-string / basic-string / ml-literal-string / literal-string |
| pub(crate) fn string<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> { |
| trace( |
| "string", |
| alt(( |
| ml_basic_string, |
| basic_string, |
| ml_literal_string, |
| literal_string.map(Cow::Borrowed), |
| )), |
| ) |
| .parse_next(input) |
| } |
| |
| // ;; Basic String |
| |
| // basic-string = quotation-mark *basic-char quotation-mark |
| pub(crate) fn basic_string<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> { |
| trace("basic-string", |input: &mut Input<'i>| { |
| let _ = one_of(QUOTATION_MARK).parse_next(input)?; |
| |
| let mut c = Cow::Borrowed(""); |
| if let Some(ci) = opt(basic_chars).parse_next(input)? { |
| c = ci; |
| } |
| while let Some(ci) = opt(basic_chars).parse_next(input)? { |
| c.to_mut().push_str(&ci); |
| } |
| |
| let _ = cut_err(one_of(QUOTATION_MARK)) |
| .context(StrContext::Label("basic string")) |
| .parse_next(input)?; |
| |
| Ok(c) |
| }) |
| .parse_next(input) |
| } |
| |
| // quotation-mark = %x22 ; " |
/// The `"` byte (`%x22` in the grammar) that opens and closes a basic string.
pub(crate) const QUOTATION_MARK: u8 = 0x22;
| |
| // basic-char = basic-unescaped / escaped |
| fn basic_chars<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> { |
| alt(( |
| // Deviate from the official grammar by batching the unescaped chars so we build a string a |
| // chunk at a time, rather than a `char` at a time. |
| take_while(1.., BASIC_UNESCAPED) |
| .try_map(std::str::from_utf8) |
| .map(Cow::Borrowed), |
| escaped.map(|c| Cow::Owned(String::from(c))), |
| )) |
| .parse_next(input) |
| } |
| |
| // basic-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii |
| pub(crate) const BASIC_UNESCAPED: ( |
| (u8, u8), |
| u8, |
| RangeInclusive<u8>, |
| RangeInclusive<u8>, |
| RangeInclusive<u8>, |
| ) = (WSCHAR, 0x21, 0x23..=0x5B, 0x5D..=0x7E, NON_ASCII); |
| |
| // escaped = escape escape-seq-char |
| fn escaped(input: &mut Input<'_>) -> PResult<char> { |
| preceded(ESCAPE, escape_seq_char).parse_next(input) |
| } |
| |
| // escape = %x5C ; \ |
/// The `\` byte (`%x5C` in the grammar) that introduces an escape sequence.
pub(crate) const ESCAPE: u8 = 0x5C;
| |
| // escape-seq-char = %x22 ; " quotation mark U+0022 |
| // escape-seq-char =/ %x5C ; \ reverse solidus U+005C |
| // escape-seq-char =/ %x62 ; b backspace U+0008 |
| // escape-seq-char =/ %x66 ; f form feed U+000C |
| // escape-seq-char =/ %x6E ; n line feed U+000A |
| // escape-seq-char =/ %x72 ; r carriage return U+000D |
| // escape-seq-char =/ %x74 ; t tab U+0009 |
| // escape-seq-char =/ %x75 4HEXDIG ; uXXXX U+XXXX |
| // escape-seq-char =/ %x55 8HEXDIG ; UXXXXXXXX U+XXXXXXXX |
| fn escape_seq_char(input: &mut Input<'_>) -> PResult<char> { |
| dispatch! {any; |
| b'b' => success('\u{8}'), |
| b'f' => success('\u{c}'), |
| b'n' => success('\n'), |
| b'r' => success('\r'), |
| b't' => success('\t'), |
| b'u' => cut_err(hexescape::<4>).context(StrContext::Label("unicode 4-digit hex code")), |
| b'U' => cut_err(hexescape::<8>).context(StrContext::Label("unicode 8-digit hex code")), |
| b'\\' => success('\\'), |
| b'"' => success('"'), |
| _ => { |
| cut_err(fail::<_, char, _>) |
| .context(StrContext::Label("escape sequence")) |
| .context(StrContext::Expected(StrContextValue::CharLiteral('b'))) |
| .context(StrContext::Expected(StrContextValue::CharLiteral('f'))) |
| .context(StrContext::Expected(StrContextValue::CharLiteral('n'))) |
| .context(StrContext::Expected(StrContextValue::CharLiteral('r'))) |
| .context(StrContext::Expected(StrContextValue::CharLiteral('t'))) |
| .context(StrContext::Expected(StrContextValue::CharLiteral('u'))) |
| .context(StrContext::Expected(StrContextValue::CharLiteral('U'))) |
| .context(StrContext::Expected(StrContextValue::CharLiteral('\\'))) |
| .context(StrContext::Expected(StrContextValue::CharLiteral('"'))) |
| } |
| } |
| .parse_next(input) |
| } |
| |
| pub(crate) fn hexescape<const N: usize>(input: &mut Input<'_>) -> PResult<char> { |
| take_while(0..=N, HEXDIG) |
| .verify(|b: &[u8]| b.len() == N) |
| .map(|b: &[u8]| unsafe { from_utf8_unchecked(b, "`is_ascii_digit` filters out on-ASCII") }) |
| .verify_map(|s| u32::from_str_radix(s, 16).ok()) |
| .try_map(|h| char::from_u32(h).ok_or(CustomError::OutOfRange)) |
| .parse_next(input) |
| } |
| |
| // ;; Multiline Basic String |
| |
| // ml-basic-string = ml-basic-string-delim [ newline ] ml-basic-body |
| // ml-basic-string-delim |
| fn ml_basic_string<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> { |
| trace( |
| "ml-basic-string", |
| delimited( |
| ML_BASIC_STRING_DELIM, |
| preceded(opt(newline), cut_err(ml_basic_body)), |
| cut_err(ML_BASIC_STRING_DELIM), |
| ) |
| .context(StrContext::Label("multiline basic string")), |
| ) |
| .parse_next(input) |
| } |
| |
| // ml-basic-string-delim = 3quotation-mark |
/// Three consecutive `"` bytes: the opening and closing delimiter of a multiline basic string.
pub(crate) const ML_BASIC_STRING_DELIM: &[u8] = &[b'"'; 3];
| |
| // ml-basic-body = *mlb-content *( mlb-quotes 1*mlb-content ) [ mlb-quotes ] |
| fn ml_basic_body<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> { |
| let mut c = Cow::Borrowed(""); |
| if let Some(ci) = opt(mlb_content).parse_next(input)? { |
| c = ci; |
| } |
| while let Some(ci) = opt(mlb_content).parse_next(input)? { |
| c.to_mut().push_str(&ci); |
| } |
| |
| while let Some(qi) = opt(mlb_quotes(none_of(b'\"').value(()))).parse_next(input)? { |
| if let Some(ci) = opt(mlb_content).parse_next(input)? { |
| c.to_mut().push_str(qi); |
| c.to_mut().push_str(&ci); |
| while let Some(ci) = opt(mlb_content).parse_next(input)? { |
| c.to_mut().push_str(&ci); |
| } |
| } else { |
| break; |
| } |
| } |
| |
| if let Some(qi) = opt(mlb_quotes(tag(ML_BASIC_STRING_DELIM).value(()))).parse_next(input)? { |
| c.to_mut().push_str(qi); |
| } |
| |
| Ok(c) |
| } |
| |
| // mlb-content = mlb-char / newline / mlb-escaped-nl |
| // mlb-char = mlb-unescaped / escaped |
| fn mlb_content<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> { |
| alt(( |
| // Deviate from the official grammar by batching the unescaped chars so we build a string a |
| // chunk at a time, rather than a `char` at a time. |
| take_while(1.., MLB_UNESCAPED) |
| .try_map(std::str::from_utf8) |
| .map(Cow::Borrowed), |
| // Order changed fromg grammar so `escaped` can more easily `cut_err` on bad escape sequences |
| mlb_escaped_nl.map(|_| Cow::Borrowed("")), |
| escaped.map(|c| Cow::Owned(String::from(c))), |
| newline.map(|_| Cow::Borrowed("\n")), |
| )) |
| .parse_next(input) |
| } |
| |
| // mlb-quotes = 1*2quotation-mark |
| fn mlb_quotes<'i>( |
| mut term: impl winnow::Parser<Input<'i>, (), ContextError>, |
| ) -> impl Parser<Input<'i>, &'i str, ContextError> { |
| move |input: &mut Input<'i>| { |
| let start = input.checkpoint(); |
| let res = terminated(b"\"\"", peek(term.by_ref())) |
| .map(|b| unsafe { from_utf8_unchecked(b, "`bytes` out non-ASCII") }) |
| .parse_next(input); |
| |
| match res { |
| Err(winnow::error::ErrMode::Backtrack(_)) => { |
| input.reset(start); |
| terminated(b"\"", peek(term.by_ref())) |
| .map(|b| unsafe { from_utf8_unchecked(b, "`bytes` out non-ASCII") }) |
| .parse_next(input) |
| } |
| res => res, |
| } |
| } |
| } |
| |
| // mlb-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii |
| pub(crate) const MLB_UNESCAPED: ( |
| (u8, u8), |
| u8, |
| RangeInclusive<u8>, |
| RangeInclusive<u8>, |
| RangeInclusive<u8>, |
| ) = (WSCHAR, 0x21, 0x23..=0x5B, 0x5D..=0x7E, NON_ASCII); |
| |
| // mlb-escaped-nl = escape ws newline *( wschar / newline ) |
| // When the last non-whitespace character on a line is a \, |
| // it will be trimmed along with all whitespace |
| // (including newlines) up to the next non-whitespace |
| // character or closing delimiter. |
| fn mlb_escaped_nl(input: &mut Input<'_>) -> PResult<()> { |
| repeat(1.., (ESCAPE, ws, ws_newlines)) |
| .map(|()| ()) |
| .value(()) |
| .parse_next(input) |
| } |
| |
| // ;; Literal String |
| |
| // literal-string = apostrophe *literal-char apostrophe |
| pub(crate) fn literal_string<'i>(input: &mut Input<'i>) -> PResult<&'i str> { |
| trace( |
| "literal-string", |
| delimited( |
| APOSTROPHE, |
| cut_err(take_while(0.., LITERAL_CHAR)), |
| cut_err(APOSTROPHE), |
| ) |
| .try_map(std::str::from_utf8) |
| .context(StrContext::Label("literal string")), |
| ) |
| .parse_next(input) |
| } |
| |
| // apostrophe = %x27 ; ' apostrophe |
/// The `'` byte (`%x27` in the grammar) that opens and closes a literal string.
pub(crate) const APOSTROPHE: u8 = 0x27;
| |
| // literal-char = %x09 / %x20-26 / %x28-7E / non-ascii |
| pub(crate) const LITERAL_CHAR: ( |
| u8, |
| RangeInclusive<u8>, |
| RangeInclusive<u8>, |
| RangeInclusive<u8>, |
| ) = (0x9, 0x20..=0x26, 0x28..=0x7E, NON_ASCII); |
| |
| // ;; Multiline Literal String |
| |
| // ml-literal-string = ml-literal-string-delim [ newline ] ml-literal-body |
| // ml-literal-string-delim |
| fn ml_literal_string<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> { |
| trace( |
| "ml-literal-string", |
| delimited( |
| (ML_LITERAL_STRING_DELIM, opt(newline)), |
| cut_err(ml_literal_body.map(|t| { |
| if t.contains("\r\n") { |
| Cow::Owned(t.replace("\r\n", "\n")) |
| } else { |
| Cow::Borrowed(t) |
| } |
| })), |
| cut_err(ML_LITERAL_STRING_DELIM), |
| ) |
| .context(StrContext::Label("multiline literal string")), |
| ) |
| .parse_next(input) |
| } |
| |
| // ml-literal-string-delim = 3apostrophe |
/// Three consecutive `'` bytes: the opening and closing delimiter of a multiline literal
/// string.
pub(crate) const ML_LITERAL_STRING_DELIM: &[u8] = &[b'\''; 3];
| |
| // ml-literal-body = *mll-content *( mll-quotes 1*mll-content ) [ mll-quotes ] |
| fn ml_literal_body<'i>(input: &mut Input<'i>) -> PResult<&'i str> { |
| ( |
| repeat(0.., mll_content).map(|()| ()), |
| repeat( |
| 0.., |
| ( |
| mll_quotes(none_of(APOSTROPHE).value(())), |
| repeat(1.., mll_content).map(|()| ()), |
| ), |
| ) |
| .map(|()| ()), |
| opt(mll_quotes(tag(ML_LITERAL_STRING_DELIM).value(()))), |
| ) |
| .recognize() |
| .try_map(std::str::from_utf8) |
| .parse_next(input) |
| } |
| |
| // mll-content = mll-char / newline |
| fn mll_content(input: &mut Input<'_>) -> PResult<u8> { |
| alt((one_of(MLL_CHAR), newline)).parse_next(input) |
| } |
| |
| // mll-char = %x09 / %x20-26 / %x28-7E / non-ascii |
| const MLL_CHAR: ( |
| u8, |
| RangeInclusive<u8>, |
| RangeInclusive<u8>, |
| RangeInclusive<u8>, |
| ) = (0x9, 0x20..=0x26, 0x28..=0x7E, NON_ASCII); |
| |
| // mll-quotes = 1*2apostrophe |
| fn mll_quotes<'i>( |
| mut term: impl winnow::Parser<Input<'i>, (), ContextError>, |
| ) -> impl Parser<Input<'i>, &'i str, ContextError> { |
| move |input: &mut Input<'i>| { |
| let start = input.checkpoint(); |
| let res = terminated(b"''", peek(term.by_ref())) |
| .map(|b| unsafe { from_utf8_unchecked(b, "`bytes` out non-ASCII") }) |
| .parse_next(input); |
| |
| match res { |
| Err(winnow::error::ErrMode::Backtrack(_)) => { |
| input.reset(start); |
| terminated(b"'", peek(term.by_ref())) |
| .map(|b| unsafe { from_utf8_unchecked(b, "`bytes` out non-ASCII") }) |
| .parse_next(input) |
| } |
| res => res, |
| } |
| } |
| } |
| |
#[cfg(test)]
#[cfg(feature = "parse")]
#[cfg(feature = "display")]
mod test {
    use super::*;

    /// Runs the top-level `string` parser over `input` and asserts that it yields `expected`.
    fn assert_parses_to(input: &str, expected: &str) {
        let parsed = string.parse(new_input(input));
        assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
    }

    /// Escapes, including 4- and 8-digit unicode escapes, are decoded in a basic string.
    #[test]
    fn basic_string() {
        let input =
            r#""I'm a string. \"You can quote me\". Name\tJos\u00E9\nLocation\tSF. \U0002070E""#;
        let expected = "I\'m a string. \"You can quote me\". Name\tJosé\nLocation\tSF. \u{2070E}";
        assert_parses_to(input, expected);
    }

    /// Multiline basic strings: leading newline trimming, embedded quotes, and inputs that
    /// must be rejected.
    #[test]
    fn ml_basic_string() {
        let cases = [
            (
                r#""""
Roses are red
Violets are blue""""#,
                r#"Roses are red
Violets are blue"#,
            ),
            (r#"""" \""" """"#, " \"\"\" "),
            (r#"""" \\""""#, " \\"),
        ];
        for (input, expected) in cases {
            assert_parses_to(input, expected);
        }

        let invalid_cases = [r#"""" """#, r#"""" \""""#];
        for input in invalid_cases {
            assert!(string.parse(new_input(input)).is_err());
        }
    }

    /// A trailing backslash removes the line ending and the whitespace that follows it.
    #[test]
    fn ml_basic_string_escape_ws() {
        let expected = "The quick brown fox jumps over the lazy dog.";
        let inputs = [
            r#""""
The quick brown \


fox jumps over \
the lazy dog.""""#,
            r#""""\
The quick brown \
fox jumps over \
the lazy dog.\
""""#,
        ];
        for input in inputs {
            assert_parses_to(input, expected);
        }

        let empties = [
            r#""""\
""""#,
            r#""""
\
\
""""#,
        ];
        for input in empties {
            assert_parses_to(input, "");
        }
    }

    /// Literal strings are returned verbatim, without the surrounding apostrophes.
    #[test]
    fn literal_string() {
        let inputs = [
            r"'C:\Users\nodejs\templates'",
            r"'\\ServerX\admin$\system32\'",
            r#"'Tom "Dubs" Preston-Werner'"#,
            r"'<\i\c*\s*>'",
        ];
        for input in inputs {
            assert_parses_to(input, &input[1..input.len() - 1]);
        }
    }

    /// Multiline literal strings are returned verbatim apart from the delimiters and the first
    /// newline after the opening delimiter.
    #[test]
    fn ml_literal_string() {
        let inputs = [
            r"'''I [dw]on't need \d{2} apples'''",
            r#"''''one_quote''''"#,
        ];
        for input in inputs {
            assert_parses_to(input, &input[3..input.len() - 3]);
        }

        let input = r#"'''
The first newline is
trimmed in raw strings.
All other whitespace
is preserved.
'''"#;
        // Skip the three apostrophes plus the newline that follows them.
        assert_parses_to(input, &input[4..input.len() - 3]);
    }
}