Importing rustc-1.71.0 Bug: 288268986 Test: ./build.py --lto=thin Change-Id: Idffe995937cee91f5b480ce61ef8267475cec993

commit: cd1aefd586783f162dd848e314bd6991a5ffe033 [log] [tgz]
author: Chris Wailes <[email protected]> Thu Jul 13 13:36:21 2023 -0700
committer: Chris Wailes <[email protected]> Thu Jul 13 13:48:30 2023 -0700
tree: fc7d1a7cd34f48915fc5e4d9c89a8b94e5ddaf7c
parent: 635618df8991b8a005e435895ea0b1eee7e3faf0 [diff]
diff --git a/compiler/rustc_lexer/src/lib.rs b/compiler/rustc_lexer/src/lib.rs
index b3f4b5c..29335a8 100644
--- a/compiler/rustc_lexer/src/lib.rs
+++ b/compiler/rustc_lexer/src/lib.rs

@@ -186,12 +186,16 @@
     Str { terminated: bool },
     /// "b"abc"", "b"abc"
     ByteStr { terminated: bool },
+    /// `c"abc"`, `c"abc`
+    CStr { terminated: bool },
     /// "r"abc"", "r#"abc"#", "r####"ab"###"c"####", "r#"a". `None` indicates
     /// an invalid literal.
     RawStr { n_hashes: Option<u8> },
     /// "br"abc"", "br#"abc"#", "br####"ab"###"c"####", "br#"a". `None`
     /// indicates an invalid literal.
     RawByteStr { n_hashes: Option<u8> },
+    /// `cr"abc"`, "cr#"abc"#", `cr#"a`. `None` indicates an invalid literal.
+    RawCStr { n_hashes: Option<u8> },
 }
 
 #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
@@ -357,39 +361,11 @@
             },
 
             // Byte literal, byte string literal, raw byte string literal or identifier.
-            'b' => match (self.first(), self.second()) {
-                ('\'', _) => {
-                    self.bump();
-                    let terminated = self.single_quoted_string();
-                    let suffix_start = self.pos_within_token();
-                    if terminated {
-                        self.eat_literal_suffix();
-                    }
-                    let kind = Byte { terminated };
-                    Literal { kind, suffix_start }
-                }
-                ('"', _) => {
-                    self.bump();
-                    let terminated = self.double_quoted_string();
-                    let suffix_start = self.pos_within_token();
-                    if terminated {
-                        self.eat_literal_suffix();
-                    }
-                    let kind = ByteStr { terminated };
-                    Literal { kind, suffix_start }
-                }
-                ('r', '"') | ('r', '#') => {
-                    self.bump();
-                    let res = self.raw_double_quoted_string(2);
-                    let suffix_start = self.pos_within_token();
-                    if res.is_ok() {
-                        self.eat_literal_suffix();
-                    }
-                    let kind = RawByteStr { n_hashes: res.ok() };
-                    Literal { kind, suffix_start }
-                }
-                _ => self.ident_or_unknown_prefix(),
-            },
+            'b' => self.c_or_byte_string(
+                |terminated| ByteStr { terminated },
+                |n_hashes| RawByteStr { n_hashes },
+                Some(|terminated| Byte { terminated }),
+            ),
 
             // Identifier (this should be checked after other variant that can
             // start as identifier).
@@ -553,39 +529,84 @@
         }
     }
 
+    fn c_or_byte_string(
+        &mut self,
+        mk_kind: impl FnOnce(bool) -> LiteralKind,
+        mk_kind_raw: impl FnOnce(Option<u8>) -> LiteralKind,
+        single_quoted: Option<fn(bool) -> LiteralKind>,
+    ) -> TokenKind {
+        match (self.first(), self.second(), single_quoted) {
+            ('\'', _, Some(mk_kind)) => {
+                self.bump();
+                let terminated = self.single_quoted_string();
+                let suffix_start = self.pos_within_token();
+                if terminated {
+                    self.eat_literal_suffix();
+                }
+                let kind = mk_kind(terminated);
+                Literal { kind, suffix_start }
+            }
+            ('"', _, _) => {
+                self.bump();
+                let terminated = self.double_quoted_string();
+                let suffix_start = self.pos_within_token();
+                if terminated {
+                    self.eat_literal_suffix();
+                }
+                let kind = mk_kind(terminated);
+                Literal { kind, suffix_start }
+            }
+            ('r', '"', _) | ('r', '#', _) => {
+                self.bump();
+                let res = self.raw_double_quoted_string(2);
+                let suffix_start = self.pos_within_token();
+                if res.is_ok() {
+                    self.eat_literal_suffix();
+                }
+                let kind = mk_kind_raw(res.ok());
+                Literal { kind, suffix_start }
+            }
+            _ => self.ident_or_unknown_prefix(),
+        }
+    }
+
     fn number(&mut self, first_digit: char) -> LiteralKind {
         debug_assert!('0' <= self.prev() && self.prev() <= '9');
         let mut base = Base::Decimal;
         if first_digit == '0' {
             // Attempt to parse encoding base.
-            let has_digits = match self.first() {
+            match self.first() {
                 'b' => {
                     base = Base::Binary;
                     self.bump();
-                    self.eat_decimal_digits()
+                    if !self.eat_decimal_digits() {
+                        return Int { base, empty_int: true };
+                    }
                 }
                 'o' => {
                     base = Base::Octal;
                     self.bump();
-                    self.eat_decimal_digits()
+                    if !self.eat_decimal_digits() {
+                        return Int { base, empty_int: true };
+                    }
                 }
                 'x' => {
                     base = Base::Hexadecimal;
                     self.bump();
-                    self.eat_hexadecimal_digits()
+                    if !self.eat_hexadecimal_digits() {
+                        return Int { base, empty_int: true };
+                    }
                 }
-                // Not a base prefix.
-                '0'..='9' | '_' | '.' | 'e' | 'E' => {
+                // Not a base prefix; consume additional digits.
+                '0'..='9' | '_' => {
                     self.eat_decimal_digits();
-                    true
                 }
+
+                // Also not a base prefix; nothing more to do here.
+                '.' | 'e' | 'E' => {}
+
                 // Just a 0.
                 _ => return Int { base, empty_int: false },
-            };
-            // Base prefix was provided, but there were no digits
-            // after it, e.g. "0x".
-            if !has_digits {
-                return Int { base, empty_int: true };
             }
         } else {
             // No base prefix, parse number in the usual way.

diff --git a/compiler/rustc_lexer/src/unescape.rs b/compiler/rustc_lexer/src/unescape.rs
index bb4d912..c9ad54d 100644
--- a/compiler/rustc_lexer/src/unescape.rs
+++ b/compiler/rustc_lexer/src/unescape.rs

@@ -86,10 +86,45 @@
             let res = unescape_char_or_byte(&mut chars, mode == Mode::Byte);
             callback(0..(src.len() - chars.as_str().len()), res);
         }
-        Mode::Str | Mode::ByteStr => unescape_str_or_byte_str(src, mode == Mode::ByteStr, callback),
+        Mode::Str | Mode::ByteStr => unescape_str_common(src, mode, callback),
+
         Mode::RawStr | Mode::RawByteStr => {
             unescape_raw_str_or_raw_byte_str(src, mode == Mode::RawByteStr, callback)
         }
+        Mode::CStr | Mode::RawCStr => unreachable!(),
+    }
+}
+
+/// A unit within CStr. Must not be a nul character.
+pub enum CStrUnit {
+    Byte(u8),
+    Char(char),
+}
+
+impl From<u8> for CStrUnit {
+    fn from(value: u8) -> Self {
+        CStrUnit::Byte(value)
+    }
+}
+
+impl From<char> for CStrUnit {
+    fn from(value: char) -> Self {
+        CStrUnit::Char(value)
+    }
+}
+
+pub fn unescape_c_string<F>(src: &str, mode: Mode, callback: &mut F)
+where
+    F: FnMut(Range<usize>, Result<CStrUnit, EscapeError>),
+{
+    if mode == Mode::RawCStr {
+        unescape_raw_str_or_raw_byte_str(
+            src,
+            mode.characters_should_be_ascii(),
+            &mut |r, result| callback(r, result.map(CStrUnit::Char)),
+        );
+    } else {
+        unescape_str_common(src, mode, callback);
     }
 }
 
@@ -114,34 +149,69 @@
     ByteStr,
     RawStr,
     RawByteStr,
+    CStr,
+    RawCStr,
 }
 
 impl Mode {
     pub fn in_double_quotes(self) -> bool {
         match self {
-            Mode::Str | Mode::ByteStr | Mode::RawStr | Mode::RawByteStr => true,
+            Mode::Str
+            | Mode::ByteStr
+            | Mode::RawStr
+            | Mode::RawByteStr
+            | Mode::CStr
+            | Mode::RawCStr => true,
             Mode::Char | Mode::Byte => false,
         }
     }
 
-    pub fn is_byte(self) -> bool {
+    /// Non-byte literals should have `\xXX` escapes that are within the ASCII range.
+    pub fn ascii_escapes_should_be_ascii(self) -> bool {
+        match self {
+            Mode::Char | Mode::Str | Mode::RawStr => true,
+            Mode::Byte | Mode::ByteStr | Mode::RawByteStr | Mode::CStr | Mode::RawCStr => false,
+        }
+    }
+
+    /// Whether characters within the literal must be within the ASCII range
+    pub fn characters_should_be_ascii(self) -> bool {
         match self {
             Mode::Byte | Mode::ByteStr | Mode::RawByteStr => true,
-            Mode::Char | Mode::Str | Mode::RawStr => false,
+            Mode::Char | Mode::Str | Mode::RawStr | Mode::CStr | Mode::RawCStr => false,
+        }
+    }
+
+    /// Byte literals do not allow unicode escape.
+    pub fn is_unicode_escape_disallowed(self) -> bool {
+        match self {
+            Mode::Byte | Mode::ByteStr | Mode::RawByteStr => true,
+            Mode::Char | Mode::Str | Mode::RawStr | Mode::CStr | Mode::RawCStr => false,
+        }
+    }
+
+    pub fn prefix_noraw(self) -> &'static str {
+        match self {
+            Mode::Byte | Mode::ByteStr | Mode::RawByteStr => "b",
+            Mode::CStr | Mode::RawCStr => "c",
+            Mode::Char | Mode::Str | Mode::RawStr => "",
         }
     }
 }
 
-fn scan_escape(chars: &mut Chars<'_>, is_byte: bool) -> Result<char, EscapeError> {
+fn scan_escape<T: From<u8> + From<char>>(
+    chars: &mut Chars<'_>,
+    mode: Mode,
+) -> Result<T, EscapeError> {
     // Previous character was '\\', unescape what follows.
     let res = match chars.next().ok_or(EscapeError::LoneSlash)? {
-        '"' => '"',
-        'n' => '\n',
-        'r' => '\r',
-        't' => '\t',
-        '\\' => '\\',
-        '\'' => '\'',
-        '0' => '\0',
+        '"' => b'"',
+        'n' => b'\n',
+        'r' => b'\r',
+        't' => b'\t',
+        '\\' => b'\\',
+        '\'' => b'\'',
+        '0' => b'\0',
 
         'x' => {
             // Parse hexadecimal character code.
@@ -154,76 +224,78 @@
 
             let value = hi * 16 + lo;
 
-            // For a non-byte literal verify that it is within ASCII range.
-            if !is_byte && !is_ascii(value) {
+            if mode.ascii_escapes_should_be_ascii() && !is_ascii(value) {
                 return Err(EscapeError::OutOfRangeHexEscape);
             }
-            let value = value as u8;
 
-            value as char
+            value as u8
         }
 
-        'u' => {
-            // We've parsed '\u', now we have to parse '{..}'.
-
-            if chars.next() != Some('{') {
-                return Err(EscapeError::NoBraceInUnicodeEscape);
-            }
-
-            // First character must be a hexadecimal digit.
-            let mut n_digits = 1;
-            let mut value: u32 = match chars.next().ok_or(EscapeError::UnclosedUnicodeEscape)? {
-                '_' => return Err(EscapeError::LeadingUnderscoreUnicodeEscape),
-                '}' => return Err(EscapeError::EmptyUnicodeEscape),
-                c => c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?,
-            };
-
-            // First character is valid, now parse the rest of the number
-            // and closing brace.
-            loop {
-                match chars.next() {
-                    None => return Err(EscapeError::UnclosedUnicodeEscape),
-                    Some('_') => continue,
-                    Some('}') => {
-                        if n_digits > 6 {
-                            return Err(EscapeError::OverlongUnicodeEscape);
-                        }
-
-                        // Incorrect syntax has higher priority for error reporting
-                        // than unallowed value for a literal.
-                        if is_byte {
-                            return Err(EscapeError::UnicodeEscapeInByte);
-                        }
-
-                        break std::char::from_u32(value).ok_or_else(|| {
-                            if value > 0x10FFFF {
-                                EscapeError::OutOfRangeUnicodeEscape
-                            } else {
-                                EscapeError::LoneSurrogateUnicodeEscape
-                            }
-                        })?;
-                    }
-                    Some(c) => {
-                        let digit: u32 =
-                            c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?;
-                        n_digits += 1;
-                        if n_digits > 6 {
-                            // Stop updating value since we're sure that it's incorrect already.
-                            continue;
-                        }
-                        value = value * 16 + digit;
-                    }
-                };
-            }
-        }
+        'u' => return scan_unicode(chars, mode.is_unicode_escape_disallowed()).map(Into::into),
         _ => return Err(EscapeError::InvalidEscape),
     };
-    Ok(res)
+    Ok(res.into())
+}
+
+fn scan_unicode(
+    chars: &mut Chars<'_>,
+    is_unicode_escape_disallowed: bool,
+) -> Result<char, EscapeError> {
+    // We've parsed '\u', now we have to parse '{..}'.
+
+    if chars.next() != Some('{') {
+        return Err(EscapeError::NoBraceInUnicodeEscape);
+    }
+
+    // First character must be a hexadecimal digit.
+    let mut n_digits = 1;
+    let mut value: u32 = match chars.next().ok_or(EscapeError::UnclosedUnicodeEscape)? {
+        '_' => return Err(EscapeError::LeadingUnderscoreUnicodeEscape),
+        '}' => return Err(EscapeError::EmptyUnicodeEscape),
+        c => c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?,
+    };
+
+    // First character is valid, now parse the rest of the number
+    // and closing brace.
+    loop {
+        match chars.next() {
+            None => return Err(EscapeError::UnclosedUnicodeEscape),
+            Some('_') => continue,
+            Some('}') => {
+                if n_digits > 6 {
+                    return Err(EscapeError::OverlongUnicodeEscape);
+                }
+
+                // Incorrect syntax has higher priority for error reporting
+                // than unallowed value for a literal.
+                if is_unicode_escape_disallowed {
+                    return Err(EscapeError::UnicodeEscapeInByte);
+                }
+
+                break std::char::from_u32(value).ok_or_else(|| {
+                    if value > 0x10FFFF {
+                        EscapeError::OutOfRangeUnicodeEscape
+                    } else {
+                        EscapeError::LoneSurrogateUnicodeEscape
+                    }
+                });
+            }
+            Some(c) => {
+                let digit: u32 = c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?;
+                n_digits += 1;
+                if n_digits > 6 {
+                    // Stop updating value since we're sure that it's incorrect already.
+                    continue;
+                }
+                value = value * 16 + digit;
+            }
+        };
+    }
 }
 
 #[inline]
-fn ascii_check(c: char, is_byte: bool) -> Result<char, EscapeError> {
-    if is_byte && !c.is_ascii() {
+fn ascii_check(c: char, characters_should_be_ascii: bool) -> Result<char, EscapeError> {
+    if characters_should_be_ascii && !c.is_ascii() {
         // Byte literal can't be a non-ascii character.
         Err(EscapeError::NonAsciiCharInByte)
     } else {
@@ -234,7 +306,7 @@
 fn unescape_char_or_byte(chars: &mut Chars<'_>, is_byte: bool) -> Result<char, EscapeError> {
     let c = chars.next().ok_or(EscapeError::ZeroChars)?;
     let res = match c {
-        '\\' => scan_escape(chars, is_byte),
+        '\\' => scan_escape(chars, if is_byte { Mode::Byte } else { Mode::Char }),
         '\n' | '\t' | '\'' => Err(EscapeError::EscapeOnlyChar),
         '\r' => Err(EscapeError::BareCarriageReturn),
         _ => ascii_check(c, is_byte),
@@ -247,9 +319,9 @@
 
 /// Takes a contents of a string literal (without quotes) and produces a
 /// sequence of escaped characters or errors.
-fn unescape_str_or_byte_str<F>(src: &str, is_byte: bool, callback: &mut F)
+fn unescape_str_common<F, T: From<u8> + From<char>>(src: &str, mode: Mode, callback: &mut F)
 where
-    F: FnMut(Range<usize>, Result<char, EscapeError>),
+    F: FnMut(Range<usize>, Result<T, EscapeError>),
 {
     let mut chars = src.chars();
 
@@ -266,47 +338,49 @@
                         // if unescaped '\' character is followed by '\n'.
                         // For details see [Rust language reference]
                         // (https://doc.rust-lang.org/reference/tokens.html#string-literals).
-                        skip_ascii_whitespace(&mut chars, start, callback);
+                        skip_ascii_whitespace(&mut chars, start, &mut |range, err| {
+                            callback(range, Err(err))
+                        });
                         continue;
                     }
-                    _ => scan_escape(&mut chars, is_byte),
+                    _ => scan_escape::<T>(&mut chars, mode),
                 }
             }
-            '\n' => Ok('\n'),
-            '\t' => Ok('\t'),
+            '\n' => Ok(b'\n'.into()),
+            '\t' => Ok(b'\t'.into()),
             '"' => Err(EscapeError::EscapeOnlyChar),
             '\r' => Err(EscapeError::BareCarriageReturn),
-            _ => ascii_check(c, is_byte),
+            _ => ascii_check(c, mode.characters_should_be_ascii()).map(Into::into),
         };
         let end = src.len() - chars.as_str().len();
-        callback(start..end, res);
+        callback(start..end, res.map(Into::into));
     }
+}
 
-    fn skip_ascii_whitespace<F>(chars: &mut Chars<'_>, start: usize, callback: &mut F)
-    where
-        F: FnMut(Range<usize>, Result<char, EscapeError>),
-    {
-        let tail = chars.as_str();
-        let first_non_space = tail
-            .bytes()
-            .position(|b| b != b' ' && b != b'\t' && b != b'\n' && b != b'\r')
-            .unwrap_or(tail.len());
-        if tail[1..first_non_space].contains('\n') {
-            // The +1 accounts for the escaping slash.
-            let end = start + first_non_space + 1;
-            callback(start..end, Err(EscapeError::MultipleSkippedLinesWarning));
-        }
-        let tail = &tail[first_non_space..];
-        if let Some(c) = tail.chars().nth(0) {
-            if c.is_whitespace() {
-                // For error reporting, we would like the span to contain the character that was not
-                // skipped. The +1 is necessary to account for the leading \ that started the escape.
-                let end = start + first_non_space + c.len_utf8() + 1;
-                callback(start..end, Err(EscapeError::UnskippedWhitespaceWarning));
-            }
-        }
-        *chars = tail.chars();
+fn skip_ascii_whitespace<F>(chars: &mut Chars<'_>, start: usize, callback: &mut F)
+where
+    F: FnMut(Range<usize>, EscapeError),
+{
+    let tail = chars.as_str();
+    let first_non_space = tail
+        .bytes()
+        .position(|b| b != b' ' && b != b'\t' && b != b'\n' && b != b'\r')
+        .unwrap_or(tail.len());
+    if tail[1..first_non_space].contains('\n') {
+        // The +1 accounts for the escaping slash.
+        let end = start + first_non_space + 1;
+        callback(start..end, EscapeError::MultipleSkippedLinesWarning);
     }
+    let tail = &tail[first_non_space..];
+    if let Some(c) = tail.chars().nth(0) {
+        if c.is_whitespace() {
+            // For error reporting, we would like the span to contain the character that was not
+            // skipped. The +1 is necessary to account for the leading \ that started the escape.
+            let end = start + first_non_space + c.len_utf8() + 1;
+            callback(start..end, EscapeError::UnskippedWhitespaceWarning);
+        }
+    }
+    *chars = tail.chars();
 }
 
 /// Takes a contents of a string literal (without quotes) and produces a
commit	cd1aefd586783f162dd848e314bd6991a5ffe033	[log] [tgz]
author	Chris Wailes <[email protected]>	Thu Jul 13 13:36:21 2023 -0700
committer	Chris Wailes <[email protected]>	Thu Jul 13 13:48:30 2023 -0700
tree	fc7d1a7cd34f48915fc5e4d9c89a8b94e5ddaf7c
parent	635618df8991b8a005e435895ea0b1eee7e3faf0 [diff]