| //! A set of helper functions for unescaping Fluent unicode escape sequences. |
| //! |
| //! # Unicode |
| //! |
| //! Fluent supports UTF-8 in all FTL resources, but it also allows |
| //! unicode sequences to be escaped in [`String |
| //! Literals`](super::ast::InlineExpression::StringLiteral). |
| //! |
| //! An escape is written as `\u` followed by exactly four hexadecimal |
| //! digits, or as `\U` followed by exactly six hexadecimal digits. |
| //! |
| //! ## Example |
| //! |
| //! ``` |
| //! use fluent_syntax::unicode::unescape_unicode_to_string; |
| //! |
| //! assert_eq!( |
| //! unescape_unicode_to_string("Foo \\u5bd2 Bar"), |
| //! "Foo 寒 Bar" |
| //! ); |
| //! |
| //! assert_eq!( |
| //! unescape_unicode_to_string("Foo \\U01F68A Bar"), |
| //! "Foo 🚊 Bar" |
| //! ); |
| //! ``` |
| //! |
| //! # Other unescapes |
| //! |
| //! This also allows for a char `"` to be present inside an FTL string literal, |
| //! and for `\` itself to be escaped. |
| //! |
| //! ## Example |
| //! |
| //! ``` |
| //! use fluent_syntax::unicode::unescape_unicode_to_string; |
| //! |
| //! assert_eq!( |
| //! unescape_unicode_to_string("Foo \\\" Bar"), |
| //! "Foo \" Bar" |
| //! ); |
| //! assert_eq!( |
| //! unescape_unicode_to_string("Foo \\\\ Bar"), |
| //! "Foo \\ Bar" |
| //! ); |
| //! ``` |
| use std::borrow::Cow; |
| use std::char; |
| use std::fmt; |
| |
/// Replacement character (U+FFFD) emitted in place of any escape sequence
/// that cannot be decoded.
const UNKNOWN_CHAR: char = '\u{FFFD}';

/// Decodes the hexadecimal digits of a unicode escape into a `char`.
///
/// `s` holds the text that follows `\u` / `\U`, or `None` when the caller
/// could not slice out a digit window of the expected length.
///
/// Returns [`UNKNOWN_CHAR`] when `s` is `None`, when it is empty or contains
/// anything other than ASCII hexadecimal digits, or when the decoded value is
/// not a valid Unicode scalar value (for example a surrogate).
fn encode_unicode(s: Option<&str>) -> char {
    // `from_str_radix` on its own also accepts a leading `+`, which is not
    // part of the escape grammar, so the digits are validated explicitly.
    s.filter(|digits| digits.bytes().all(|b| b.is_ascii_hexdigit()))
        .and_then(|digits| u32::from_str_radix(digits, 16).ok())
        .and_then(char::from_u32)
        .unwrap_or(UNKNOWN_CHAR)
}
| |
| /// Unescapes to a writer without allocating. |
| /// |
| /// ## Example |
| /// |
| /// ``` |
| /// use fluent_syntax::unicode::unescape_unicode; |
| /// |
| /// let mut s = String::new(); |
| /// unescape_unicode(&mut s, "Foo \\U01F60A Bar"); |
| /// assert_eq!(s, "Foo 😊 Bar"); |
| /// ``` |
| pub fn unescape_unicode<W>(w: &mut W, input: &str) -> fmt::Result |
| where |
| W: fmt::Write, |
| { |
| let bytes = input.as_bytes(); |
| |
| let mut start = 0; |
| let mut ptr = 0; |
| |
| while let Some(b) = bytes.get(ptr) { |
| if b != &b'\\' { |
| ptr += 1; |
| continue; |
| } |
| if start != ptr { |
| w.write_str(&input[start..ptr])?; |
| } |
| |
| ptr += 1; |
| |
| let new_char = match bytes.get(ptr) { |
| Some(b'\\') => '\\', |
| Some(b'"') => '"', |
| Some(u @ b'u') | Some(u @ b'U') => { |
| let seq_start = ptr + 1; |
| let len = if u == &b'u' { 4 } else { 6 }; |
| ptr += len; |
| encode_unicode(input.get(seq_start..seq_start + len)) |
| } |
| _ => UNKNOWN_CHAR, |
| }; |
| ptr += 1; |
| w.write_char(new_char)?; |
| start = ptr; |
| } |
| if start != ptr { |
| w.write_str(&input[start..ptr])?; |
| } |
| Ok(()) |
| } |
| |
| /// Unescapes to a `Cow<str>` optionally allocating. |
| /// |
| /// ## Example |
| /// |
| /// ``` |
| /// use fluent_syntax::unicode::unescape_unicode_to_string; |
| /// |
| /// assert_eq!( |
| /// unescape_unicode_to_string("Foo \\U01F60A Bar"), |
| /// "Foo 😊 Bar" |
| /// ); |
| /// ``` |
| pub fn unescape_unicode_to_string(input: &str) -> Cow<str> { |
| let bytes = input.as_bytes(); |
| let mut result = Cow::from(input); |
| |
| let mut ptr = 0; |
| |
| while let Some(b) = bytes.get(ptr) { |
| if b != &b'\\' { |
| if let Cow::Owned(ref mut s) = result { |
| s.push(*b as char); |
| } |
| ptr += 1; |
| continue; |
| } |
| |
| if let Cow::Borrowed(_) = result { |
| result = Cow::from(&input[0..ptr]); |
| } |
| |
| ptr += 1; |
| |
| let new_char = match bytes.get(ptr) { |
| Some(b'\\') => '\\', |
| Some(b'"') => '"', |
| Some(u @ b'u') | Some(u @ b'U') => { |
| let start = ptr + 1; |
| let len = if u == &b'u' { 4 } else { 6 }; |
| ptr += len; |
| input |
| .get(start..(start + len)) |
| .map_or(UNKNOWN_CHAR, |slice| encode_unicode(Some(slice))) |
| } |
| _ => UNKNOWN_CHAR, |
| }; |
| result.to_mut().push(new_char); |
| ptr += 1; |
| } |
| result |
| } |