| use std::borrow::Cow; |
| use std::hash::Hash; |
| use std::ops::Range; |
| |
| /// Reference to a [`DiffableStr`]. |
| /// |
| /// This type exists because while the library only really provides ways to |
| /// work with `&str` and `&[u8]` there are types that deref into those string |
| /// slices such as `String` and `Vec<u8>`. |
| /// |
| /// This trait is used in the library whenever it's nice to be able to pass |
| /// strings of different types in. |
| /// |
| /// Requires the `text` feature. |
| pub trait DiffableStrRef { |
| /// The type of the resolved [`DiffableStr`]. |
| type Output: DiffableStr + ?Sized; |
| |
| /// Resolves the reference. |
| fn as_diffable_str(&self) -> &Self::Output; |
| } |
| |
| impl<T: DiffableStr + ?Sized> DiffableStrRef for T { |
| type Output = T; |
| |
| fn as_diffable_str(&self) -> &T { |
| self |
| } |
| } |
| |
| impl DiffableStrRef for String { |
| type Output = str; |
| |
| fn as_diffable_str(&self) -> &str { |
| self.as_str() |
| } |
| } |
| |
| impl<'a, T: DiffableStr + ?Sized> DiffableStrRef for Cow<'a, T> { |
| type Output = T; |
| |
| fn as_diffable_str(&self) -> &T { |
| self |
| } |
| } |
| |
/// The string types the text diffing functionality can operate on.
///
/// Which types are available depends on how the crate is compiled.  `str` is
/// always supported; with the `bytes` feature enabled `[u8]` slices can be
/// used as well, as long as they are ASCII compatible.
///
/// Requires the `text` feature.
pub trait DiffableStr: Hash + PartialEq + PartialOrd + Ord + Eq + ToOwned {
    /// Splits the value into lines, keeping the line terminator attached to
    /// the line it ends.
    fn tokenize_lines(&self) -> Vec<&Self>;

    /// Splits the value into lines and newlines, with the newline characters
    /// returned as tokens of their own instead of being attached to a line.
    fn tokenize_lines_and_newlines(&self) -> Vec<&Self>;

    /// Splits the value into words.
    fn tokenize_words(&self) -> Vec<&Self>;

    /// Splits the value into individual characters.
    fn tokenize_chars(&self) -> Vec<&Self>;

    /// Splits the value into unicode words.
    #[cfg(feature = "unicode")]
    fn tokenize_unicode_words(&self) -> Vec<&Self>;

    /// Splits the value into unicode graphemes.
    #[cfg(feature = "unicode")]
    fn tokenize_graphemes(&self) -> Vec<&Self>;

    /// Returns the value as a `&str` if it can be viewed as one, or `None`
    /// otherwise.
    fn as_str(&self) -> Option<&str>;

    /// Decodes the value into a string; the conversion is potentially lossy.
    fn to_string_lossy(&self) -> Cow<'_, str>;

    /// Reports whether the value ends in a newline.
    fn ends_with_newline(&self) -> bool;

    /// Returns the length of the value.
    fn len(&self) -> usize;

    /// Returns the part of the value covered by `rng`.
    fn slice(&self, rng: Range<usize>) -> &Self;

    /// Returns the value as a slice of raw bytes.
    fn as_bytes(&self) -> &[u8];

    /// Reports whether the value is empty, i.e. whether [`len`](Self::len)
    /// is zero.
    fn is_empty(&self) -> bool {
        self.len() == 0
    }
}
| |
| impl DiffableStr for str { |
| fn tokenize_lines(&self) -> Vec<&Self> { |
| let mut iter = self.char_indices().peekable(); |
| let mut last_pos = 0; |
| let mut lines = vec![]; |
| |
| while let Some((idx, c)) = iter.next() { |
| if c == '\r' { |
| if iter.peek().map_or(false, |x| x.1 == '\n') { |
| lines.push(&self[last_pos..=idx + 1]); |
| iter.next(); |
| last_pos = idx + 2; |
| } else { |
| lines.push(&self[last_pos..=idx]); |
| last_pos = idx + 1; |
| } |
| } else if c == '\n' { |
| lines.push(&self[last_pos..=idx]); |
| last_pos = idx + 1; |
| } |
| } |
| |
| if last_pos < self.len() { |
| lines.push(&self[last_pos..]); |
| } |
| |
| lines |
| } |
| |
| fn tokenize_lines_and_newlines(&self) -> Vec<&Self> { |
| let mut rv = vec![]; |
| let mut iter = self.char_indices().peekable(); |
| |
| while let Some((idx, c)) = iter.next() { |
| let is_newline = c == '\r' || c == '\n'; |
| let start = idx; |
| let mut end = idx + c.len_utf8(); |
| while let Some(&(_, next_char)) = iter.peek() { |
| if (next_char == '\r' || next_char == '\n') != is_newline { |
| break; |
| } |
| iter.next(); |
| end += next_char.len_utf8(); |
| } |
| rv.push(&self[start..end]); |
| } |
| |
| rv |
| } |
| |
| fn tokenize_words(&self) -> Vec<&Self> { |
| let mut iter = self.char_indices().peekable(); |
| let mut rv = vec![]; |
| |
| while let Some((idx, c)) = iter.next() { |
| let is_whitespace = c.is_whitespace(); |
| let start = idx; |
| let mut end = idx + c.len_utf8(); |
| while let Some(&(_, next_char)) = iter.peek() { |
| if next_char.is_whitespace() != is_whitespace { |
| break; |
| } |
| iter.next(); |
| end += next_char.len_utf8(); |
| } |
| rv.push(&self[start..end]); |
| } |
| |
| rv |
| } |
| |
| fn tokenize_chars(&self) -> Vec<&Self> { |
| self.char_indices() |
| .map(move |(i, c)| &self[i..i + c.len_utf8()]) |
| .collect() |
| } |
| |
| #[cfg(feature = "unicode")] |
| fn tokenize_unicode_words(&self) -> Vec<&Self> { |
| unicode_segmentation::UnicodeSegmentation::split_word_bounds(self).collect() |
| } |
| |
| #[cfg(feature = "unicode")] |
| fn tokenize_graphemes(&self) -> Vec<&Self> { |
| unicode_segmentation::UnicodeSegmentation::graphemes(self, true).collect() |
| } |
| |
| fn as_str(&self) -> Option<&str> { |
| Some(self) |
| } |
| |
| fn to_string_lossy(&self) -> Cow<'_, str> { |
| Cow::Borrowed(self) |
| } |
| |
| fn ends_with_newline(&self) -> bool { |
| self.ends_with(&['\r', '\n'][..]) |
| } |
| |
| fn len(&self) -> usize { |
| str::len(self) |
| } |
| |
| fn slice(&self, rng: Range<usize>) -> &Self { |
| &self[rng] |
| } |
| |
| fn as_bytes(&self) -> &[u8] { |
| str::as_bytes(self) |
| } |
| } |
| |
#[cfg(feature = "bytes")]
mod bytes_support {
    use super::*;

    use bstr::ByteSlice;

    /// Byte vectors resolve to their `[u8]` contents.
    impl DiffableStrRef for Vec<u8> {
        type Output = [u8];

        fn as_diffable_str(&self) -> &Self::Output {
            Vec::as_slice(self)
        }
    }

    /// Splits `bytes` into maximal runs of consecutive characters for which
    /// `in_class` gives the same answer.
    fn split_byte_runs<F>(bytes: &[u8], in_class: F) -> Vec<&[u8]>
    where
        F: Fn(char) -> bool,
    {
        let mut runs = Vec::new();
        // Start offset, end offset and character class of the run that is
        // currently being collected; `None` until the first character.
        let mut current: Option<(usize, usize, bool)> = None;

        for (start, end, ch) in bytes.char_indices() {
            let class = in_class(ch);
            current = match current {
                // Same class: the run simply grows to the end of this char.
                Some((run_start, _, run_class)) if run_class == class => {
                    Some((run_start, end, class))
                }
                // Class changed: emit the finished run and start a new one.
                Some((run_start, run_end, _)) => {
                    runs.push(&bytes[run_start..run_end]);
                    Some((start, end, class))
                }
                None => Some((start, end, class)),
            };
        }

        if let Some((run_start, run_end, _)) = current {
            runs.push(&bytes[run_start..run_end]);
        }

        runs
    }

    /// Allows viewing ASCII compatible byte slices as strings.
    ///
    /// Requires the `bytes` feature.
    impl DiffableStr for [u8] {
        /// Splits into lines, each one keeping its terminator (`\n`, `\r` or
        /// `\r\n`).  A trailing piece without terminator is returned as well.
        fn tokenize_lines(&self) -> Vec<&Self> {
            let mut lines = Vec::new();
            let mut line_start = 0;
            let mut chars = self.char_indices().peekable();

            while let Some((_, char_end, ch)) = chars.next() {
                let line_end = match ch {
                    '\r' => {
                        let followed_by_lf =
                            chars.peek().map_or(false, |&(_, _, next)| next == '\n');
                        if followed_by_lf {
                            // Swallow the `\n` so that `\r\n` stays together.
                            chars.next();
                            char_end + 1
                        } else {
                            char_end
                        }
                    }
                    '\n' => char_end,
                    _ => continue,
                };
                lines.push(&self[line_start..line_end]);
                line_start = line_end;
            }

            if line_start < self.len() {
                lines.push(&self[line_start..]);
            }

            lines
        }

        /// Splits into alternating runs of newline characters (`\r`, `\n`)
        /// and runs of everything else.
        fn tokenize_lines_and_newlines(&self) -> Vec<&Self> {
            split_byte_runs(self, |ch| ch == '\r' || ch == '\n')
        }

        /// Splits into alternating runs of whitespace and non-whitespace.
        fn tokenize_words(&self) -> Vec<&Self> {
            split_byte_runs(self, char::is_whitespace)
        }

        #[cfg(feature = "unicode")]
        fn tokenize_unicode_words(&self) -> Vec<&Self> {
            self.words_with_breaks().map(str::as_bytes).collect()
        }

        #[cfg(feature = "unicode")]
        fn tokenize_graphemes(&self) -> Vec<&Self> {
            self.graphemes().map(str::as_bytes).collect()
        }

        /// Returns one slice per decoded character.
        fn tokenize_chars(&self) -> Vec<&Self> {
            let mut chars = Vec::new();
            for (start, end, _) in self.char_indices() {
                chars.push(&self[start..end]);
            }
            chars
        }

        /// Returns the bytes as `&str` if they are valid UTF-8.
        fn as_str(&self) -> Option<&str> {
            match std::str::from_utf8(self) {
                Ok(text) => Some(text),
                Err(_) => None,
            }
        }

        fn to_string_lossy(&self) -> Cow<'_, str> {
            String::from_utf8_lossy(self)
        }

        /// Reports whether the last byte is `\r` or `\n`.
        fn ends_with_newline(&self) -> bool {
            self.last_byte()
                .map_or(false, |byte| byte == b'\r' || byte == b'\n')
        }

        fn len(&self) -> usize {
            // Spelled as a path so that it is obvious the inherent method is
            // called and not this trait method.
            <[u8]>::len(self)
        }

        /// Returns the sub-slice covered by `rng`.
        fn slice(&self, rng: Range<usize>) -> &Self {
            let Range { start, end } = rng;
            &self[start..end]
        }

        fn as_bytes(&self) -> &[u8] {
            self
        }
    }
}
| |
#[test]
fn test_split_lines() {
    // All three terminator styles plus a final line without terminator.
    let mixed = DiffableStr::tokenize_lines("first\nsecond\rthird\r\nfourth\nlast");
    assert_eq!(
        mixed,
        ["first\n", "second\r", "third\r\n", "fourth\n", "last"]
    );

    // Consecutive newlines each form a line of their own.
    let blank_lines = DiffableStr::tokenize_lines("\n\n");
    assert_eq!(blank_lines, ["\n", "\n"]);

    let single = DiffableStr::tokenize_lines("\n");
    assert_eq!(single, ["\n"]);

    // Empty input produces no lines at all.
    let none = DiffableStr::tokenize_lines("");
    assert_eq!(none.len(), 0);
}
| |
#[test]
fn test_split_words() {
    // Whitespace runs (including newlines) are tokens of their own.
    let words = DiffableStr::tokenize_words("foo bar baz\n\n aha");
    let expected = vec!["foo", " ", "bar", " ", "baz", "\n\n ", "aha"];
    assert_eq!(words, expected);
}
| |
#[test]
fn test_split_chars() {
    // The snowflake is followed by a variation selector, which is a
    // separate `char` and therefore a separate token.
    let chars = DiffableStr::tokenize_chars("abcfö❄️");
    assert_eq!(chars, ["a", "b", "c", "f", "ö", "❄", "\u{fe0f}"]);
}
| |
#[test]
#[cfg(feature = "unicode")]
fn test_split_graphemes() {
    // Unlike the char tokenizer, the snowflake and its variation selector
    // stay together as one grapheme.
    let graphemes = DiffableStr::tokenize_graphemes("abcfö❄️");
    assert_eq!(graphemes, ["a", "b", "c", "f", "ö", "❄️"]);
}
| |
#[test]
#[cfg(feature = "bytes")]
fn test_split_lines_bytes() {
    // All three terminator styles plus a final line without terminator.
    let mixed = DiffableStr::tokenize_lines(&b"first\nsecond\rthird\r\nfourth\nlast"[..]);
    let expected: [&[u8]; 5] = [
        b"first\n",
        b"second\r",
        b"third\r\n",
        b"fourth\n",
        b"last",
    ];
    assert_eq!(mixed, expected);

    // Consecutive newlines each form a line of their own.
    let blank_lines = DiffableStr::tokenize_lines(&b"\n\n"[..]);
    assert_eq!(blank_lines, [&b"\n"[..], &b"\n"[..]]);

    let single = DiffableStr::tokenize_lines(&b"\n"[..]);
    assert_eq!(single, [&b"\n"[..]]);

    // Empty input produces no lines at all.
    let none = DiffableStr::tokenize_lines(&b""[..]);
    assert_eq!(none.len(), 0);
}
| |
#[test]
#[cfg(feature = "bytes")]
fn test_split_words_bytes() {
    // Whitespace runs (including newlines) are tokens of their own.
    let words = DiffableStr::tokenize_words(&b"foo bar baz\n\n aha"[..]);
    let expected: [&[u8]; 7] = [b"foo", b" ", b"bar", b" ", b"baz", b"\n\n ", b"aha"];
    assert_eq!(words, expected);
}
| |
#[test]
#[cfg(feature = "bytes")]
fn test_split_chars_bytes() {
    // The snowflake is followed by a variation selector, which is a
    // separate character and therefore a separate token.
    let chars = DiffableStr::tokenize_chars("abcfö❄️".as_bytes());
    let expected: Vec<&[u8]> = ["a", "b", "c", "f", "ö", "❄", "\u{fe0f}"]
        .iter()
        .map(|piece| piece.as_bytes())
        .collect();
    assert_eq!(chars, expected);
}
| |
#[test]
#[cfg(all(feature = "bytes", feature = "unicode"))]
fn test_split_graphemes_bytes() {
    // The snowflake and its variation selector stay together as one
    // grapheme.
    let graphemes = DiffableStr::tokenize_graphemes("abcfö❄️".as_bytes());
    let expected: Vec<&[u8]> = ["a", "b", "c", "f", "ö", "❄️"]
        .iter()
        .map(|piece| piece.as_bytes())
        .collect();
    assert_eq!(graphemes, expected);
}