| //! Functionality for finding words. |
| //! |
| //! In order to wrap text, we need to know where the legal break |
| //! points are, i.e., where the words of the text are. This means that |
| //! we need to define what a "word" is. |
| //! |
//! A simple approach is to simply split the text on whitespace, but
//! this does not work for East-Asian languages such as Chinese or
//! Japanese where there are no spaces between words. Breaking a long
//! sequence of emojis is another example where line breaks might be
//! wanted even if there is no whitespace to be found.
| //! |
//! The [`WordSeparator`] trait is responsible for determining where
//! the words are in a line of text. Please refer to the trait and
//! the structs which implement it for more information.
| |
| #[cfg(feature = "unicode-linebreak")] |
| use crate::core::skip_ansi_escape_sequence; |
| use crate::core::Word; |
| |
/// Describes where words occur in a line of text.
///
/// The simplest approach is to say that words are separated by one or
/// more ASCII spaces (`' '`). This works for Western languages
/// without emojis. A more complex approach is to use the Unicode line
/// breaking algorithm, which finds break points in non-ASCII text.
///
/// The line breaks occur between words, please see
/// [`WordSplitter`](crate::WordSplitter) for options of how to handle
/// hyphenation of individual words.
///
/// # Examples
///
/// ```
/// use textwrap::core::Word;
/// use textwrap::WordSeparator::AsciiSpace;
///
/// let words = AsciiSpace.find_words("Hello World!").collect::<Vec<_>>();
/// assert_eq!(words, vec![Word::from("Hello "), Word::from("World!")]);
/// ```
#[derive(Clone, Copy)]
pub enum WordSeparator {
    /// Find words by splitting on runs of `' '` characters.
    ///
    /// # Examples
    ///
    /// ```
    /// use textwrap::core::Word;
    /// use textwrap::WordSeparator::AsciiSpace;
    ///
    /// let words = AsciiSpace.find_words("Hello World!").collect::<Vec<_>>();
    /// assert_eq!(words, vec![Word::from("Hello "),
    ///                        Word::from("World!")]);
    /// ```
    AsciiSpace,

    /// Split `line` into words using Unicode break properties.
    ///
    /// This word separator uses the Unicode line breaking algorithm
    /// described in [Unicode Standard Annex
    /// #14](https://www.unicode.org/reports/tr14/) to find legal places
    /// to break lines. There is a small difference in that the U+002D
    /// (Hyphen-Minus) and U+00AD (Soft Hyphen) don't create a line break:
    /// to allow a line break at a hyphen, use
    /// [`WordSplitter::HyphenSplitter`](crate::WordSplitter::HyphenSplitter).
    /// Soft hyphens are not currently supported.
    ///
    /// # Examples
    ///
    /// Unlike [`WordSeparator::AsciiSpace`], the Unicode line
    /// breaking algorithm will find line break opportunities between
    /// some characters with no intervening whitespace:
    ///
    /// ```
    /// #[cfg(feature = "unicode-linebreak")] {
    /// use textwrap::core::Word;
    /// use textwrap::WordSeparator::UnicodeBreakProperties;
    ///
    /// assert_eq!(UnicodeBreakProperties.find_words("Emojis: 😂😍").collect::<Vec<_>>(),
    ///            vec![Word::from("Emojis: "),
    ///                 Word::from("😂"),
    ///                 Word::from("😍")]);
    ///
    /// assert_eq!(UnicodeBreakProperties.find_words("CJK: 你好").collect::<Vec<_>>(),
    ///            vec![Word::from("CJK: "),
    ///                 Word::from("你"),
    ///                 Word::from("好")]);
    /// }
    /// ```
    ///
    /// A U+2060 (Word Joiner) character can be inserted if you want to
    /// manually override the defaults and keep the characters together:
    ///
    /// ```
    /// #[cfg(feature = "unicode-linebreak")] {
    /// use textwrap::core::Word;
    /// use textwrap::WordSeparator::UnicodeBreakProperties;
    ///
    /// assert_eq!(UnicodeBreakProperties.find_words("Emojis: 😂\u{2060}😍").collect::<Vec<_>>(),
    ///            vec![Word::from("Emojis: "),
    ///                 Word::from("😂\u{2060}😍")]);
    /// }
    /// ```
    ///
    /// The Unicode line breaking algorithm will also automatically
    /// suppress line breaks around certain punctuation characters:
    ///
    /// ```
    /// #[cfg(feature = "unicode-linebreak")] {
    /// use textwrap::core::Word;
    /// use textwrap::WordSeparator::UnicodeBreakProperties;
    ///
    /// assert_eq!(UnicodeBreakProperties.find_words("[ foo ] bar !").collect::<Vec<_>>(),
    ///            vec![Word::from("[ foo ] "),
    ///                 Word::from("bar !")]);
    /// }
    /// ```
    #[cfg(feature = "unicode-linebreak")]
    UnicodeBreakProperties,

    /// Find words using a custom word separator.
    ///
    /// The wrapped function receives the line and returns a boxed
    /// iterator over the words it found in that line.
    Custom(fn(line: &str) -> Box<dyn Iterator<Item = Word<'_>> + '_>),
}
| |
| impl PartialEq for WordSeparator { |
| /// Compare two word separators. |
| /// |
| /// ``` |
| /// use textwrap::WordSeparator; |
| /// |
| /// assert_eq!(WordSeparator::AsciiSpace, WordSeparator::AsciiSpace); |
| /// #[cfg(feature = "unicode-linebreak")] { |
| /// assert_eq!(WordSeparator::UnicodeBreakProperties, |
| /// WordSeparator::UnicodeBreakProperties); |
| /// } |
| /// ``` |
| /// |
| /// Note that `WordSeparator::Custom` values never compare equal: |
| /// |
| /// ``` |
| /// use textwrap::WordSeparator; |
| /// use textwrap::core::Word; |
| /// fn word_separator(line: &str) -> Box<dyn Iterator<Item = Word<'_>> + '_> { |
| /// Box::new(line.split_inclusive(' ').map(Word::from)) |
| /// } |
| /// assert_ne!(WordSeparator::Custom(word_separator), |
| /// WordSeparator::Custom(word_separator)); |
| /// ``` |
| fn eq(&self, other: &Self) -> bool { |
| match (self, other) { |
| (WordSeparator::AsciiSpace, WordSeparator::AsciiSpace) => true, |
| #[cfg(feature = "unicode-linebreak")] |
| (WordSeparator::UnicodeBreakProperties, WordSeparator::UnicodeBreakProperties) => true, |
| (_, _) => false, |
| } |
| } |
| } |
| |
| impl std::fmt::Debug for WordSeparator { |
| fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { |
| match self { |
| WordSeparator::AsciiSpace => f.write_str("AsciiSpace"), |
| #[cfg(feature = "unicode-linebreak")] |
| WordSeparator::UnicodeBreakProperties => f.write_str("UnicodeBreakProperties"), |
| WordSeparator::Custom(_) => f.write_str("Custom(...)"), |
| } |
| } |
| } |
| |
| impl WordSeparator { |
| /// Create a new word separator. |
| /// |
| /// The best available algorithm is used by default, i.e., |
| /// [`WordSeparator::UnicodeBreakProperties`] if available, |
| /// otherwise [`WordSeparator::AsciiSpace`]. |
| pub const fn new() -> Self { |
| #[cfg(feature = "unicode-linebreak")] |
| { |
| WordSeparator::UnicodeBreakProperties |
| } |
| |
| #[cfg(not(feature = "unicode-linebreak"))] |
| { |
| WordSeparator::AsciiSpace |
| } |
| } |
| |
| // This function should really return impl Iterator<Item = Word>, but |
| // this isn't possible until Rust supports higher-kinded types: |
| // https://github.com/rust-lang/rfcs/blob/master/text/1522-conservative-impl-trait.md |
| /// Find all words in `line`. |
| pub fn find_words<'a>(&self, line: &'a str) -> Box<dyn Iterator<Item = Word<'a>> + 'a> { |
| match self { |
| WordSeparator::AsciiSpace => find_words_ascii_space(line), |
| #[cfg(feature = "unicode-linebreak")] |
| WordSeparator::UnicodeBreakProperties => find_words_unicode_break_properties(line), |
| WordSeparator::Custom(func) => func(line), |
| } |
| } |
| } |
| |
| fn find_words_ascii_space<'a>(line: &'a str) -> Box<dyn Iterator<Item = Word<'a>> + 'a> { |
| let mut start = 0; |
| let mut in_whitespace = false; |
| let mut char_indices = line.char_indices(); |
| |
| Box::new(std::iter::from_fn(move || { |
| for (idx, ch) in char_indices.by_ref() { |
| if in_whitespace && ch != ' ' { |
| let word = Word::from(&line[start..idx]); |
| start = idx; |
| in_whitespace = ch == ' '; |
| return Some(word); |
| } |
| |
| in_whitespace = ch == ' '; |
| } |
| |
| if start < line.len() { |
| let word = Word::from(&line[start..]); |
| start = line.len(); |
| return Some(word); |
| } |
| |
| None |
| })) |
| } |
| |
// Return a copy of `text` with all ANSI escape sequences removed.
#[cfg(feature = "unicode-linebreak")]
fn strip_ansi_escape_sequences(text: &str) -> String {
    let mut remaining = text.chars();
    // The stripped text can never be longer than the input.
    let mut plain = String::with_capacity(text.len());

    while let Some(ch) = remaining.next() {
        // NOTE(review): the helper presumably consumes the rest of an
        // escape sequence from `remaining` when it returns `true` --
        // confirm against `crate::core::skip_ansi_escape_sequence`.
        let is_escape = skip_ansi_escape_sequence(ch, &mut remaining);
        if !is_escape {
            plain.push(ch);
        }
    }

    plain
}
| |
/// Soft hyphen, also known as a "shy hyphen". Should show up as "-"
/// if a line is broken at this point, and otherwise be invisible.
/// Textwrap does not currently support breaking words at soft
/// hyphens.
#[cfg(feature = "unicode-linebreak")]
const SHY: char = '\u{00ad}';
| |
/// Find words in `line` using the Unicode line breaking algorithm.
///
/// Break opportunities are computed on a copy of `line` with the ANSI
/// escape sequences stripped, and are then mapped back so that the
/// returned words are slices of the original `line`.
#[cfg(feature = "unicode-linebreak")]
fn find_words_unicode_break_properties<'a>(
    line: &'a str,
) -> Box<dyn Iterator<Item = Word<'a>> + 'a> {
    // Lazily yields (original index, stripped index) pairs. The break
    // positions refer to the stripped string, but words are cut out of
    // the original string, so both offsets are needed.
    let mut stripped_pos = 0;
    let mut orig_chars = line.char_indices();
    let mut positions = std::iter::from_fn(move || {
        let (orig_pos, ch) = orig_chars.next()?;
        let pair = (orig_pos, stripped_pos);
        let mut following = orig_chars.by_ref().map(|(_, ch)| ch);
        // Only characters outside escape sequences advance the
        // stripped offset.
        if !skip_ansi_escape_sequence(ch, &mut following) {
            stripped_pos += ch.len_utf8();
        }
        Some(pair)
    });

    let stripped = strip_ansi_escape_sequences(line);
    let mut breaks: Vec<usize> = unicode_linebreak::linebreaks(&stripped)
        .map(|(pos, _)| pos)
        .filter(|&pos| {
            // No break right after "-": that is left to the
            // WordSplitter. No break after a soft hyphen either: they
            // are unsupported because every `Word` fragment must be
            // continuous in the input string.
            !matches!(
                stripped[..pos].chars().next_back(),
                Some('-') | Some(SHY)
            )
        })
        .collect();

    // Drop the final break opportunity. The last word is produced
    // below from `&line[word_start..]` instead, which makes sure a
    // trailing ANSI escape sequence is included.
    breaks.pop();
    let mut breaks = breaks.into_iter();

    let mut word_start = 0;
    Box::new(std::iter::from_fn(move || {
        for break_pos in breaks.by_ref() {
            let located = positions.find(|&(_, stripped_idx)| stripped_idx == break_pos);
            if let Some((orig_pos, _)) = located {
                let begin = std::mem::replace(&mut word_start, orig_pos);
                return Some(Word::from(&line[begin..orig_pos]));
            }
        }

        if word_start < line.len() {
            let begin = std::mem::replace(&mut word_start, line.len());
            return Some(Word::from(&line[begin..]));
        }

        None
    }))
}
| |
// Unit tests for the word separators. Most cases are generated by
// `test_find_words!`, which checks the same input line against both
// `AsciiSpace` and, when the feature is enabled, `UnicodeBreakProperties`.
#[cfg(test)]
mod tests {
    use super::WordSeparator::*;
    use super::*;

    // Like assert_eq!, but the left expression is an iterator.
    macro_rules! assert_iter_eq {
        ($left:expr, $right:expr) => {
            assert_eq!($left.collect::<Vec<_>>(), $right);
        };
    }

    // Convert a list of string slices into the `Word`s they denote.
    fn to_words(words: Vec<&str>) -> Vec<Word<'_>> {
        words.into_iter().map(Word::from).collect()
    }

    // Generate a pair of tests named `$ascii_name` and `$unicode_name`.
    // Each `[line, ascii_words, unicode_words]` triple gives an input
    // line and the words expected from `AsciiSpace` and from
    // `UnicodeBreakProperties` respectively. The Unicode test is only
    // compiled when the `unicode-linebreak` feature is enabled.
    macro_rules! test_find_words {
        ($ascii_name:ident,
         $unicode_name:ident,
         $([ $line:expr, $ascii_words:expr, $unicode_words:expr ]),+) => {
            #[test]
            fn $ascii_name() {
                $(
                    let expected_words = to_words($ascii_words.to_vec());
                    let actual_words = WordSeparator::AsciiSpace
                        .find_words($line)
                        .collect::<Vec<_>>();
                    assert_eq!(actual_words, expected_words, "Line: {:?}", $line);
                )+
            }

            #[test]
            #[cfg(feature = "unicode-linebreak")]
            fn $unicode_name() {
                $(
                    let expected_words = to_words($unicode_words.to_vec());
                    let actual_words = WordSeparator::UnicodeBreakProperties
                        .find_words($line)
                        .collect::<Vec<_>>();
                    assert_eq!(actual_words, expected_words, "Line: {:?}", $line);
                )+
            }
        };
    }

    // An empty line contains no words.
    test_find_words!(ascii_space_empty, unicode_empty, ["", [], []]);

    test_find_words!(
        ascii_single_word,
        unicode_single_word,
        ["foo", ["foo"], ["foo"]]
    );

    // Whitespace stays attached to the end of the preceding word.
    test_find_words!(
        ascii_two_words,
        unicode_two_words,
        ["foo bar", ["foo ", "bar"], ["foo ", "bar"]]
    );

    test_find_words!(
        ascii_multiple_words,
        unicode_multiple_words,
        ["foo bar", ["foo ", "bar"], ["foo ", "bar"]],
        ["x y z", ["x ", "y ", "z"], ["x ", "y ", "z"]]
    );

    // A line consisting only of spaces is a single word.
    test_find_words!(
        ascii_only_whitespace,
        unicode_only_whitespace,
        [" ", [" "], [" "]],
        [" ", [" "], [" "]]
    );

    test_find_words!(
        ascii_inter_word_whitespace,
        unicode_inter_word_whitespace,
        ["foo bar", ["foo ", "bar"], ["foo ", "bar"]]
    );

    test_find_words!(
        ascii_trailing_whitespace,
        unicode_trailing_whitespace,
        ["foo ", ["foo "], ["foo "]]
    );

    // Leading whitespace becomes a word of its own.
    test_find_words!(
        ascii_leading_whitespace,
        unicode_leading_whitespace,
        [" foo", [" ", "foo"], [" ", "foo"]]
    );

    test_find_words!(
        ascii_multi_column_char,
        unicode_multi_column_char,
        ["\u{1f920}", ["\u{1f920}"], ["\u{1f920}"]] // cowboy emoji 🤠
    );

    // Neither separator breaks directly after a hyphen; only the
    // surrounding spaces create word boundaries.
    test_find_words!(
        ascii_hyphens,
        unicode_hyphens,
        ["foo-bar", ["foo-bar"], ["foo-bar"]],
        ["foo- bar", ["foo- ", "bar"], ["foo- ", "bar"]],
        ["foo - bar", ["foo ", "- ", "bar"], ["foo ", "- ", "bar"]],
        ["foo -bar", ["foo ", "-bar"], ["foo ", "-bar"]]
    );

    // Only the Unicode separator treats a newline as a break point.
    test_find_words!(
        ascii_newline,
        unicode_newline,
        ["foo\nbar", ["foo\nbar"], ["foo\n", "bar"]]
    );

    // Only the Unicode separator treats a tab as a break point.
    test_find_words!(
        ascii_tab,
        unicode_tab,
        ["foo\tbar", ["foo\tbar"], ["foo\t", "bar"]]
    );

    // A non-breaking space (U+00A0) never separates words.
    test_find_words!(
        ascii_non_breaking_space,
        unicode_non_breaking_space,
        ["foo\u{00A0}bar", ["foo\u{00A0}bar"], ["foo\u{00A0}bar"]]
    );

    // Color escape codes must stay attached to the words they surround.
    #[test]
    #[cfg(unix)]
    fn find_words_colored_text() {
        use termion::color::{Blue, Fg, Green, Reset};

        let green_hello = format!("{}Hello{} ", Fg(Green), Fg(Reset));
        let blue_world = format!("{}World!{}", Fg(Blue), Fg(Reset));
        assert_iter_eq!(
            AsciiSpace.find_words(&format!("{}{}", green_hello, blue_world)),
            vec![Word::from(&green_hello), Word::from(&blue_world)]
        );

        #[cfg(feature = "unicode-linebreak")]
        assert_iter_eq!(
            UnicodeBreakProperties.find_words(&format!("{}{}", green_hello, blue_world)),
            vec![Word::from(&green_hello), Word::from(&blue_world)]
        );
    }

    // Escape sequences in the middle of a word must not split it.
    #[test]
    fn find_words_color_inside_word() {
        let text = "foo\u{1b}[0m\u{1b}[32mbar\u{1b}[0mbaz";
        assert_iter_eq!(AsciiSpace.find_words(text), vec![Word::from(text)]);

        #[cfg(feature = "unicode-linebreak")]
        assert_iter_eq!(
            UnicodeBreakProperties.find_words(text),
            vec![Word::from(text)]
        );
    }

    // `WordSeparator::new` selects the variant matching the enabled features.
    #[test]
    fn word_separator_new() {
        #[cfg(feature = "unicode-linebreak")]
        assert!(matches!(WordSeparator::new(), UnicodeBreakProperties));

        #[cfg(not(feature = "unicode-linebreak"))]
        assert!(matches!(WordSeparator::new(), AsciiSpace));
    }
}