src/word.rs - platform/external/rust/crates/unicode-segmentation - Git at Google

 // Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
 // file at the top-level directory of this distribution and at
 // http://rust-lang.org/COPYRIGHT.
 //
 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
 // option. This file may not be copied, modified, or distributed
 // except according to those terms.

 use core::cmp;
 use core::iter::Filter;

 use tables::word::WordCat;

 /// An iterator over the substrings of a string which, after splitting the string on
 /// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries),
 /// contain any characters with the
 /// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
 /// property, or with
 /// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
 ///
 /// This struct is created by the [`unicode_words`] method on the [`UnicodeSegmentation`] trait. See
 /// its documentation for more.
 ///
 /// [`unicode_words`]: trait.UnicodeSegmentation.html#tymethod.unicode_words
 /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
 pub struct UnicodeWords<'a> {
     // Word-boundary iterator wrapped in a `Filter`. The predicate is held as a
     // plain `fn` pointer; `new_unicode_words` supplies one that keeps only
     // segments containing at least one alphanumeric character.
     inner: Filter<UWordBounds<'a>, fn(&&str) -> bool>,
 }

 impl<'a> Iterator for UnicodeWords<'a> {
     type Item = &'a str;

     /// Returns the next segment accepted by the filter predicate, or `None`
     /// once the underlying word-boundary iterator is exhausted.
     #[inline]
     fn next(&mut self) -> Option<&'a str> { self.inner.next() }

     /// Forwards the bounds reported by the wrapped `Filter` instead of falling
     /// back to the default `(0, None)`, so callers get a usable upper bound
     /// (mirrors what `UWordBoundIndices` already does).
     #[inline]
     fn size_hint(&self) -> (usize, Option<usize>) { self.inner.size_hint() }
 }
 impl<'a> DoubleEndedIterator for UnicodeWords<'a> {
     /// Returns the last remaining segment accepted by the filter predicate,
     /// consuming the underlying iterator from the back.
     #[inline]
     fn next_back(&mut self) -> Option<&'a str> {
         let filtered = &mut self.inner;
         filtered.next_back()
     }
 }

 /// External iterator for a string's
 /// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
 ///
 /// This struct is created by the [`split_word_bounds`] method on the [`UnicodeSegmentation`]
 /// trait. See its documentation for more.
 ///
 /// [`split_word_bounds`]: trait.UnicodeSegmentation.html#tymethod.split_word_bounds
 /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
 #[derive(Clone)]
 pub struct UWordBounds<'a> {
     // The part of the input that has not been yielded yet; both `next` and
     // `next_back` shrink this slice as they emit segments.
     string: &'a str,
     // Word category cached by forward iteration for the first character of
     // `string`; consumed (via `take`) by the following call to `next`.
     cat: Option<WordCat>,
     // Word category cached by backward iteration for the last character of
     // `string`; consumed (via `take`) by the following call to `next_back`.
     catb: Option<WordCat>,
 }

 /// External iterator for word boundaries and byte offsets.
 ///
 /// This struct is created by the [`split_word_bound_indices`] method on the
 /// [`UnicodeSegmentation`] trait. See its documentation for more.
 ///
 /// [`split_word_bound_indices`]: trait.UnicodeSegmentation.html#tymethod.split_word_bound_indices
 /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
 #[derive(Clone)]
 pub struct UWordBoundIndices<'a> {
     // Address of the first byte of the original string (set from `s.as_ptr()`
     // in `new_word_bound_indices`); subtracted from each segment's pointer to
     // obtain that segment's byte offset.
     start_offset: usize,
     // The underlying segment iterator that yields the string slices.
     iter: UWordBounds<'a>,
 }

 impl<'a> UWordBoundIndices<'a> {
     /// Returns the portion of the original string that has not been yielded yet.
     ///
     /// ```rust
     /// # use unicode_segmentation::UnicodeSegmentation;
     /// let mut iter = "Hello world".split_word_bound_indices();
     /// assert_eq!(iter.as_str(), "Hello world");
     /// iter.next();
     /// assert_eq!(iter.as_str(), " world");
     /// iter.next();
     /// assert_eq!(iter.as_str(), "world");
     /// ```
     #[inline]
     pub fn as_str(&self) -> &'a str {
         let remaining: &'a str = self.iter.as_str();
         remaining
     }
 }

 impl<'a> Iterator for UWordBoundIndices<'a> {
     type Item = (usize, &'a str);

     /// Yields the next segment together with its byte offset, which is derived
     /// from the distance between the segment's pointer and `start_offset`.
     #[inline]
     fn next(&mut self) -> Option<(usize, &'a str)> {
         match self.iter.next() {
             Some(segment) => {
                 let offset = segment.as_ptr() as usize - self.start_offset;
                 Some((offset, segment))
             }
             None => None,
         }
     }

     /// Same bounds as the wrapped segment iterator.
     #[inline]
     fn size_hint(&self) -> (usize, Option<usize>) {
         let segments = &self.iter;
         segments.size_hint()
     }
 }

 impl<'a> DoubleEndedIterator for UWordBoundIndices<'a> {
     /// Yields the last remaining segment together with its byte offset, which is
     /// derived from the distance between the segment's pointer and `start_offset`.
     #[inline]
     fn next_back(&mut self) -> Option<(usize, &'a str)> {
         match self.iter.next_back() {
             Some(segment) => {
                 let offset = segment.as_ptr() as usize - self.start_offset;
                 Some((offset, segment))
             }
             None => None,
         }
     }
 }

 // State machine for the word boundary rules. Each variant records what the
 // segment being built currently ends with (forward iteration) or begins with
 // (backward iteration).
 #[derive(Clone,Copy,PartialEq,Eq,Debug)]
 enum UWordBoundsState {
     // Nothing has been consumed for the current segment yet.
     Start,
     // Last significant character had category ALetter.
     Letter,
     // Last significant character had category Hebrew_Letter.
     HLetter,
     // Last significant character had category Numeric.
     Numeric,
     // Last significant character had category Katakana.
     Katakana,
     // Last significant character had category ExtendNumLet.
     ExtendNumLet,
     // Inside a run of Regional_Indicator characters; the payload tracks pairing.
     Regional(RegionalState),
     // Pending state whose outcome depends on the next significant character;
     // the payload says what is acceptable.
     FormatExtend(FormatExtendType),
     // Forward: the segment started with a ZWJ. Backward: an emoji was seen and
     // a preceding ZWJ would keep the segment going (rule WB3c).
     Zwj,
     // An emoji was joined to the segment through a ZWJ (rule WB3c).
     Emoji,
     // Inside a run of WSegSpace characters (rule WB3d).
     WSegSpace,
 }

 // Subtypes for the FormatExtend state in UWordBoundsState: what the state
 // machine is willing to accept as the next significant character.
 #[derive(Clone,Copy,PartialEq,Eq,Debug)]
 enum FormatExtendType {
     // Only produced by backward iteration; handled exactly like `Start`.
     AcceptAny,
     // Scanning Format/Extend/ZWJ characters; nothing else may join them.
     AcceptNone,
     // After MidLetter/MidNumLet/Single_Quote: a letter must follow, otherwise
     // the iterator rewinds to the saved position.
     RequireLetter,
     // After a Double_Quote next to a Hebrew letter: another Hebrew letter must
     // follow, otherwise the iterator rewinds to the saved position.
     RequireHLetter,
     // Around a Single_Quote adjacent to a Hebrew letter (rule WB7a).
     AcceptQLetter,
     // After MidNum/MidNumLet/Single_Quote next to a digit: a Numeric must
     // follow, otherwise the iterator rewinds to the saved position.
     RequireNumeric,
 }

 // Pairing state for Regional_Indicator characters (rule WB13c).
 #[derive(Clone,Copy,PartialEq,Eq,Debug)]
 enum RegionalState {
     // One indicator of a pair has been seen.
     Half,
     // A complete pair has been seen; a further indicator starts a new segment.
     Full,
     // Backward iteration only: the pairing has not been determined yet and is
     // worked out by counting the indicators that precede the current position.
     Unknown,
 }

 fn is_emoji(ch: char) -> bool {
     use tables::emoji;
     emoji::emoji_category(ch).2 == emoji::EmojiCat::EC_Extended_Pictographic
 }

 impl<'a> Iterator for UWordBounds<'a> {
     type Item = &'a str;

     /// Bounds on the number of segments left: at least one if any bytes remain
     /// (zero otherwise), and at most one segment per remaining byte.
     #[inline]
     fn size_hint(&self) -> (usize, Option<usize>) {
         let slen = self.string.len();
         (cmp::min(slen, 1), Some(slen))
     }

     /// Splits the next segment off the front of the remaining string and returns
     /// it, or returns `None` once the remaining string is empty.
     ///
     /// The characters are fed through the `UWordBoundsState` machine until a
     /// boundary is found. If the character at which the scan stopped is not part
     /// of the returned segment, its category may be cached in `self.cat` so the
     /// next call does not have to look it up again.
     #[inline]
     fn next(&mut self) -> Option<&'a str> {
         use self::UWordBoundsState::*;
         use self::FormatExtendType::*;
         use tables::word as wd;
         if self.string.len() == 0 {
             return None;
         }

         // take_curr: whether the character at `idx` belongs to the returned segment.
         let mut take_curr = true;
         // take_cat: whether `cat` should be cached in `self.cat` for the next call
         // (only consulted when take_curr is false).
         let mut take_cat = true;
         let mut idx = 0;
         // saveidx/savecat: position and category to rewind to when a Require*
         // state is not satisfied.
         let mut saveidx = 0;
         let mut state = Start;
         let mut cat = wd::WC_Any;
         let mut savecat = wd::WC_Any;

         // Whether or not the previous category was ZWJ
         // ZWJs get collapsed, so this handles precedence of WB3c over WB4
         let mut prev_zwj;
         // If extend/format/zwj were skipped. Handles precedence of WB3d over WB4
         let mut skipped_format_extend = false;
         for (curr, ch) in self.string.char_indices() {
             idx = curr;
             // `cat` still holds the previous character's category at this point
             prev_zwj = cat == wd::WC_ZWJ;
             // if there's a category cached, grab it
             cat = match self.cat {
                 None => wd::word_category(ch).2,
                 _ => self.cat.take().unwrap()
             };
             take_cat = true;

             // handle rule WB4
             // just skip all format, extend, and zwj chars
             // note that Start is a special case: if there's a bunch of Format | Extend
             // characters at the beginning of a block of text, dump them out as one unit.
             //
             // (This is not obvious from the wording of UAX#29, but if you look at the
             // test cases http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt
             // then the "correct" interpretation of WB4 becomes apparent.)
             if state != Start {
                 match cat {
                     wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => {
                         skipped_format_extend = true;
                         continue
                     }
                     _ => {}
                 }
             }

             // rule WB3c
             // WB4 makes all ZWJs collapse into the previous state
             // but you can still be in a Zwj state if you started with Zwj
             //
             // This means that an EP + Zwj will collapse into EP, which is wrong,
             // since EP+EP is not a boundary but EP+ZWJ+EP is
             //
             // Thus, we separately keep track of whether or not the last character
             // was a ZWJ. This is an additional bit of state tracked outside of the
             // state enum; the state enum represents the last non-zwj state encountered.
             // When prev_zwj is true, for the purposes of WB3c, we are in the Zwj state,
             // however we are in the previous state for the purposes of all other rules.
             if prev_zwj {
                 if is_emoji(ch) {
                     state = Emoji;
                     continue;
                 }
             }
             // Don't use `continue` in this match without updating `cat`
             state = match state {
                 Start if cat == wd::WC_CR => {
                     // a CR is always emitted on its own, together with a directly
                     // following LF if there is one
                     idx += match self.get_next_cat(idx) {
                         Some(ncat) if ncat == wd::WC_LF => 1,       // rule WB3
                         _ => 0
                     };
                     break;                                          // rule WB3a
                 },
                 Start => match cat {
                     wd::WC_ALetter => Letter,           // rule WB5, WB6, WB9, WB13a
                     wd::WC_Hebrew_Letter => HLetter,    // rule WB5, WB6, WB7a, WB7b, WB9, WB13a
                     wd::WC_Numeric => Numeric,          // rule WB8, WB10, WB12, WB13a
                     wd::WC_Katakana => Katakana,        // rule WB13, WB13a
                     wd::WC_ExtendNumLet => ExtendNumLet,    // rule WB13a, WB13b
                     wd::WC_Regional_Indicator => Regional(RegionalState::Half),  // rule WB13c
                     wd::WC_LF | wd::WC_Newline => break,    // rule WB3a
                     wd::WC_ZWJ => Zwj,                      // rule WB3c
                     wd::WC_WSegSpace => WSegSpace,          // rule WB3d
                     _ => {
                         // any other character: it only absorbs Format/Extend/ZWJ
                         // characters that directly follow it
                         if let Some(ncat) = self.get_next_cat(idx) {                // rule WB4
                             if ncat == wd::WC_Format || ncat == wd::WC_Extend || ncat == wd::WC_ZWJ {
                                 state = FormatExtend(AcceptNone);
                                 self.cat = Some(ncat);
                                 continue;
                             }
                         }
                         break;                                                      // rule WB999
                     }
                 },
                 WSegSpace => match cat {
                     wd::WC_WSegSpace if !skipped_format_extend => WSegSpace,
                     _ => {
                         take_curr = false;
                         break;
                     }
                 },
                 Zwj => {
                     // We already handle WB3c above.
                     take_curr = false;
                     break;
                 }
                 Letter | HLetter => match cat {
                     wd::WC_ALetter => Letter,                   // rule WB5
                     wd::WC_Hebrew_Letter => HLetter,            // rule WB5
                     wd::WC_Numeric => Numeric,                  // rule WB9
                     wd::WC_ExtendNumLet => ExtendNumLet,        // rule WB13a
                     wd::WC_Double_Quote if state == HLetter => {
                         savecat = cat;
                         saveidx = idx;
                         FormatExtend(RequireHLetter)                        // rule WB7b
                     },
                     wd::WC_Single_Quote if state == HLetter => {
                         FormatExtend(AcceptQLetter)                         // rule WB7a
                     },
                     wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => {
                         savecat = cat;
                         saveidx = idx;
                         FormatExtend(RequireLetter)                         // rule WB6
                     },
                     _ => {
                         take_curr = false;
                         break;
                     }
                 },
                 Numeric => match cat {
                     wd::WC_Numeric => Numeric,                  // rule WB8
                     wd::WC_ALetter => Letter,                   // rule WB10
                     wd::WC_Hebrew_Letter => HLetter,            // rule WB10
                     wd::WC_ExtendNumLet => ExtendNumLet,        // rule WB13a
                     wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => {
                         savecat = cat;
                         saveidx = idx;
                         FormatExtend(RequireNumeric)            // rule WB12
                     },
                     _ => {
                         take_curr = false;
                         break;
                     }
                 },
                 Katakana => match cat {
                     wd::WC_Katakana => Katakana,                // rule WB13
                     wd::WC_ExtendNumLet => ExtendNumLet,        // rule WB13a
                     _ => {
                         take_curr = false;
                         break;
                     }
                 },
                 ExtendNumLet => match cat {
                     wd::WC_ExtendNumLet => ExtendNumLet,        // rule WB13a
                     wd::WC_ALetter => Letter,                   // rule WB13b
                     wd::WC_Hebrew_Letter => HLetter,            // rule WB13b
                     wd::WC_Numeric => Numeric,                  // rule WB13b
                     wd::WC_Katakana => Katakana,                // rule WB13b
                     _ => {
                         take_curr = false;
                         break;
                     }
                 },
                 Regional(RegionalState::Full) => {
                     // if it reaches here we've gone too far,
                     // a full flag can only compose with ZWJ/Extend/Format
                     // following it.
                     take_curr = false;
                     break;
                 }
                 Regional(RegionalState::Half) => match cat {
                     wd::WC_Regional_Indicator => Regional(RegionalState::Full),      // rule WB13c
                     _ => {
                         take_curr = false;
                         break;
                     }
                 },
                 Regional(_) => unreachable!("RegionalState::Unknown should not occur on forward iteration"),
                 Emoji => {
                     // We already handle WB3c above. If you've reached this point, the emoji sequence is over.
                     take_curr = false;
                     break;
                 },
                 FormatExtend(t) => match t {    // handle FormatExtends depending on what type
                     RequireNumeric if cat == wd::WC_Numeric => Numeric,     // rule WB11
                     RequireLetter | AcceptQLetter if cat == wd::WC_ALetter => Letter,   // rule WB7
                     RequireLetter | AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // WB7a
                     RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter,   // rule WB7b
                     AcceptNone | AcceptQLetter => {
                         take_curr = false;  // emit all the Format|Extend characters
                         take_cat = false;
                         break;
                     },
                     _ => break      // rewind (in if statement below)
                 }
             }
         }

         if let FormatExtend(t) = state {
             // we were looking for something and didn't find it; we have to back up
             if t == RequireLetter || t == RequireHLetter || t == RequireNumeric {
                 idx = saveidx;
                 cat = savecat;
                 take_curr = false;
             }
         }

         // If the current character is part of the segment, move `idx` past it;
         // otherwise optionally remember its category for the next call.
         self.cat = if take_curr {
             idx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
             None
         } else if take_cat {
             Some(cat)
         } else {
             None
         };

         // Split at `idx`: the front part is returned, the rest stays in `self.string`.
         let retstr = &self.string[..idx];
         self.string = &self.string[idx..];
         Some(retstr)
     }
 }

 impl<'a> DoubleEndedIterator for UWordBounds<'a> {
     /// Splits the last segment off the back of the remaining string and returns
     /// it, or returns `None` once the remaining string is empty.
     ///
     /// The characters are walked in reverse through the `UWordBoundsState`
     /// machine until a boundary is found. If the character at which the scan
     /// stopped is not part of the returned segment, its category may be cached
     /// in `self.catb` for the next backward call.
     #[inline]
     fn next_back(&mut self) -> Option<&'a str> {
         use self::UWordBoundsState::*;
         use self::FormatExtendType::*;
         use tables::word as wd;
         if self.string.len() == 0 {
             return None;
         }

         // take_curr: whether the character at `idx` belongs to the returned segment.
         let mut take_curr = true;
         // take_cat: whether `cat` should be cached in `self.catb` for the next call
         // (only consulted when take_curr is false).
         let mut take_cat = true;
         // start at the byte index of the last character
         let mut idx = self.string.len();
         idx -= self.string.chars().next_back().unwrap().len_utf8();
         // previdx: index of the character visited just before the current one,
         // i.e. the start of the segment if the current character is rejected.
         let mut previdx = idx;
         // saveidx/savestate: where (and in which state) to resume or cut when a
         // tentative FormatExtend scan does not work out.
         let mut saveidx = idx;
         let mut state = Start;
         let mut savestate = Start;
         let mut cat = wd::WC_Any;

         let mut skipped_format_extend = false;

         for (curr, ch) in self.string.char_indices().rev() {
             previdx = idx;
             idx = curr;

             // if there's a category cached, grab it
             cat = match self.catb {
                 None => wd::word_category(ch).2,
                 _ => self.catb.take().unwrap()
             };
             take_cat = true;

             // backward iterator over word boundaries. Mostly the same as the forward
             // iterator, with two weirdnesses:
             // (1) If we encounter a single quote in the Start state, we have to check for a
             //     Hebrew Letter immediately before it.
             // (2) Format and Extend char handling takes some gymnastics.

             if cat == wd::WC_Extend
                 || cat == wd::WC_Format
                 || (cat == wd::WC_ZWJ && state != Zwj) { // WB3c has more priority so we should not
                                                          // fold in that case
                 // entering a run of Format/Extend/ZWJ from a "real" state: remember
                 // that state and where the segment would have to be cut
                 if match state {
                     FormatExtend(_) | Start => false,
                     _ => true
                 } {
                     saveidx = previdx;
                     savestate = state;
                     state = FormatExtend(AcceptNone);
                 }

                 if state != Start {
                     continue;
                 }
             } else if state == FormatExtend(AcceptNone) {
                 // finished a scan of some Format|Extend chars, restore previous state
                 state = savestate;
                 previdx = saveidx;
                 take_cat = false;
                 skipped_format_extend = true;
             }

             // Don't use `continue` in this match without updating `catb`
             state = match state {
                 Start | FormatExtend(AcceptAny) => match cat {
                     // an emoji may be preceded by a ZWJ that keeps the segment going
                     _ if is_emoji(ch) => Zwj,
                     wd::WC_ALetter => Letter,           // rule WB5, WB7, WB10, WB13b
                     wd::WC_Hebrew_Letter => HLetter,    // rule WB5, WB7, WB7c, WB10, WB13b
                     wd::WC_Numeric => Numeric,          // rule WB8, WB9, WB11, WB13b
                     wd::WC_Katakana => Katakana,                    // rule WB13, WB13b
                     wd::WC_ExtendNumLet => ExtendNumLet,                    // rule WB13a
                     wd::WC_Regional_Indicator => Regional(RegionalState::Unknown), // rule WB13c
                     // rule WB4:
                     wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => FormatExtend(AcceptAny),
                     wd::WC_Single_Quote => {
                         saveidx = idx;
                         FormatExtend(AcceptQLetter)                         // rule WB7a
                     },
                     wd::WC_WSegSpace => WSegSpace,
                     wd::WC_CR | wd::WC_LF | wd::WC_Newline => {
                         if state == Start {
                             // an LF at the very end takes a directly preceding CR with it
                             if cat == wd::WC_LF {
                                 idx -= match self.get_prev_cat(idx) {
                                     Some(pcat) if pcat == wd::WC_CR => 1,   // rule WB3
                                     _ => 0
                                 };
                             }
                         } else {
                             take_curr = false;
                         }
                         break;                                              // rule WB3a
                     },
                     _ => break                              // rule WB999
                 },
                 Zwj => match cat {                          // rule WB3c
                     wd::WC_ZWJ => {
                         FormatExtend(AcceptAny)
                     }
                     _ => {
                         take_curr = false;
                         break;
                     }
                 },
                 WSegSpace => match cat {                          // rule WB3d
                     wd::WC_WSegSpace if !skipped_format_extend => {
                         WSegSpace
                     }
                     _ => {
                         take_curr = false;
                         break;
                     }
                 },
                 Letter | HLetter => match cat {
                     wd::WC_ALetter => Letter,               // rule WB5
                     wd::WC_Hebrew_Letter => HLetter,        // rule WB5
                     wd::WC_Numeric => Numeric,              // rule WB10
                     wd::WC_ExtendNumLet => ExtendNumLet,    // rule WB13b
                     wd::WC_Double_Quote if state == HLetter => {
                         saveidx = previdx;
                         FormatExtend(RequireHLetter)         // rule WB7c
                     },
                     wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => {
                         saveidx = previdx;
                         FormatExtend(RequireLetter)          // rule WB7
                     },
                     _ => {
                         take_curr = false;
                         break;
                     }
                 },
                 Numeric => match cat {
                     wd::WC_Numeric => Numeric,              // rule WB8
                     wd::WC_ALetter => Letter,               // rule WB9
                     wd::WC_Hebrew_Letter => HLetter,        // rule WB9
                     wd::WC_ExtendNumLet => ExtendNumLet,    // rule WB13b
                     wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => {
                         saveidx = previdx;
                         FormatExtend(RequireNumeric)         // rule WB11
                     },
                     _ => {
                         take_curr = false;
                         break;
                     }
                 },
                 Katakana => match cat {
                     wd::WC_Katakana => Katakana,            // rule WB13
                     wd::WC_ExtendNumLet => ExtendNumLet,    // rule WB13b
                     _ => {
                         take_curr = false;
                         break;
                     }
                 },
                 ExtendNumLet => match cat {
                     wd::WC_ExtendNumLet => ExtendNumLet,    // rule WB13a
                     wd::WC_ALetter => Letter,               // rule WB13a
                     wd::WC_Hebrew_Letter => HLetter,        // rule WB13a
                     wd::WC_Numeric => Numeric,              // rule WB13a
                     wd::WC_Katakana => Katakana,            // rule WB13a
                     _ => {
                         take_curr = false;
                         break;
                     }
                 },
                 Regional(mut regional_state) => match cat {
                     // rule WB13c
                     wd::WC_Regional_Indicator => {
                         if regional_state == RegionalState::Unknown {
                             // Count the Regional_Indicator characters before the one
                             // already taken (Format/Extend/ZWJ are ignored); the
                             // parity of that count decides the pairing.
                             let count = self.string[..previdx]
                                             .chars().rev()
                                             .map(|c| wd::word_category(c).2)
                                             .filter(|&c| ! (c == wd::WC_ZWJ || c == wd::WC_Extend || c == wd::WC_Format))
                                             .take_while(|&c| c == wd::WC_Regional_Indicator)
                                             .count();
                             regional_state = if count % 2 == 0 {
                                 RegionalState::Full
                             } else {
                                 RegionalState::Half
                             };
                         }
                         if regional_state == RegionalState::Full {
                             take_curr = false;
                             break;
                         } else {
                             Regional(RegionalState::Full)
                         }
                     }
                     _ => {
                         take_curr = false;
                         break;
                     }
                 },
                 Emoji => {
                     if is_emoji(ch) {           // rule WB3c
                         Zwj
                     } else {
                         take_curr = false;
                         break;
                     }
                 },
                 FormatExtend(t) => match t {
                     RequireNumeric if cat == wd::WC_Numeric => Numeric,          // rule WB12
                     RequireLetter if cat == wd::WC_ALetter => Letter,            // rule WB6
                     RequireLetter if cat == wd::WC_Hebrew_Letter => HLetter,     // rule WB6
                     AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter,     // rule WB7a
                     RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter,    // rule WB7b
                     _ => break  // backtracking happens below
                 }
             }
         }

         if let FormatExtend(t) = state {
             // if we required something but didn't find it, backtrack
             if t == RequireLetter || t == RequireHLetter ||
                 t == RequireNumeric || t == AcceptNone || t == AcceptQLetter {
                 previdx = saveidx;
                 take_cat = false;
                 take_curr = false;
             }
         }

         // If the current character is rejected, the segment starts at `previdx`
         // and the rejected character's category may be cached for the next call.
         self.catb = if take_curr {
             None
         } else {
             idx = previdx;
             if take_cat {
                 Some(cat)
             } else {
                 None
             }
         };

         // Split at `idx`: the back part is returned, the rest stays in `self.string`.
         let retstr = &self.string[idx..];
         self.string = &self.string[..idx];
         Some(retstr)
     }
 }

 impl<'a> UWordBounds<'a> {
     /// Returns the portion of the original string that has not been yielded yet.
     ///
     /// ```rust
     /// # use unicode_segmentation::UnicodeSegmentation;
     /// let mut iter = "Hello world".split_word_bounds();
     /// assert_eq!(iter.as_str(), "Hello world");
     /// iter.next();
     /// assert_eq!(iter.as_str(), " world");
     /// iter.next();
     /// assert_eq!(iter.as_str(), "world");
     /// ```
     #[inline]
     pub fn as_str(&self) -> &'a str {
         let remaining: &'a str = self.string;
         remaining
     }

     /// Word category of the character that follows the one starting at byte
     /// `idx`, or `None` if that character is the last one in the string.
     ///
     /// Panics if no character starts at `idx`.
     #[inline]
     fn get_next_cat(&self, idx: usize) -> Option<WordCat> {
         use tables::word as wd;
         let mut tail = self.string[idx..].chars();
         // step over the character at `idx`; it has to exist
         let _current = tail.next().unwrap();
         tail.next().map(|following| wd::word_category(following).2)
     }

     /// Word category of the character that ends right before byte `idx`, or
     /// `None` if `idx` is the start of the string.
     #[inline]
     fn get_prev_cat(&self, idx: usize) -> Option<WordCat> {
         use tables::word as wd;
         let head = &self.string[..idx];
         head.chars()
             .next_back()
             .map(|preceding| wd::word_category(preceding).2)
     }
 }

 /// Creates a word-boundary iterator over `s` with no cached categories.
 #[inline]
 pub fn new_word_bounds<'b>(s: &'b str) -> UWordBounds<'b> {
     UWordBounds {
         string: s,
         cat: None,
         catb: None,
     }
 }

 /// Creates an iterator over the word-boundary segments of `s` paired with their
 /// byte offsets; the address of `s` is recorded as the base for those offsets.
 #[inline]
 pub fn new_word_bound_indices<'b>(s: &'b str) -> UWordBoundIndices<'b> {
     let base_addr = s.as_ptr() as usize;
     let segments = new_word_bounds(s);
     UWordBoundIndices { start_offset: base_addr, iter: segments }
 }

 /// Creates an iterator over the word-boundary segments of `s` that contain at
 /// least one alphanumeric character.
 #[inline]
 pub fn new_unicode_words<'b>(s: &'b str) -> UnicodeWords<'b> {
     use super::UnicodeSegmentation;
     use tables::util::is_alphanumeric;

     // Keeps a segment only if some character in it is alphanumeric.
     fn contains_alphanumeric(segment: &&str) -> bool {
         segment.chars().any(|ch| is_alphanumeric(ch))
     }
     // The struct field is typed as a fn pointer, so coerce the fn item explicitly.
     let keep: fn(&&str) -> bool = contains_alphanumeric;

     let segments = s.split_word_bounds();
     UnicodeWords { inner: segments.filter(keep) }
 }
	// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
	// file at the top-level directory of this distribution and at
	// http://rust-lang.org/COPYRIGHT.
	//
	// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
	// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
	// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
	// option. This file may not be copied, modified, or distributed
	// except according to those terms.

	use core::cmp;
	use core::iter::Filter;

	use tables::word::WordCat;

	/// An iterator over the substrings of a string which, after splitting the string on
	/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries),
	/// contain any characters with the
	/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
	/// property, or with
	/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
	///
	/// This struct is created by the [`unicode_words`] method on the [`UnicodeSegmentation`] trait. See
	/// its documentation for more.
	///
	/// [`unicode_words`]: trait.UnicodeSegmentation.html#tymethod.unicode_words
	/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
	pub struct UnicodeWords<'a> {
	// Word-boundary iterator wrapped in a `Filter` whose predicate is a plain `fn` pointer.
	inner: Filter<UWordBounds<'a>, fn(&&str) -> bool>,
	}

	impl<'a> Iterator for UnicodeWords<'a> {
	    type Item = &'a str;

	    /// Returns the next segment accepted by the filter predicate, or `None`
	    /// once the underlying word-boundary iterator is exhausted.
	    #[inline]
	    fn next(&mut self) -> Option<&'a str> { self.inner.next() }

	    /// Forwards the bounds reported by the wrapped `Filter` instead of falling
	    /// back to the default `(0, None)`.
	    #[inline]
	    fn size_hint(&self) -> (usize, Option<usize>) { self.inner.size_hint() }
	}
	impl<'a> DoubleEndedIterator for UnicodeWords<'a> {
	    /// Returns the last remaining segment accepted by the filter predicate.
	    #[inline]
	    fn next_back(&mut self) -> Option<&'a str> {
	        let filtered = &mut self.inner;
	        filtered.next_back()
	    }
	}

	/// External iterator for a string's
	/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
	///
	/// This struct is created by the [`split_word_bounds`] method on the [`UnicodeSegmentation`]
	/// trait. See its documentation for more.
	///
	/// [`split_word_bounds`]: trait.UnicodeSegmentation.html#tymethod.split_word_bounds
	/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
	#[derive(Clone)]
	pub struct UWordBounds<'a> {
	// The part of the input that has not been yielded yet.
	string: &'a str,
	// Word category cached for the front of `string` (forward iteration).
	cat: Option<WordCat>,
	// Word category cached for the back of `string`; presumably used by
	// backward iteration, which is not visible in this copy — verify.
	catb: Option<WordCat>,
	}

	/// External iterator for word boundaries and byte offsets.
	///
	/// This struct is created by the [`split_word_bound_indices`] method on the
	/// [`UnicodeSegmentation`] trait. See its documentation for more.
	///
	/// [`split_word_bound_indices`]: trait.UnicodeSegmentation.html#tymethod.split_word_bound_indices
	/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
	#[derive(Clone)]
	pub struct UWordBoundIndices<'a> {
	// Base address subtracted from each segment's pointer to obtain its byte offset.
	start_offset: usize,
	// The underlying segment iterator that yields the string slices.
	iter: UWordBounds<'a>,
	}

	impl<'a> UWordBoundIndices<'a> {
	    /// Returns the portion of the original string that has not been yielded yet.
	    ///
	    /// ```rust
	    /// # use unicode_segmentation::UnicodeSegmentation;
	    /// let mut iter = "Hello world".split_word_bound_indices();
	    /// assert_eq!(iter.as_str(), "Hello world");
	    /// iter.next();
	    /// assert_eq!(iter.as_str(), " world");
	    /// iter.next();
	    /// assert_eq!(iter.as_str(), "world");
	    /// ```
	    #[inline]
	    pub fn as_str(&self) -> &'a str {
	        let remaining: &'a str = self.iter.as_str();
	        remaining
	    }
	}

	impl<'a> Iterator for UWordBoundIndices<'a> {
	type Item = (usize, &'a str);

	#[inline]
	fn next(&mut self) -> Option<(usize, &'a str)> {
	self.iter.next().map(\|s\| (s.as_ptr() as usize - self.start_offset, s))
	}

	#[inline]
	fn size_hint(&self) -> (usize, Option<usize>) {
	self.iter.size_hint()
	}
	}

	impl<'a> DoubleEndedIterator for UWordBoundIndices<'a> {
	#[inline]
	fn next_back(&mut self) -> Option<(usize, &'a str)> {
	self.iter.next_back().map(\|s\| (s.as_ptr() as usize - self.start_offset, s))
	}
	}

	// State machine for the word boundary rules: one variant per kind of
	// position the segmentation scan can be in.
	#[derive(Clone,Copy,PartialEq,Eq,Debug)]
	enum UWordBoundsState {
	// Nothing has been consumed for the current segment yet.
	Start,
	Letter,
	HLetter,
	Numeric,
	Katakana,
	ExtendNumLet,
	// Regional-indicator run; the payload tracks pairing.
	Regional(RegionalState),
	// Pending state; the payload says what may come next.
	FormatExtend(FormatExtendType),
	Zwj,
	Emoji,
	WSegSpace,
	}

	// Subtypes for the FormatExtend state in UWordBoundsState: what the state
	// machine is willing to accept as the next significant character.
	#[derive(Clone,Copy,PartialEq,Eq,Debug)]
	enum FormatExtendType {
	AcceptAny,
	AcceptNone,
	RequireLetter,
	RequireHLetter,
	AcceptQLetter,
	RequireNumeric,
	}

	// Pairing progress for Regional_Indicator chars (rule WB13c).
	#[derive(Clone,Copy,PartialEq,Eq,Debug)]
	enum RegionalState {
	// One indicator has been seen; a second one may still pair with it.
	Half,
	// A pair is complete; a further indicator must start a new segment.
	Full,
	// Used by backward iteration before it has counted the preceding indicators.
	Unknown,
	}

	fn is_emoji(ch: char) -> bool {
	use tables::emoji;
	emoji::emoji_category(ch).2 == emoji::EmojiCat::EC_Extended_Pictographic
	}

	impl<'a> Iterator for UWordBounds<'a> {
	type Item = &'a str;

	// Lower bound: 1 if anything remains, otherwise 0.
	// Upper bound: the byte length of what remains.
	#[inline]
	fn size_hint(&self) -> (usize, Option<usize>) {
	let slen = self.string.len();
	(cmp::min(slen, 1), Some(slen))
	}

	/// Splits the next segment off the front of the remaining string and returns it,
	/// or returns `None` once nothing remains.
	///
	/// The end of the segment is found by running the word-boundary state machine
	/// over the remaining chars, starting from the `Start` state.
	#[inline]
	fn next(&mut self) -> Option<&'a str> {
	use self::UWordBoundsState::*;
	use self::FormatExtendType::*;
	use tables::word as wd;
	if self.string.len() == 0 {
	return None;
	}

	// take_curr: whether the char at `idx` belongs to the returned segment.
	let mut take_curr = true;
	// take_cat: whether `cat` should be cached in `self.cat` when the current char
	// is left for the next call.
	let mut take_cat = true;
	// idx: byte index of the char currently being examined.
	let mut idx = 0;
	// saveidx / savecat: position and category of a mid-word char, kept so that the
	// segment can be cut just before it if the required follower never shows up.
	let mut saveidx = 0;
	let mut state = Start;
	let mut cat = wd::WC_Any;
	let mut savecat = wd::WC_Any;

	// Whether or not the previous category was ZWJ
	// ZWJs get collapsed, so this handles precedence of WB3c over WB4
	let mut prev_zwj;
	// If extend/format/zwj were skipped. Handles precedence of WB3d over WB4
	let mut skipped_format_extend = false;
	for (curr, ch) in self.string.char_indices() {
	idx = curr;
	prev_zwj = cat == wd::WC_ZWJ;
	// if there's a category cached, grab it
	cat = match self.cat {
	None => wd::word_category(ch).2,
	_ => self.cat.take().unwrap()
	};
	take_cat = true;

	// handle rule WB4
	// just skip all format, extend, and zwj chars
	// note that Start is a special case: if there's a bunch of Format \| Extend
	// characters at the beginning of a block of text, dump them out as one unit.
	//
	// (This is not obvious from the wording of UAX#29, but if you look at the
	// test cases http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt
	// then the "correct" interpretation of WB4 becomes apparent.)
	if state != Start {
	match cat {
	wd::WC_Extend \| wd::WC_Format \| wd::WC_ZWJ => {
	skipped_format_extend = true;
	continue
	}
	_ => {}
	}
	}

	// rule WB3c
	// WB4 makes all ZWJs collapse into the previous state
	// but you can still be in a Zwj state if you started with Zwj
	//
	// This means that an EP + Zwj will collapse into EP, which is wrong,
	// since EP+EP is not a boundary but EP+ZWJ+EP is
	//
	// Thus, we separately keep track of whether or not the last character
	// was a ZWJ. This is an additional bit of state tracked outside of the
	// state enum; the state enum represents the last non-zwj state encountered.
	// When prev_zwj is true, for the purposes of WB3c, we are in the Zwj state,
	// however we are in the previous state for the purposes of all other rules.
	if prev_zwj {
	if is_emoji(ch) {
	state = Emoji;
	continue;
	}
	}
	// Don't use `continue` in this match without updating `cat`
	state = match state {
	Start if cat == wd::WC_CR => {
	idx += match self.get_next_cat(idx) {
	Some(ncat) if ncat == wd::WC_LF => 1, // rule WB3
	_ => 0
	};
	break; // rule WB3a
	},
	Start => match cat {
	wd::WC_ALetter => Letter, // rule WB5, WB6, WB9, WB13a
	wd::WC_Hebrew_Letter => HLetter, // rule WB5, WB6, WB7a, WB7b, WB9, WB13a
	wd::WC_Numeric => Numeric, // rule WB8, WB10, WB12, WB13a
	wd::WC_Katakana => Katakana, // rule WB13, WB13a
	wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a, WB13b
	wd::WC_Regional_Indicator => Regional(RegionalState::Half), // rule WB13c
	wd::WC_LF \| wd::WC_Newline => break, // rule WB3a
	wd::WC_ZWJ => Zwj, // rule WB3c
	wd::WC_WSegSpace => WSegSpace, // rule WB3d
	_ => {
	// Any other category: the char stands alone unless Format/Extend/ZWJ
	// chars follow it, in which case they are attached to it. The category
	// just looked up is cached so the next loop iteration reuses it.
	if let Some(ncat) = self.get_next_cat(idx) { // rule WB4
	if ncat == wd::WC_Format \|\| ncat == wd::WC_Extend \|\| ncat == wd::WC_ZWJ {
	state = FormatExtend(AcceptNone);
	self.cat = Some(ncat);
	continue;
	}
	}
	break; // rule WB999
	}
	},
	WSegSpace => match cat {
	wd::WC_WSegSpace if !skipped_format_extend => WSegSpace,
	_ => {
	take_curr = false;
	break;
	}
	},
	Zwj => {
	// We already handle WB3c above.
	take_curr = false;
	break;
	}
	Letter \| HLetter => match cat {
	wd::WC_ALetter => Letter, // rule WB5
	wd::WC_Hebrew_Letter => HLetter, // rule WB5
	wd::WC_Numeric => Numeric, // rule WB9
	wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
	wd::WC_Double_Quote if state == HLetter => {
	savecat = cat;
	saveidx = idx;
	FormatExtend(RequireHLetter) // rule WB7b
	},
	wd::WC_Single_Quote if state == HLetter => {
	FormatExtend(AcceptQLetter) // rule WB7a
	},
	wd::WC_MidLetter \| wd::WC_MidNumLet \| wd::WC_Single_Quote => {
	savecat = cat;
	saveidx = idx;
	FormatExtend(RequireLetter) // rule WB6
	},
	_ => {
	take_curr = false;
	break;
	}
	},
	Numeric => match cat {
	wd::WC_Numeric => Numeric, // rule WB8
	wd::WC_ALetter => Letter, // rule WB10
	wd::WC_Hebrew_Letter => HLetter, // rule WB10
	wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
	wd::WC_MidNum \| wd::WC_MidNumLet \| wd::WC_Single_Quote => {
	savecat = cat;
	saveidx = idx;
	FormatExtend(RequireNumeric) // rule WB12
	},
	_ => {
	take_curr = false;
	break;
	}
	},
	Katakana => match cat {
	wd::WC_Katakana => Katakana, // rule WB13
	wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
	_ => {
	take_curr = false;
	break;
	}
	},
	ExtendNumLet => match cat {
	wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
	wd::WC_ALetter => Letter, // rule WB13b
	wd::WC_Hebrew_Letter => HLetter, // rule WB13b
	wd::WC_Numeric => Numeric, // rule WB13b
	wd::WC_Katakana => Katakana, // rule WB13b
	_ => {
	take_curr = false;
	break;
	}
	},
	Regional(RegionalState::Full) => {
	// if it reaches here we've gone too far,
	// a full flag can only compose with ZWJ/Extend/Format
	// proceeding it.
	take_curr = false;
	break;
	}
	Regional(RegionalState::Half) => match cat {
	wd::WC_Regional_Indicator => Regional(RegionalState::Full), // rule WB13c
	_ => {
	take_curr = false;
	break;
	}
	},
	Regional(_) => unreachable!("RegionalState::Unknown should not occur on forward iteration"),
	Emoji => {
	// We already handle WB3c above. If you've reached this point, the emoji sequence is over.
	take_curr = false;
	break;
	},
	FormatExtend(t) => match t { // handle FormatExtends depending on what type
	RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB11
	RequireLetter \| AcceptQLetter if cat == wd::WC_ALetter => Letter, // rule WB7
	RequireLetter \| AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // WB7a
	RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7b
	AcceptNone \| AcceptQLetter => {
	take_curr = false; // emit all the Format\|Extend characters
	take_cat = false;
	break;
	},
	_ => break // rewind (in if statement below)
	}
	}
	}

	if let FormatExtend(t) = state {
	// we were looking for something and didn't find it; we have to back up
	if t == RequireLetter \|\| t == RequireHLetter \|\| t == RequireNumeric {
	idx = saveidx;
	cat = savecat;
	take_curr = false;
	}
	}

	// Either extend the segment past the current char, or leave that char for the
	// next call and (when allowed) remember its category so it is not looked up again.
	self.cat = if take_curr {
	idx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
	None
	} else if take_cat {
	Some(cat)
	} else {
	None
	};

	// Hand out everything before `idx` and keep the rest for later calls.
	let retstr = &self.string[..idx];
	self.string = &self.string[idx..];
	Some(retstr)
	}
	}

	impl<'a> DoubleEndedIterator for UWordBounds<'a> {
	/// Splits the next segment off the back of the remaining string and returns it,
	/// or returns `None` once nothing remains.
	///
	/// The start of the segment is found by running the word-boundary state machine
	/// over the remaining chars in reverse, starting from the `Start` state.
	#[inline]
	fn next_back(&mut self) -> Option<&'a str> {
	use self::UWordBoundsState::*;
	use self::FormatExtendType::*;
	use tables::word as wd;
	if self.string.len() == 0 {
	return None;
	}

	// take_curr: whether the char at `idx` belongs to the returned segment.
	let mut take_curr = true;
	// take_cat: whether `cat` should be cached in `self.catb` when the current char
	// is left for the next call.
	let mut take_cat = true;
	// idx: byte index of the char currently being examined; starts at the last char.
	let mut idx = self.string.len();
	idx -= self.string.chars().next_back().unwrap().len_utf8();
	// previdx: byte index of the char examined in the previous loop iteration; the
	// segment starts here when the current char is not taken.
	let mut previdx = idx;
	// saveidx: position to fall back to when a scan has to be undone.
	let mut saveidx = idx;
	let mut state = Start;
	// savestate: state to return to after a run of Format/Extend chars was scanned.
	let mut savestate = Start;
	let mut cat = wd::WC_Any;

	let mut skipped_format_extend = false;

	for (curr, ch) in self.string.char_indices().rev() {
	previdx = idx;
	idx = curr;

	// if there's a category cached, grab it
	cat = match self.catb {
	None => wd::word_category(ch).2,
	_ => self.catb.take().unwrap()
	};
	take_cat = true;

	// backward iterator over word boundaries. Mostly the same as the forward
	// iterator, with two weirdnesses:
	// (1) If we encounter a single quote in the Start state, we have to check for a
	// Hebrew Letter immediately before it.
	// (2) Format and Extend char handling takes some gymnastics.

	if cat == wd::WC_Extend
	\|\| cat == wd::WC_Format
	\|\| (cat == wd::WC_ZWJ && state != Zwj) { // WB3c has more priority so we should not
	// fold in that case
	if match state {
	FormatExtend(_) \| Start => false,
	_ => true
	} {
	saveidx = previdx;
	savestate = state;
	state = FormatExtend(AcceptNone);
	}

	if state != Start {
	continue;
	}
	} else if state == FormatExtend(AcceptNone) {
	// finished a scan of some Format\|Extend chars, restore previous state
	state = savestate;
	previdx = saveidx;
	take_cat = false;
	skipped_format_extend = true;
	}

	// Don't use `continue` in this match without updating `catb`
	state = match state {
	Start \| FormatExtend(AcceptAny) => match cat {
	_ if is_emoji(ch) => Zwj,
	wd::WC_ALetter => Letter, // rule WB5, WB7, WB10, WB13b
	wd::WC_Hebrew_Letter => HLetter, // rule WB5, WB7, WB7c, WB10, WB13b
	wd::WC_Numeric => Numeric, // rule WB8, WB9, WB11, WB13b
	wd::WC_Katakana => Katakana, // rule WB13, WB13b
	wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
	wd::WC_Regional_Indicator => Regional(RegionalState::Unknown), // rule WB13c
	// rule WB4:
	wd::WC_Extend \| wd::WC_Format \| wd::WC_ZWJ => FormatExtend(AcceptAny),
	wd::WC_Single_Quote => {
	saveidx = idx;
	FormatExtend(AcceptQLetter) // rule WB7a
	},
	wd::WC_WSegSpace => WSegSpace,
	wd::WC_CR \| wd::WC_LF \| wd::WC_Newline => {
	if state == Start {
	if cat == wd::WC_LF {
	// An LF directly preceded by CR is returned together with that CR.
	idx -= match self.get_prev_cat(idx) {
	Some(pcat) if pcat == wd::WC_CR => 1, // rule WB3
	_ => 0
	};
	}
	} else {
	take_curr = false;
	}
	break; // rule WB3a
	},
	_ => break // rule WB999
	},
	Zwj => match cat { // rule WB3c
	wd::WC_ZWJ => {
	FormatExtend(AcceptAny)
	}
	_ => {
	take_curr = false;
	break;
	}
	},
	WSegSpace => match cat { // rule WB3d
	wd::WC_WSegSpace if !skipped_format_extend => {
	WSegSpace
	}
	_ => {
	take_curr = false;
	break;
	}
	},
	Letter \| HLetter => match cat {
	wd::WC_ALetter => Letter, // rule WB5
	wd::WC_Hebrew_Letter => HLetter, // rule WB5
	wd::WC_Numeric => Numeric, // rule WB10
	wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
	wd::WC_Double_Quote if state == HLetter => {
	saveidx = previdx;
	FormatExtend(RequireHLetter) // rule WB7c
	},
	wd::WC_MidLetter \| wd::WC_MidNumLet \| wd::WC_Single_Quote => {
	saveidx = previdx;
	FormatExtend(RequireLetter) // rule WB7
	},
	_ => {
	take_curr = false;
	break;
	}
	},
	Numeric => match cat {
	wd::WC_Numeric => Numeric, // rule WB8
	wd::WC_ALetter => Letter, // rule WB9
	wd::WC_Hebrew_Letter => HLetter, // rule WB9
	wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
	wd::WC_MidNum \| wd::WC_MidNumLet \| wd::WC_Single_Quote => {
	saveidx = previdx;
	FormatExtend(RequireNumeric) // rule WB11
	},
	_ => {
	take_curr = false;
	break;
	}
	},
	Katakana => match cat {
	wd::WC_Katakana => Katakana, // rule WB13
	wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
	_ => {
	take_curr = false;
	break;
	}
	},
	ExtendNumLet => match cat {
	wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
	wd::WC_ALetter => Letter, // rule WB13a
	wd::WC_Hebrew_Letter => HLetter, // rule WB13a
	wd::WC_Numeric => Numeric, // rule WB13a
	wd::WC_Katakana => Katakana, // rule WB13a
	_ => {
	take_curr = false;
	break;
	}
	},
	Regional(mut regional_state) => match cat {
	// rule WB13c
	wd::WC_Regional_Indicator => {
	if regional_state == RegionalState::Unknown {
	// Count the run of regional indicators that ends just before `previdx`,
	// ignoring ZWJ/Extend/Format chars. An even count means the indicator
	// already seen does not pair with the current one, so the segment
	// stops before the current char; an odd count means the two pair up.
	let count = self.string[..previdx]
	.chars().rev()
	.map(\|c\| wd::word_category(c).2)
	.filter(\|&c\| ! (c == wd::WC_ZWJ \|\| c == wd::WC_Extend \|\| c == wd::WC_Format))
	.take_while(\|&c\| c == wd::WC_Regional_Indicator)
	.count();
	regional_state = if count % 2 == 0 {
	RegionalState::Full
	} else {
	RegionalState::Half
	};
	}
	if regional_state == RegionalState::Full {
	take_curr = false;
	break;
	} else {
	Regional(RegionalState::Full)
	}
	}
	_ => {
	take_curr = false;
	break;
	}
	},
	Emoji => {
	if is_emoji(ch) { // rule WB3c
	Zwj
	} else {
	take_curr = false;
	break;
	}
	},
	FormatExtend(t) => match t {
	RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB12
	RequireLetter if cat == wd::WC_ALetter => Letter, // rule WB6
	RequireLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB6
	AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7a
	RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7b
	_ => break // backtrack will happen
	}
	}
	}

	if let FormatExtend(t) = state {
	// if we required something but didn't find it, backtrack
	if t == RequireLetter \|\| t == RequireHLetter \|\|
	t == RequireNumeric \|\| t == AcceptNone \|\| t == AcceptQLetter {
	previdx = saveidx;
	take_cat = false;
	take_curr = false;
	}
	}

	// Either start the segment at the current char, or start it at `previdx` and
	// (when allowed) remember the current char's category for the next call.
	self.catb = if take_curr {
	None
	} else {
	idx = previdx;
	if take_cat {
	Some(cat)
	} else {
	None
	}
	};

	// Hand out everything from `idx` on and keep the front for later calls.
	let retstr = &self.string[idx..];
	self.string = &self.string[..idx];
	Some(retstr)
	}
	}

	impl<'a> UWordBounds<'a> {
	    /// Returns the portion of the original string that this iterator has not yielded yet.
	    ///
	    /// ```rust
	    /// # use unicode_segmentation::UnicodeSegmentation;
	    /// let mut iter = "Hello world".split_word_bounds();
	    /// assert_eq!(iter.as_str(), "Hello world");
	    /// iter.next();
	    /// assert_eq!(iter.as_str(), " world");
	    /// iter.next();
	    /// assert_eq!(iter.as_str(), "world");
	    /// ```
	    #[inline]
	    pub fn as_str(&self) -> &'a str {
	        self.string
	    }

	    /// Looks up the word category of the char that follows the one starting at byte
	    /// `idx`, or returns `None` when the char at `idx` is the last one remaining.
	    ///
	    /// Panics if no char starts at `idx`.
	    #[inline]
	    fn get_next_cat(&self, idx: usize) -> Option<WordCat> {
	        use tables::word as wd;

	        let mut tail = self.string[idx..].chars();
	        // Step over the char at `idx` itself; it has to exist.
	        let _current = tail.next().unwrap();
	        tail.next().map(|following| wd::word_category(following).2)
	    }

	    /// Looks up the word category of the char that ends right before byte `idx`,
	    /// or returns `None` when `idx` is the start of the remaining string.
	    #[inline]
	    fn get_prev_cat(&self, idx: usize) -> Option<WordCat> {
	        use tables::word as wd;

	        // An empty prefix (idx == 0) simply has no last char.
	        self.string[..idx]
	            .chars()
	            .next_back()
	            .map(|preceding| wd::word_category(preceding).2)
	    }
	}

	/// Builds a word-bound iterator over all of `s`, with no categories cached yet.
	#[inline]
	pub fn new_word_bounds<'b>(s: &'b str) -> UWordBounds<'b> {
	    UWordBounds {
	        catb: None,
	        cat: None,
	        string: s,
	    }
	}

	/// Builds an iterator over the word-bound segments of `s` paired with their byte offsets.
	#[inline]
	pub fn new_word_bound_indices<'b>(s: &'b str) -> UWordBoundIndices<'b> {
	    // Remember where `s` starts so that offsets can later be derived from the
	    // addresses of the yielded segments.
	    let base_addr = s.as_ptr() as usize;
	    let segments = new_word_bounds(s);
	    UWordBoundIndices {
	        start_offset: base_addr,
	        iter: segments,
	    }
	}

	/// Builds an iterator over the word-bound segments of `s` that contain at least one
	/// char accepted by `is_alphanumeric`.
	#[inline]
	pub fn new_unicode_words<'b>(s: &'b str) -> UnicodeWords<'b> {
	    use super::UnicodeSegmentation;
	    use tables::util::is_alphanumeric;

	    // Predicate for the filter: does the segment contain an alphanumeric char?
	    fn contains_alphanumeric(segment: &&str) -> bool {
	        for c in segment.chars() {
	            if is_alphanumeric(c) {
	                return true;
	            }
	        }
	        false
	    }

	    // The filter is stored as a plain fn pointer, so coerce the fn item explicitly.
	    let keep: fn(&&str) -> bool = contains_alphanumeric;
	    let segments = s.split_word_bounds();
	    UnicodeWords {
	        inner: segments.filter(keep),
	    }
	}