blob: ec2ba33c74a3302b2b80979296f2078bd3b2c4d2 [file] [log] [blame] [edit]
// Copyright 2012-2015 The Rust Project Developers.
// Copyright 2017 The UNIC Project Developers.
//
// See the COPYRIGHT file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
//! Unicode Grapheme Clusters of a string.
//!
//! ## References
//!
//! * <https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries>
use std::cmp;
use unic_ucd_segment::GraphemeClusterBreak as GCB;
/// External iterator for grapheme clusters and byte offsets.
#[derive(Clone, Debug)]
pub struct GraphemeIndices<'a> {
start_offset: usize,
iter: Graphemes<'a>,
}
impl<'a> GraphemeIndices<'a> {
/// Create new iterator for *extended grapheme clusters*.
#[inline]
pub fn new(s: &str) -> GraphemeIndices<'_> {
GraphemeIndices {
start_offset: s.as_ptr() as usize,
iter: Graphemes::new(s),
}
}
/// Create new iterator for *legacy grapheme clusters*.
#[inline]
pub fn new_legacy(s: &str) -> GraphemeIndices<'_> {
GraphemeIndices {
start_offset: s.as_ptr() as usize,
iter: Graphemes::new_legacy(s),
}
}
#[inline]
/// View the underlying data (the part yet to be iterated) as a slice of the original string.
///
/// ```rust
/// # use unic_segment::GraphemeIndices;
/// let mut iter = GraphemeIndices::new("abc");
/// assert_eq!(iter.as_str(), "abc");
/// iter.next();
/// assert_eq!(iter.as_str(), "bc");
/// iter.next();
/// iter.next();
/// assert_eq!(iter.as_str(), "");
/// ```
pub fn as_str(&self) -> &'a str {
self.iter.as_str()
}
}
impl<'a> Iterator for GraphemeIndices<'a> {
type Item = (usize, &'a str);
#[inline]
fn next(&mut self) -> Option<(usize, &'a str)> {
self.iter
.next()
.map(|s| (s.as_ptr() as usize - self.start_offset, s))
}
#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
self.iter.size_hint()
}
}
impl<'a> DoubleEndedIterator for GraphemeIndices<'a> {
#[inline]
fn next_back(&mut self) -> Option<(usize, &'a str)> {
self.iter
.next_back()
.map(|s| (s.as_ptr() as usize - self.start_offset, s))
}
}
/// External iterator for a string's
/// [grapheme clusters](https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries).
#[derive(Clone, Debug)]
pub struct Graphemes<'a> {
string: &'a str,
cursor: GraphemeCursor,
cursor_back: GraphemeCursor,
}
impl<'a> Graphemes<'a> {
/// Create new iterator for *extended grapheme clusters*.
#[inline]
pub fn new(s: &str) -> Graphemes<'_> {
let len = s.len();
Graphemes {
string: s,
cursor: GraphemeCursor::new(0, len),
cursor_back: GraphemeCursor::new(len, len),
}
}
/// Create new iterator for *legacy grapheme clusters*.
#[inline]
pub fn new_legacy(s: &str) -> Graphemes<'_> {
let len = s.len();
Graphemes {
string: s,
cursor: GraphemeCursor::new_legacy(0, len),
cursor_back: GraphemeCursor::new_legacy(len, len),
}
}
#[inline]
/// View the underlying data (the part yet to be iterated) as a slice of the original string.
///
/// ```rust
/// # use unic_segment::Graphemes;
/// let mut iter = Graphemes::new("abc");
/// assert_eq!(iter.as_str(), "abc");
/// iter.next();
/// assert_eq!(iter.as_str(), "bc");
/// iter.next();
/// iter.next();
/// assert_eq!(iter.as_str(), "");
/// ```
pub fn as_str(&self) -> &'a str {
&self.string[self.cursor.cur_cursor()..self.cursor_back.cur_cursor()]
}
}
impl<'a> Iterator for Graphemes<'a> {
type Item = &'a str;
#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
let slen = self.cursor_back.cur_cursor() - self.cursor.cur_cursor();
(cmp::min(slen, 1), Some(slen))
}
#[inline]
fn next(&mut self) -> Option<&'a str> {
let start = self.cursor.cur_cursor();
if start == self.cursor_back.cur_cursor() {
return None;
}
let next = self.cursor.next_boundary(self.string, 0).unwrap().unwrap();
Some(&self.string[start..next])
}
}
impl<'a> DoubleEndedIterator for Graphemes<'a> {
#[inline]
fn next_back(&mut self) -> Option<&'a str> {
let end = self.cursor_back.cur_cursor();
if end == self.cursor.cur_cursor() {
return None;
}
let prev = self
.cursor_back
.prev_boundary(self.string, 0)
.unwrap()
.unwrap();
Some(&self.string[prev..end])
}
}
// maybe unify with PairResult?
// An enum describing information about a potential boundary.
#[derive(Clone, Debug, Eq, PartialEq)]
enum GraphemeState {
// No information is known.
Unknown,
// It is known to not be a boundary.
NotBreak,
// It is known to be a boundary.
Break,
// The codepoint after is a Regional Indicator Symbol, so a boundary iff
// it is preceded by an even number of RIS codepoints. (GB12, GB13)
Regional,
// The codepoint after is in the E_Modifier category, so whether it's a boundary
// depends on pre-context according to GB10.
Emoji,
}
/// Cursor-based segmenter for grapheme clusters.
#[derive(Clone, Debug)]
pub struct GraphemeCursor {
/// Current cursor position.
offset: usize,
/// Total length of the string.
len: usize,
/// A config flag indicating whether this cursor computes legacy or extended grapheme cluster
/// boundaries (enables GB9a and GB9b if set).
is_extended: bool,
/// Information about the potential boundary at `offset`.
state: GraphemeState,
/// Category of codepoint immediately preceding cursor, if known.
cat_before: Option<GCB>,
/// Category of codepoint immediately after cursor, if known.
cat_after: Option<GCB>,
/// If set, at least one more codepoint immediately preceding this offset is needed to resolve
/// whether there's a boundary at `offset`.
pre_context_offset: Option<usize>,
/// The number of RIS codepoints preceding `offset`. If `pre_context_offset` is set, then counts
/// the number of RIS between that and `offset`, otherwise is an accurate count relative to the
/// string.
ris_count: Option<usize>,
/// Set if a call to `prev_boundary` or `next_boundary` was suspended due to needing more input.
resuming: bool,
}
/// An error return indicating that not enough content was available in the
/// provided chunk to satisfy the query, and that more content must be provided.
#[derive(Debug, Eq, PartialEq)]
pub enum GraphemeIncomplete {
/// More pre-context is needed. The caller should call `provide_context`
/// with a chunk ending at the offset given, then retry the query. This
/// will only be returned if the `chunk_start` parameter is nonzero.
PreContext(usize),
/// When requesting `prev_boundary`, the cursor is moving past the beginning
/// of the current chunk, so the chunk before that is requested. This will
/// only be returned if the `chunk_start` parameter is nonzero.
PrevChunk,
/// When requesting `next_boundary`, the cursor is moving past the end of the
/// current chunk, so the chunk after that is requested. This will only be
/// returned if the chunk ends before the `len` parameter provided on
/// creation of the cursor.
NextChunk, // requesting chunk following the one given
/// An error returned when the chunk given does not contain the cursor position.
InvalidOffset,
}
// An enum describing the result from lookup of a pair of categories.
#[derive(Eq, PartialEq)]
enum PairResult {
/// definitely not a break
NotBreak,
/// definitely a break
Break,
/// a break iff not in extended mode
Extended,
/// a break if preceded by an even number of Regional Indicators
Regional,
/// a break if preceded by Emoji Base and (Extend)*
Emoji,
}
fn check_pair(before: GCB, after: GCB) -> PairResult {
use self::PairResult::*;
#[cfg_attr(feature = "cargo-clippy", allow(match_same_arms))]
match (before, after) {
// Do not break between a CR and LF. Otherwise, break before and after controls.
(GCB::CR, GCB::LF) => NotBreak, // GB3
(GCB::Control, _) => Break, // GB4
(GCB::CR, _) => Break, // GB4
(GCB::LF, _) => Break, // GB4
(_, GCB::Control) => Break, // GB5
(_, GCB::CR) => Break, // GB5
(_, GCB::LF) => Break, // GB5
// Do not break Hangul syllable sequences.
(GCB::L, GCB::L) => NotBreak, // GB6
(GCB::L, GCB::V) => NotBreak, // GB6
(GCB::L, GCB::LV) => NotBreak, // GB6
(GCB::L, GCB::LVT) => NotBreak, // GB6
(GCB::LV, GCB::V) => NotBreak, // GB7
(GCB::LV, GCB::T) => NotBreak, // GB7
(GCB::V, GCB::V) => NotBreak, // GB7
(GCB::V, GCB::T) => NotBreak, // GB7
(GCB::LVT, GCB::T) => NotBreak, // GB8
(GCB::T, GCB::T) => NotBreak, // GB8
// Do not break before extending characters or ZWJ.
(_, GCB::Extend) => NotBreak, // GB9
(_, GCB::ZWJ) => NotBreak, // GB9
// Only for extended grapheme clusters:
// Do not break before SpacingMarks, or after Prepend characters.
(_, GCB::SpacingMark) => Extended, // GB9a
(GCB::Prepend, _) => Extended, // GB9b
// Do not break within Emoji Modifier Sequences or Emoji ZWJ Sequences.
(GCB::EBase, GCB::EModifier) => NotBreak, // GB10
(GCB::EBaseGAZ, GCB::EModifier) => NotBreak, // GB10
(GCB::Extend, GCB::EModifier) => Emoji, // GB10
(GCB::ZWJ, GCB::GlueAfterZwj) => NotBreak, // GB11
(GCB::ZWJ, GCB::EBaseGAZ) => NotBreak, // GB11
// Do not break within emoji flag sequences. That is, do not break between regional
// indicator (RI) symbols if there is an odd number of RI characters before the break point.
(GCB::RegionalIndicator, GCB::RegionalIndicator) => Regional, // GB12, GB13
// Otherwise, break everywhere.
(_, _) => Break, // GB999
}
}
impl GraphemeCursor {
/// Create a new cursor. The string and initial offset are given at creation
/// time, but the contents of the string are not.
///
/// The `offset` parameter must be on a codepoint boundary.
///
/// ```rust
/// # use unic_segment::GraphemeCursor;
/// let s = "हिन्दी";
/// let mut extended = GraphemeCursor::new(0, s.len());
/// assert_eq!(extended.next_boundary(s, 0), Ok(Some("हि".len())));
/// ```
pub fn new(offset: usize, len: usize) -> GraphemeCursor {
let state = if offset == 0 || offset == len {
GraphemeState::Break
} else {
GraphemeState::Unknown
};
GraphemeCursor {
offset,
len,
state,
is_extended: true,
cat_before: None,
cat_after: None,
pre_context_offset: None,
ris_count: None,
resuming: false,
}
}
/// Create a new cursor. The string and initial offset are given at creation
/// time, but the contents of the string are not.
///
/// The `offset` parameter must be on a codepoint boundary.
///
/// ```rust
/// # use unic_segment::GraphemeCursor;
/// let s = "हिन्दी";
/// let mut legacy = GraphemeCursor::new_legacy(0, s.len());
/// assert_eq!(legacy.next_boundary(s, 0), Ok(Some("ह".len())));
/// ```
pub fn new_legacy(offset: usize, len: usize) -> GraphemeCursor {
let state = if offset == 0 || offset == len {
GraphemeState::Break
} else {
GraphemeState::Unknown
};
GraphemeCursor {
offset,
len,
state,
is_extended: false,
cat_before: None,
cat_after: None,
pre_context_offset: None,
ris_count: None,
resuming: false,
}
}
// FIXME: Not sure I'm gonna keep this, the advantage over new() seems thin.
/// Set the cursor to a new location in the same string.
///
/// ```rust
/// # use unic_segment::GraphemeCursor;
/// let s = "abcd";
/// let mut cursor = GraphemeCursor::new(0, s.len());
/// assert_eq!(cursor.cur_cursor(), 0);
/// cursor.set_cursor(2);
/// assert_eq!(cursor.cur_cursor(), 2);
/// ```
pub fn set_cursor(&mut self, offset: usize) {
if offset != self.offset {
self.offset = offset;
self.state = if offset == 0 || offset == self.len {
GraphemeState::Break
} else {
GraphemeState::Unknown
};
// reset state derived from text around cursor
self.cat_before = None;
self.cat_after = None;
self.ris_count = None;
}
}
/// The current offset of the cursor. Equal to the last value provided to
/// `new()` or `set_cursor()`, or returned from `next_boundary()` or
/// `prev_boundary()`.
///
/// ```rust
/// # use unic_segment::GraphemeCursor;
/// // Two flags (🇷🇸🇮🇴), each flag is two RIS codepoints, each RIS is 4 bytes.
/// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
/// let mut cursor = GraphemeCursor::new(4, flags.len());
/// assert_eq!(cursor.cur_cursor(), 4);
/// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(8)));
/// assert_eq!(cursor.cur_cursor(), 8);
/// ```
pub fn cur_cursor(&self) -> usize {
self.offset
}
/// Provide additional pre-context when it is needed to decide a boundary.
/// The end of the chunk must coincide with the value given in the
/// `GraphemeIncomplete::PreContext` request.
///
/// ```rust
/// # use unic_segment::{GraphemeCursor, GraphemeIncomplete};
/// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
/// let mut cursor = GraphemeCursor::new(8, flags.len());
///
/// // Not enough pre-context to decide if there's a boundary between the two flags.
/// assert_eq!(cursor.is_boundary(&flags[8..], 8), Err(GraphemeIncomplete::PreContext(8)));
///
/// // Provide one more Regional Indicator Symbol of pre-context
/// cursor.provide_context(&flags[4..8], 4);
///
/// // Still not enough context to decide.
/// assert_eq!(cursor.is_boundary(&flags[8..], 8), Err(GraphemeIncomplete::PreContext(4)));
///
/// // Provide additional requested context.
/// cursor.provide_context(&flags[0..4], 0);
///
/// // That's enough to decide (it always is when context goes to the start of the string)
/// assert_eq!(cursor.is_boundary(&flags[8..], 8), Ok(true));
/// ```
pub fn provide_context(&mut self, chunk: &str, chunk_start: usize) {
assert!(chunk_start + chunk.len() == self.pre_context_offset.unwrap());
self.pre_context_offset = None;
if self.is_extended && chunk_start + chunk.len() == self.offset {
let ch = chunk.chars().rev().next().unwrap();
if GCB::of(ch) == GCB::Prepend {
self.decide(false); // GB9b
return;
}
}
match self.state {
GraphemeState::Regional => self.handle_regional(chunk, chunk_start),
GraphemeState::Emoji => self.handle_emoji(chunk, chunk_start),
_ => panic!("invalid state"),
}
}
fn decide(&mut self, is_break: bool) {
self.state = if is_break {
GraphemeState::Break
} else {
GraphemeState::NotBreak
};
}
fn decision(&mut self, is_break: bool) -> Result<bool, GraphemeIncomplete> {
self.decide(is_break);
Ok(is_break)
}
fn is_boundary_result(&self) -> Result<bool, GraphemeIncomplete> {
if self.state == GraphemeState::Break {
Ok(true)
} else if self.state == GraphemeState::NotBreak {
Ok(false)
} else if let Some(pre_context_offset) = self.pre_context_offset {
Err(GraphemeIncomplete::PreContext(pre_context_offset))
} else {
unreachable!("inconsistent state");
}
}
fn handle_regional(&mut self, chunk: &str, chunk_start: usize) {
let mut ris_count = self.ris_count.unwrap_or(0);
for ch in chunk.chars().rev() {
if GCB::of(ch) != GCB::RegionalIndicator {
self.ris_count = Some(ris_count);
self.decide((ris_count % 2) == 0);
return;
}
ris_count += 1;
}
self.ris_count = Some(ris_count);
if chunk_start == 0 {
self.decide((ris_count % 2) == 0);
return;
}
self.pre_context_offset = Some(chunk_start);
}
fn handle_emoji(&mut self, chunk: &str, chunk_start: usize) {
for ch in chunk.chars().rev() {
match GCB::of(ch) {
GCB::Extend => (),
GCB::EBase | GCB::EBaseGAZ => {
self.decide(false);
return;
}
_ => {
self.decide(true);
return;
}
}
}
if chunk_start == 0 {
self.decide(true);
return;
}
self.pre_context_offset = Some(chunk_start);
}
// TODO(clippy): Fix clippy warning or leave it as allowed if really needed.
// `warning: methods called `is_*` usually take self by reference or no self; consider choosing
// a less ambiguous name`
#[cfg_attr(feature = "cargo-clippy", allow(wrong_self_convention))]
/// Determine whether the current cursor location is a grapheme cluster boundary.
/// Only a part of the string need be supplied. If `chunk_start` is nonzero or
/// the length of `chunk` is not equal to `len` on creation, then this method
/// may return `GraphemeIncomplete::PreContext`. The caller should then
/// call `provide_context` with the requested chunk, then retry calling this
/// method.
///
/// For partial chunks, if the cursor is not at the beginning or end of the
/// string, the chunk should contain at least the codepoint following the cursor.
/// If the string is nonempty, the chunk must be nonempty.
///
/// All calls should have consistent chunk contents (ie, if a chunk provides
/// content for a given slice, all further chunks covering that slice must have
/// the same content for it).
///
/// ```rust
/// # use unic_segment::GraphemeCursor;
/// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
/// let mut cursor = GraphemeCursor::new(8, flags.len());
/// assert_eq!(cursor.is_boundary(flags, 0), Ok(true));
/// cursor.set_cursor(12);
/// assert_eq!(cursor.is_boundary(flags, 0), Ok(false));
/// ```
pub fn is_boundary(
&mut self,
chunk: &str,
chunk_start: usize,
) -> Result<bool, GraphemeIncomplete> {
if self.state == GraphemeState::Break {
return Ok(true);
}
if self.state == GraphemeState::NotBreak {
return Ok(false);
}
if (self.offset < chunk_start || self.offset >= chunk_start + chunk.len())
&& (self.offset > chunk_start + chunk.len() || self.cat_after.is_none())
{
return Err(GraphemeIncomplete::InvalidOffset);
}
if let Some(pre_context_offset) = self.pre_context_offset {
return Err(GraphemeIncomplete::PreContext(pre_context_offset));
}
let offset_in_chunk = self.offset - chunk_start;
if self.cat_after.is_none() {
let ch = chunk[offset_in_chunk..].chars().next().unwrap();
self.cat_after = Some(GCB::of(ch));
}
if self.offset == chunk_start {
let mut need_pre_context = true;
match self.cat_after.unwrap() {
GCB::RegionalIndicator => self.state = GraphemeState::Regional,
GCB::EModifier => self.state = GraphemeState::Emoji,
_ => need_pre_context = self.cat_before.is_none(),
}
if need_pre_context {
self.pre_context_offset = Some(chunk_start);
return Err(GraphemeIncomplete::PreContext(chunk_start));
}
}
if self.cat_before.is_none() {
let ch = chunk[..offset_in_chunk].chars().rev().next().unwrap();
self.cat_before = Some(GCB::of(ch));
}
match check_pair(self.cat_before.unwrap(), self.cat_after.unwrap()) {
PairResult::NotBreak => self.decision(false),
PairResult::Break => self.decision(true),
PairResult::Extended => {
let is_extended = self.is_extended;
self.decision(!is_extended)
}
PairResult::Regional => {
if let Some(ris_count) = self.ris_count {
return self.decision((ris_count % 2) == 0);
}
self.handle_regional(&chunk[..offset_in_chunk], chunk_start);
self.is_boundary_result()
}
PairResult::Emoji => {
self.handle_emoji(&chunk[..offset_in_chunk], chunk_start);
self.is_boundary_result()
}
}
}
/// Find the next boundary after the current cursor position. Only a part of
/// the string need be supplied. If the chunk is incomplete, then this
/// method might return `GraphemeIncomplete::PreContext` or
/// `GraphemeIncomplete::NextChunk`. In the former case, the caller should
/// call `provide_context` with the requested chunk, then retry. In the
/// latter case, the caller should provide the chunk following the one
/// given, then retry.
///
/// See `is_boundary` for expectations on the provided chunk.
///
/// ```rust
/// # use unic_segment::GraphemeCursor;
/// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
/// let mut cursor = GraphemeCursor::new(4, flags.len());
/// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(8)));
/// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(16)));
/// assert_eq!(cursor.next_boundary(flags, 0), Ok(None));
/// ```
///
/// And an example that uses partial strings:
///
/// ```rust
/// # use unic_segment::{GraphemeCursor, GraphemeIncomplete};
/// let s = "abcd";
/// let mut cursor = GraphemeCursor::new(0, s.len());
/// assert_eq!(cursor.next_boundary(&s[..2], 0), Ok(Some(1)));
/// assert_eq!(cursor.next_boundary(&s[..2], 0), Err(GraphemeIncomplete::NextChunk));
/// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(2)));
/// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(3)));
/// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(4)));
/// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(None));
/// ```
pub fn next_boundary(
&mut self,
chunk: &str,
chunk_start: usize,
) -> Result<Option<usize>, GraphemeIncomplete> {
if self.offset == self.len {
return Ok(None);
}
let mut iter = chunk[self.offset - chunk_start..].chars();
let mut ch = iter.next().unwrap();
loop {
if self.resuming {
if self.cat_after.is_none() {
self.cat_after = Some(GCB::of(ch));
}
} else {
self.offset += ch.len_utf8();
self.state = GraphemeState::Unknown;
self.cat_before = self.cat_after.take();
if self.cat_before.is_none() {
self.cat_before = Some(GCB::of(ch));
}
if self.cat_before == Some(GCB::RegionalIndicator) {
self.ris_count = self.ris_count.map(|c| c + 1);
} else {
self.ris_count = Some(0);
}
if let Some(next_ch) = iter.next() {
ch = next_ch;
self.cat_after = Some(GCB::of(ch));
} else if self.offset == self.len {
self.decide(true);
} else {
self.resuming = true;
return Err(GraphemeIncomplete::NextChunk);
}
}
self.resuming = true;
if self.is_boundary(chunk, chunk_start)? {
self.resuming = false;
return Ok(Some(self.offset));
}
self.resuming = false;
}
}
/// Find the previous boundary after the current cursor position. Only a part
/// of the string need be supplied. If the chunk is incomplete, then this
/// method might return `GraphemeIncomplete::PreContext` or
/// `GraphemeIncomplete::PrevChunk`. In the former case, the caller should
/// call `provide_context` with the requested chunk, then retry. In the
/// latter case, the caller should provide the chunk preceding the one
/// given, then retry.
///
/// See `is_boundary` for expectations on the provided chunk.
///
/// ```rust
/// # use unic_segment::GraphemeCursor;
/// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
/// let mut cursor = GraphemeCursor::new(12, flags.len());
/// assert_eq!(cursor.prev_boundary(flags, 0), Ok(Some(8)));
/// assert_eq!(cursor.prev_boundary(flags, 0), Ok(Some(0)));
/// assert_eq!(cursor.prev_boundary(flags, 0), Ok(None));
/// ```
///
/// And an example that uses partial strings (note the exact return is not
/// guaranteed, and may be `PrevChunk` or `PreContext` arbitrarily):
///
/// ```rust
/// # use unic_segment::{GraphemeCursor, GraphemeIncomplete};
/// let s = "abcd";
/// let mut cursor = GraphemeCursor::new(4, s.len());
/// assert_eq!(cursor.prev_boundary(&s[2..4], 2), Ok(Some(3)));
/// assert_eq!(cursor.prev_boundary(&s[2..4], 2), Err(GraphemeIncomplete::PrevChunk));
/// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(2)));
/// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(1)));
/// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(0)));
/// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(None));
/// ```
pub fn prev_boundary(
&mut self,
chunk: &str,
chunk_start: usize,
) -> Result<Option<usize>, GraphemeIncomplete> {
if self.offset == 0 {
return Ok(None);
}
let mut iter = chunk[..self.offset - chunk_start].chars().rev();
let mut ch = iter.next().unwrap();
loop {
if self.offset == chunk_start {
self.resuming = true;
return Err(GraphemeIncomplete::PrevChunk);
}
if self.resuming {
self.cat_before = Some(GCB::of(ch));
} else {
self.offset -= ch.len_utf8();
self.cat_after = self.cat_before.take();
self.state = GraphemeState::Unknown;
if let Some(ris_count) = self.ris_count {
self.ris_count = if ris_count > 0 {
Some(ris_count - 1)
} else {
None
};
}
if let Some(prev_ch) = iter.next() {
ch = prev_ch;
self.cat_before = Some(GCB::of(ch));
} else if self.offset == 0 {
self.decide(true);
} else {
self.resuming = true;
return Err(GraphemeIncomplete::PrevChunk);
}
}
self.resuming = true;
if self.is_boundary(chunk, chunk_start)? {
self.resuming = false;
return Ok(Some(self.offset));
}
self.resuming = false;
}
}
}
#[cfg(test)]
mod tests {
use super::{GraphemeIndices, Graphemes};
#[test]
fn test_grapheme_indices() {
let input = "a̐éö̲\r\n";
let grapheme_indices = GraphemeIndices::new(input).collect::<Vec<(usize, &str)>>();
assert_eq!(
grapheme_indices,
&[(0, "a̐"), (3, "é"), (6, "ö̲"), (11, "\r\n")]
);
let grapheme_indices = GraphemeIndices::new(input)
.rev()
.collect::<Vec<(usize, &str)>>();
assert_eq!(
grapheme_indices,
&[(11, "\r\n"), (6, "ö̲"), (3, "é"), (0, "a̐")]
);
let mut grapheme_indices_iter = GraphemeIndices::new(input);
{
let grapheme_indices = grapheme_indices_iter.by_ref();
let e1 = grapheme_indices.size_hint();
assert_eq!(e1, (1, Some(13)));
let c = grapheme_indices.count();
assert_eq!(c, 4);
}
assert_eq!(grapheme_indices_iter.size_hint(), (0, Some(0)));
}
#[test]
fn test_graphemes() {
let input = "a̐éö̲\r\n";
let graphemes = Graphemes::new(input).collect::<Vec<&str>>();
assert_eq!(graphemes, &["a̐", "é", "ö̲", "\r\n"]);
// Make sure the reverse iterator does the right thing with "\n" at beginning of string.
let input = "\n\r\n\r";
let graphemes = Graphemes::new(input).rev().collect::<Vec<&str>>();
assert_eq!(graphemes, &["\r", "\r\n", "\n"]);
}
}