| /* GENERATED SOURCE. DO NOT MODIFY. */ |
| // © 2016 and later: Unicode, Inc. and others. |
| // License & terms of use: http://www.unicode.org/copyright.html |
| /* |
| ******************************************************************************* |
| * Copyright (C) 1996-2016, International Business Machines Corporation and |
| * others. All Rights Reserved. |
| ******************************************************************************* |
| */ |
| package android.icu.text; |
| |
| import java.text.CharacterIterator; |
| import java.text.StringCharacterIterator; |
| import java.util.Locale; |
| |
| import android.icu.util.ICUException; |
| import android.icu.util.ULocale; |
| |
| // Java porting note: |
| // |
| // The ICU4C implementation contains dead code in many places. |
| // While porting the ICU4C linear search implementation, this dead code |
| // was not fully ported. The code blocks tagged by "// *** Boyer-Moore ***" |
| // are those dead code blocks, still available in ICU4C. |
| |
| // The ICU4C implementation does not seem to handle UCharacterIterator pointing |
| // to a fragment of text properly. ICU4J uses CharacterIterator to navigate through |
| // the input text. We need to carefully review the code ported from ICU4C |
| // assuming the start index is 0. |
| |
| // ICU4C implementation initializes pattern.CE and pattern.PCE. It looks like |
| // CE is no longer used, except in a few places checking CELength. It looks like this |
| // is a leftover from already-disabled Boyer-Moore search code. This Java implementation |
| // preserves the code, but we should clean this up later. |
| |
| /** |
| * |
| * <tt>StringSearch</tt> is a {@link SearchIterator} that provides |
| * language-sensitive text searching based on the comparison rules defined |
| * in a {@link RuleBasedCollator} object. |
| * StringSearch ensures that language eccentricity can be |
| * handled, e.g. for the German collator, characters ß and SS will be matched |
| * if case is chosen to be ignored. |
| * See the <a href="https://htmlpreview.github.io/?https://github.com/unicode-org/icu-docs/blob/main/design/collation/ICU_collation_design.htm"> |
| * "ICU Collation Design Document"</a> for more information. |
| * <p> |
| * There are 2 match options for selection:<br> |
| * Let S' be the sub-string of a text string S between the offsets start and |
| * end [start, end]. |
| * <br> |
| * A pattern string P matches a text string S at the offsets [start, end] |
| * if |
| * <pre> |
| * option 1. Some canonical equivalent of P matches some canonical equivalent |
| * of S' |
| * option 2. P matches S' and if P starts or ends with a combining mark, |
| * there exists no non-ignorable combining mark before or after S? |
| * in S respectively. |
| * </pre> |
| * Option 2. is the default. |
| * <p> |
| * This search has APIs similar to that of other text iteration mechanisms |
| * such as the break iterators in {@link BreakIterator}. Using these |
| * APIs, it is easy to scan through text looking for all occurrences of |
| * a given pattern. This search iterator allows changing of direction by |
| * calling a {@link #reset} followed by a {@link #next} or {@link #previous}. |
| * Though a direction change can occur without calling {@link #reset} first, |
| * this operation comes with some speed penalty. |
| * Match results in the forward direction will match the result matches in |
| * the backwards direction in the reverse order |
| * <p> |
| * {@link SearchIterator} provides APIs to specify the starting position |
| * within the text string to be searched, e.g. {@link SearchIterator#setIndex setIndex}, |
| * {@link SearchIterator#preceding preceding} and {@link SearchIterator#following following}. |
| * Since the starting position will be set as it is specified, please take note that |
| * there are some danger points at which the search may render incorrect |
| * results: |
| * <ul> |
| * <li> In the midst of a substring that requires normalization. |
| * <li> If the following match is to be found, the position should not be the |
| * second character which requires swapping with the preceding |
| * character. Vice versa, if the preceding match is to be found, the |
| * position to search from should not be the first character which |
| * requires swapping with the next character. E.g certain Thai and |
| * Lao characters require swapping. |
| * <li> If a following pattern match is to be found, any position within a |
| * contracting sequence except the first will fail. Vice versa if a |
| * preceding pattern match is to be found, an invalid starting point |
| * would be any character within a contracting sequence except the last. |
| * </ul> |
| * <p> |
| * A {@link BreakIterator} can be used if only matches at logical breaks are desired. |
| * Using a {@link BreakIterator} will only give you results that exactly matches the |
| * boundaries given by the {@link BreakIterator}. For instance the pattern "e" will |
| * not be found in the string "\u00e9" if a character break iterator is used. |
| * <p> |
| * Options are provided to handle overlapping matches. |
| * E.g. In English, overlapping matches produces the result 0 and 2 |
| * for the pattern "abab" in the text "ababab", where mutually |
| * exclusive matches only produces the result of 0. |
| * <p> |
| * Options are also provided to implement "asymmetric search" as described in |
| * <a href="http://www.unicode.org/reports/tr10/#Asymmetric_Search"> |
| * UTS #10 Unicode Collation Algorithm</a>, specifically the ElementComparisonType |
| * values. |
| * <p> |
| * Though collator attributes will be taken into consideration while |
| * performing matches, there are no APIs here for setting and getting the |
| * attributes. These attributes can be set by getting the collator |
| * from {@link #getCollator} and using the APIs in {@link RuleBasedCollator}. |
| * Lastly to update <tt>StringSearch</tt> to the new collator attributes, |
| * {@link #reset} has to be called. |
| * <p> |
| * Restriction: <br> |
| * Currently there are no composite characters that consists of a |
| * character with combining class > 0 before a character with combining |
| * class == 0. However, if such a character exists in the future, |
| * <tt>StringSearch</tt> does not guarantee the results for option 1. |
| * <p> |
| * Consult the {@link SearchIterator} documentation for information on |
| * and examples of how to use instances of this class to implement text |
| * searching. |
| * <p> |
| * Note, <tt>StringSearch</tt> is not to be subclassed. |
| * </p> |
| * @see SearchIterator |
| * @see RuleBasedCollator |
| * @author Laura Werner, synwee |
| */ |
| // internal notes: all methods do not guarantee the correct status of the |
| // characteriterator. the caller has to maintain the original index position |
| // if necessary. methods could change the index position as it deems fit |
| public final class StringSearch extends SearchIterator { |
| |
| private Pattern pattern_; |
| private RuleBasedCollator collator_; |
| |
| // positions within the collation element iterator is used to determine |
| // if we are at the start of the text. |
| private CollationElementIterator textIter_; |
| private CollationPCE textProcessedIter_; |
| |
| // utility collation element, used throughout program for temporary |
| // iteration. |
| private CollationElementIterator utilIter_; |
| |
| private Normalizer2 nfd_; |
| |
| private int strength_; |
| int ceMask_; |
| int variableTop_; |
| |
| private boolean toShift_; |
| |
| // *** Boyer-Moore *** |
| // private char[] canonicalPrefixAccents_; |
| // private char[] canonicalSuffixAccents_; |
| |
| /** |
| * Initializes the iterator to use the language-specific rules defined in |
| * the argument collator to search for argument pattern in the argument |
| * target text. The argument <code>breakiter</code> is used to define logical matches. |
| * See super class documentation for more details on the use of the target |
| * text and {@link BreakIterator}. |
| * @param pattern text to look for. |
| * @param target target text to search for pattern. |
| * @param collator {@link RuleBasedCollator} that defines the language rules |
| * @param breakiter A {@link BreakIterator} that is used to determine the |
| * boundaries of a logical match. This argument can be null. |
| * @throws IllegalArgumentException thrown when argument target is null, |
| * or of length 0 |
| * @see BreakIterator |
| * @see RuleBasedCollator |
| */ |
| public StringSearch(String pattern, CharacterIterator target, RuleBasedCollator collator, |
| BreakIterator breakiter) { |
| |
| // This implementation is ported from ICU4C usearch_open() |
| |
| super(target, breakiter); |
| |
| // string search does not really work when numeric collation is turned on |
| if (collator.getNumericCollation()) { |
| throw new UnsupportedOperationException("Numeric collation is not supported by StringSearch"); |
| } |
| |
| collator_ = collator; |
| strength_ = collator.getStrength(); |
| ceMask_ = getMask(strength_); |
| toShift_ = collator.isAlternateHandlingShifted(); |
| variableTop_ = collator.getVariableTop(); |
| |
| nfd_ = Normalizer2.getNFDInstance(); |
| |
| pattern_ = new Pattern(pattern); |
| |
| search_.setMatchedLength(0); |
| search_.matchedIndex_ = DONE; |
| |
| utilIter_ = null; |
| textIter_ = new CollationElementIterator(target, collator); |
| |
| textProcessedIter_ = null; |
| |
| // This is done by super class constructor |
| /* |
| search_.isOverlap_ = false; |
| search_.isCanonicalMatch_ = false; |
| search_.elementComparisonType_ = ElementComparisonType.STANDARD_ELEMENT_COMPARISON; |
| search_.isForwardSearching_ = true; |
| search_.reset_ = true; |
| */ |
| ULocale collLocale = collator.getLocale(ULocale.VALID_LOCALE); |
| search_.internalBreakIter_ = BreakIterator.getCharacterInstance(collLocale == null ? ULocale.ROOT : collLocale); |
| search_.internalBreakIter_.setText((CharacterIterator)target.clone()); // We need to create a clone |
| |
| initialize(); |
| } |
| |
| /** |
| * Initializes the iterator to use the language-specific rules defined in |
| * the argument collator to search for argument pattern in the argument |
| * target text. No {@link BreakIterator}s are set to test for logical matches. |
| * @param pattern text to look for. |
| * @param target target text to search for pattern. |
| * @param collator {@link RuleBasedCollator} that defines the language rules |
| * @throws IllegalArgumentException thrown when argument target is null, |
| * or of length 0 |
| * @see RuleBasedCollator |
| */ |
| public StringSearch(String pattern, CharacterIterator target, RuleBasedCollator collator) { |
| this(pattern, target, collator, null); |
| } |
| |
| /** |
| * Initializes the iterator to use the language-specific rules and |
| * break iterator rules defined in the argument locale to search for |
| * argument pattern in the argument target text. |
| * @param pattern text to look for. |
| * @param target target text to search for pattern. |
| * @param locale locale to use for language and break iterator rules |
| * @throws IllegalArgumentException thrown when argument target is null, |
| * or of length 0. ClassCastException thrown if the collator for |
| * the specified locale is not a RuleBasedCollator. |
| */ |
| public StringSearch(String pattern, CharacterIterator target, Locale locale) { |
| this(pattern, target, ULocale.forLocale(locale)); |
| } |
| |
| /** |
| * Initializes the iterator to use the language-specific rules and |
| * break iterator rules defined in the argument locale to search for |
| * argument pattern in the argument target text. |
| * See super class documentation for more details on the use of the target |
| * text and {@link BreakIterator}. |
| * @param pattern text to look for. |
| * @param target target text to search for pattern. |
| * @param locale locale to use for language and break iterator rules |
| * @throws IllegalArgumentException thrown when argument target is null, |
| * or of length 0. ClassCastException thrown if the collator for |
| * the specified locale is not a RuleBasedCollator. |
| * @see BreakIterator |
| * @see RuleBasedCollator |
| * @see SearchIterator |
| */ |
| public StringSearch(String pattern, CharacterIterator target, ULocale locale) { |
| this(pattern, target, (RuleBasedCollator) Collator.getInstance(locale), null); |
| } |
| |
| /** |
| * Initializes the iterator to use the language-specific rules and |
| * break iterator rules defined in the default locale to search for |
| * argument pattern in the argument target text. |
| * @param pattern text to look for. |
| * @param target target text to search for pattern. |
| * @throws IllegalArgumentException thrown when argument target is null, |
| * or of length 0. ClassCastException thrown if the collator for |
| * the default locale is not a RuleBasedCollator. |
| */ |
| public StringSearch(String pattern, String target) { |
| this(pattern, new StringCharacterIterator(target), |
| (RuleBasedCollator) Collator.getInstance(), null); |
| } |
| |
| /** |
| * Gets the {@link RuleBasedCollator} used for the language rules. |
| * <p> |
| * Since <tt>StringSearch</tt> depends on the returned {@link RuleBasedCollator}, any |
| * changes to the {@link RuleBasedCollator} result should follow with a call to |
| * either {@link #reset()} or {@link #setCollator(RuleBasedCollator)} to ensure the correct |
| * search behavior. |
| * </p> |
| * @return {@link RuleBasedCollator} used by this <tt>StringSearch</tt> |
| * @see RuleBasedCollator |
| * @see #setCollator |
| */ |
| public RuleBasedCollator getCollator() { |
| return collator_; |
| } |
| |
| /** |
| * Sets the {@link RuleBasedCollator} to be used for language-specific searching. |
| * <p> |
| * The iterator's position will not be changed by this method. |
| * @param collator to use for this <tt>StringSearch</tt> |
| * @throws IllegalArgumentException thrown when collator is null |
| * @see #getCollator |
| */ |
| public void setCollator(RuleBasedCollator collator) { |
| if (collator == null) { |
| throw new IllegalArgumentException("Collator can not be null"); |
| } |
| collator_ = collator; |
| ceMask_ = getMask(collator_.getStrength()); |
| |
| ULocale collLocale = collator.getLocale(ULocale.VALID_LOCALE); |
| search_.internalBreakIter_ = BreakIterator.getCharacterInstance(collLocale == null ? ULocale.ROOT : collLocale); |
| search_.internalBreakIter_.setText((CharacterIterator)search_.text().clone()); // We need to create a clone |
| |
| toShift_ = collator.isAlternateHandlingShifted(); |
| variableTop_ = collator.getVariableTop(); |
| textIter_ = new CollationElementIterator(pattern_.text_, collator); |
| utilIter_ = new CollationElementIterator(pattern_.text_, collator); |
| |
| // initialize() _after_ setting the iterators for the new collator. |
| initialize(); |
| } |
| |
| /** |
| * Returns the pattern for which <tt>StringSearch</tt> is searching for. |
| * @return the pattern searched for |
| */ |
| public String getPattern() { |
| return pattern_.text_; |
| } |
| |
| /** |
| * Set the pattern to search for. |
| * The iterator's position will not be changed by this method. |
| * @param pattern for searching |
| * @see #getPattern |
| * @exception IllegalArgumentException thrown if pattern is null or of |
| * length 0 |
| */ |
| public void setPattern(String pattern) { |
| if (pattern == null || pattern.length() <= 0) { |
| throw new IllegalArgumentException( |
| "Pattern to search for can not be null or of length 0"); |
| } |
| pattern_.text_ = pattern; |
| initialize(); |
| } |
| |
| /** |
| * Determines whether canonical matches (option 1, as described in the |
| * class documentation) is set. |
| * See setCanonical(boolean) for more information. |
| * @see #setCanonical |
| * @return true if canonical matches is set, false otherwise |
| */ |
| //TODO: hoist this to SearchIterator |
| public boolean isCanonical() { |
| return search_.isCanonicalMatch_; |
| } |
| |
| /** |
| * Set the canonical match mode. See class documentation for details. |
| * The default setting for this property is false. |
| * @param allowCanonical flag indicator if canonical matches are allowed |
| * @see #isCanonical |
| */ |
| //TODO: hoist this to SearchIterator |
| public void setCanonical(boolean allowCanonical) { |
| search_.isCanonicalMatch_ = allowCanonical; |
| } |
| |
| /** |
| * {@inheritDoc} |
| */ |
| @Override |
| public void setTarget(CharacterIterator text) { |
| super.setTarget(text); |
| textIter_.setText(text); |
| } |
| |
| /** |
| * {@inheritDoc} |
| */ |
| @Override |
| public int getIndex() { |
| int result = textIter_.getOffset(); |
| if (isOutOfBounds(search_.beginIndex(), search_.endIndex(), result)) { |
| return DONE; |
| } |
| return result; |
| } |
| |
| /** |
| * {@inheritDoc} |
| */ |
| @Override |
| public void setIndex(int position) { |
| // Java porting note: This method is equivalent to setOffset() in ICU4C. |
| // ICU4C SearchIterator::setOffset() is a pure virtual method, while |
| // ICU4J SearchIterator.setIndex() is not abstract method. |
| |
| super.setIndex(position); |
| textIter_.setOffset(position); |
| } |
| |
| /** |
| * {@inheritDoc} |
| */ |
| @Override |
| public void reset() { |
| // reset is setting the attributes that are already in |
| // string search, hence all attributes in the collator should |
| // be retrieved without any problems |
| |
| boolean sameCollAttribute = true; |
| int ceMask; |
| boolean shift; |
| int varTop; |
| |
| // **** hack to deal w/ how processed CEs encode quaternary **** |
| int newStrength = collator_.getStrength(); |
| if ((strength_ < Collator.QUATERNARY && newStrength >= Collator.QUATERNARY) |
| || (strength_ >= Collator.QUATERNARY && newStrength < Collator.QUATERNARY)) { |
| sameCollAttribute = false; |
| } |
| |
| strength_ = collator_.getStrength(); |
| ceMask = getMask(strength_); |
| if (ceMask_ != ceMask) { |
| ceMask_ = ceMask; |
| sameCollAttribute = false; |
| } |
| |
| shift = collator_.isAlternateHandlingShifted(); |
| if (toShift_ != shift) { |
| toShift_ = shift; |
| sameCollAttribute = false; |
| } |
| |
| varTop = collator_.getVariableTop(); |
| if (variableTop_ != varTop) { |
| variableTop_ = varTop; |
| sameCollAttribute = false; |
| } |
| |
| if (!sameCollAttribute) { |
| initialize(); |
| } |
| |
| textIter_.setText(search_.text()); |
| |
| search_.setMatchedLength(0); |
| search_.matchedIndex_ = DONE; |
| search_.isOverlap_ = false; |
| search_.isCanonicalMatch_ = false; |
| search_.elementComparisonType_ = ElementComparisonType.STANDARD_ELEMENT_COMPARISON; |
| search_.isForwardSearching_ = true; |
| search_.reset_ = true; |
| } |
| |
| /** |
| * {@inheritDoc} |
| */ |
| @Override |
| protected int handleNext(int position) { |
| if (pattern_.CELength_ == 0) { |
| search_.matchedIndex_ = search_.matchedIndex_ == DONE ? |
| getIndex() : search_.matchedIndex_ + 1; |
| search_.setMatchedLength(0); |
| textIter_.setOffset(search_.matchedIndex_); |
| if (search_.matchedIndex_ == search_.endIndex()) { |
| search_.matchedIndex_ = DONE; |
| } |
| } else { |
| if (search_.matchedLength() <= 0) { |
| // the flipping direction issue has already been handled |
| // in next() |
| // for boundary check purposes. this will ensure that the |
| // next match will not preceed the current offset |
| // note search_.matchedIndex_ will always be set to something |
| // in the code |
| search_.matchedIndex_ = position - 1; |
| } |
| |
| textIter_.setOffset(position); |
| |
| // ICU4C comment: |
| // if strsrch_->breakIter is always the same as m_breakiterator_ |
| // then we don't need to check the match boundaries here because |
| // usearch_handleNextXXX will already have done it. |
| if (search_.isCanonicalMatch_) { |
| // *could* actually use exact here 'cause no extra accents allowed... |
| handleNextCanonical(); |
| } else { |
| handleNextExact(); |
| } |
| |
| if (search_.matchedIndex_ == DONE) { |
| textIter_.setOffset(search_.endIndex()); |
| } else { |
| textIter_.setOffset(search_.matchedIndex_); |
| } |
| |
| return search_.matchedIndex_; |
| } |
| |
| return DONE; |
| } |
| |
| /** |
| * {@inheritDoc} |
| */ |
| @Override |
| protected int handlePrevious(int position) { |
| if (pattern_.CELength_ == 0) { |
| search_.matchedIndex_ = |
| search_.matchedIndex_ == DONE ? getIndex() : search_.matchedIndex_; |
| if (search_.matchedIndex_ == search_.beginIndex()) { |
| setMatchNotFound(); |
| } else { |
| search_.matchedIndex_--; |
| textIter_.setOffset(search_.matchedIndex_); |
| search_.setMatchedLength(0); |
| } |
| } else { |
| textIter_.setOffset(position); |
| |
| if (search_.isCanonicalMatch_) { |
| // *could* use exact match here since extra accents *not* allowed! |
| handlePreviousCanonical(); |
| } else { |
| handlePreviousExact(); |
| } |
| } |
| |
| return search_.matchedIndex_; |
| } |
| |
| // ------------------ Internal implementation code --------------------------- |
| |
| private static final int INITIAL_ARRAY_SIZE_ = 256; |
| |
| // *** Boyer-Moore *** |
| // private static final Normalizer2Impl nfcImpl_ = Norm2AllModes.getNFCInstance().impl; |
| // private static final int LAST_BYTE_MASK_ = 0xff; |
| // private static final int SECOND_LAST_BYTE_SHIFT_ = 8; |
| |
| private static final int PRIMARYORDERMASK = 0xffff0000; |
| private static final int SECONDARYORDERMASK = 0x0000ff00; |
| private static final int TERTIARYORDERMASK = 0x000000ff; |
| |
| /** |
| * Getting the mask for collation strength |
| * @param strength collation strength |
| * @return collation element mask |
| */ |
| private static int getMask(int strength) { |
| switch (strength) { |
| case Collator.PRIMARY: |
| return PRIMARYORDERMASK; |
| case Collator.SECONDARY: |
| return SECONDARYORDERMASK | PRIMARYORDERMASK; |
| default: |
| return TERTIARYORDERMASK | SECONDARYORDERMASK | PRIMARYORDERMASK; |
| } |
| } |
| |
| |
| // *** Boyer-Moore *** |
| /* |
| private final char getFCD(String str, int offset) { |
| char ch = str.charAt(offset); |
| if (ch < 0x180) { |
| return (char) nfcImpl_.getFCD16FromBelow180(ch); |
| } else if (nfcImpl_.singleLeadMightHaveNonZeroFCD16(ch)) { |
| if (!Character.isHighSurrogate(ch)) { |
| return (char) nfcImpl_.getFCD16FromNormData(ch); |
| } else { |
| char c2; |
| if (++offset < str.length() && Character.isLowSurrogate(c2 = str.charAt(offset))) { |
| return (char) nfcImpl_.getFCD16FromNormData(Character.toCodePoint(ch, c2)); |
| } |
| } |
| } |
| return 0; |
| } |
| |
| private final char getFCD(int c) { |
| return (char)nfcImpl_.getFCD16(c); |
| } |
| */ |
| |
| /** |
| * Getting the modified collation elements taking into account the collation |
| * attributes. |
| * |
| * @param sourcece |
| * @return the modified collation element |
| */ |
| private int getCE(int sourcece) { |
| // note for tertiary we can't use the collator->tertiaryMask, that |
| // is a preprocessed mask that takes into account case options. since |
| // we are only concerned with exact matches, we don't need that. |
| sourcece &= ceMask_; |
| |
| if (toShift_) { |
| // alternate handling here, since only the 16 most significant digits |
| // is only used, we can safely do a compare without masking |
| // if the ce is a variable, we mask and get only the primary values |
| // no shifting to quartenary is required since all primary values |
| // less than variabletop will need to be masked off anyway. |
| if (variableTop_ > sourcece) { |
| if (strength_ >= Collator.QUATERNARY) { |
| sourcece &= PRIMARYORDERMASK; |
| } else { |
| sourcece = CollationElementIterator.IGNORABLE; |
| } |
| } |
| } else if (strength_ >= Collator.QUATERNARY && sourcece == CollationElementIterator.IGNORABLE) { |
| sourcece = 0xFFFF; |
| } |
| |
| return sourcece; |
| } |
| |
| /** |
| * Direct port of ICU4C static int32_t * addTouint32_tArray(...) in usearch.cpp |
| * (except not taking destination buffer size and status param). |
| * This is used for appending a PCE to Pattern.PCE_ buffer. We probably should |
| * implement this in Pattern class. |
| * |
| * @param destination target array |
| * @param offset destination offset to add value |
| * @param value to be added |
| * @param increments incremental size expected |
| * @return new destination array, destination if there was no new allocation |
| */ |
| private static int[] addToIntArray(int[] destination, int offset, int value, int increments) { |
| int newlength = destination.length; |
| if (offset + 1 == newlength) { |
| newlength += increments; |
| int temp[] = new int[newlength]; |
| System.arraycopy(destination, 0, temp, 0, offset); |
| destination = temp; |
| } |
| destination[offset] = value; |
| return destination; |
| } |
| |
| /** |
| * Direct port of ICU4C static int64_t * addTouint64_tArray(...) in usearch.cpp. |
| * This is used for appending a PCE to Pattern.PCE_ buffer. We probably should |
| * implement this in Pattern class. |
| * |
| * @param destination target array |
| * @param offset destination offset to add value |
| * @param destinationlength target array size |
| * @param value to be added |
| * @param increments incremental size expected |
| * @return new destination array, destination if there was no new allocation |
| */ |
| private static long[] addToLongArray(long[] destination, int offset, int destinationlength, |
| long value, int increments) { |
| int newlength = destinationlength; |
| if (offset + 1 == newlength) { |
| newlength += increments; |
| long temp[] = new long[newlength]; |
| System.arraycopy(destination, 0, temp, 0, offset); |
| destination = temp; |
| } |
| destination[offset] = value; |
| return destination; |
| } |
| |
| /** |
| * Initializing the ce table for a pattern. |
| * Stores non-ignorable collation keys. |
| * Table size will be estimated by the size of the pattern text. Table |
| * expansion will be perform as we go along. Adding 1 to ensure that the table |
| * size definitely increases. |
| * @return total number of expansions |
| */ |
| // TODO: We probably do not need Pattern CE table. |
| private int initializePatternCETable() { |
| int[] cetable = new int[INITIAL_ARRAY_SIZE_]; |
| int patternlength = pattern_.text_.length(); |
| CollationElementIterator coleiter = utilIter_; |
| |
| if (coleiter == null) { |
| coleiter = new CollationElementIterator(pattern_.text_, collator_); |
| utilIter_ = coleiter; |
| } else { |
| coleiter.setText(pattern_.text_); |
| } |
| |
| int offset = 0; |
| int result = 0; |
| int ce; |
| |
| while ((ce = coleiter.next()) != CollationElementIterator.NULLORDER) { |
| int newce = getCE(ce); |
| if (newce != CollationElementIterator.IGNORABLE /* 0 */) { |
| int[] temp = addToIntArray(cetable, offset, newce, |
| patternlength - coleiter.getOffset() + 1); |
| offset++; |
| cetable = temp; |
| } |
| result += (coleiter.getMaxExpansion(ce) - 1); |
| } |
| |
| cetable[offset] = 0; |
| pattern_.CE_ = cetable; |
| pattern_.CELength_ = offset; |
| |
| return result; |
| } |
| |
| /** |
| * Initializing the pce table for a pattern. |
| * Stores non-ignorable collation keys. |
| * Table size will be estimated by the size of the pattern text. Table |
| * expansion will be perform as we go along. Adding 1 to ensure that the table |
| * size definitely increases. |
| * @return total number of expansions |
| */ |
| private int initializePatternPCETable() { |
| long[] pcetable = new long[INITIAL_ARRAY_SIZE_]; |
| int pcetablesize = pcetable.length; |
| int patternlength = pattern_.text_.length(); |
| CollationElementIterator coleiter = utilIter_; |
| |
| if (coleiter == null) { |
| coleiter = new CollationElementIterator(pattern_.text_, collator_); |
| utilIter_ = coleiter; |
| } else { |
| coleiter.setText(pattern_.text_); |
| } |
| |
| int offset = 0; |
| int result = 0; |
| long pce; |
| |
| CollationPCE iter = new CollationPCE(coleiter); |
| |
| // ** Should processed CEs be signed or unsigned? |
| // ** (the rest of the code in this file seems to play fast-and-loose with |
| // ** whether a CE is signed or unsigned. For example, look at routine above this one.) |
| while ((pce = iter.nextProcessed(null)) != CollationPCE.PROCESSED_NULLORDER) { |
| long[] temp = addToLongArray(pcetable, offset, pcetablesize, pce, patternlength - coleiter.getOffset() + 1); |
| offset++; |
| pcetable = temp; |
| } |
| |
| pcetable[offset] = 0; |
| pattern_.PCE_ = pcetable; |
| pattern_.PCELength_ = offset; |
| |
| return result; |
| } |
| |
| // TODO: This method only triggers initializePatternCETable(), which is probably no |
| // longer needed. |
| private int initializePattern() { |
| // Since the strength is primary, accents are ignored in the pattern. |
| |
| // *** Boyer-Moore *** |
| /* |
| if (strength_ == Collator.PRIMARY) { |
| pattern_.hasPrefixAccents_ = false; |
| pattern_.hasSuffixAccents_ = false; |
| } else { |
| pattern_.hasPrefixAccents_ = (getFCD(pattern_.text_, 0) >>> SECOND_LAST_BYTE_SHIFT_) != 0; |
| pattern_.hasSuffixAccents_ = (getFCD(pattern_.text_.codePointBefore(pattern_.text_.length())) & LAST_BYTE_MASK_) != 0; |
| } |
| */ |
| |
| pattern_.PCE_ = null; |
| |
| // since intializePattern is an internal method status is a success. |
| return initializePatternCETable(); |
| } |
| |
| // *** Boyer-Moore *** |
| /* |
| private final void setShiftTable(char shift[], |
| char backshift[], |
| int cetable[], int cesize, |
| int expansionsize, |
| int defaultforward, |
| int defaultbackward) { |
| // No implementation |
| } |
| */ |
| |
| // TODO: This method only triggers initializePattern(), which is probably no |
| // longer needed. |
| private void initialize() { |
| /* int expandlength = */ initializePattern(); |
| |
| // *** Boyer-Moore *** |
| /* |
| if (pattern_.CELength_ > 0) { |
| int cesize = pattern_.CELength_; |
| int minlength = cesize > expandlength ? cesize - expandlength : 1; |
| pattern_.defaultShiftSize_ = minlength; |
| setShiftTable(pattern_.shift_, pattern_.backShift_, pattern_.CE_, cesize, |
| expandlength, minlength, minlength); |
| return; |
| } |
| return pattern_.defaultShiftSize_; |
| */ |
| } |
| |
| /** |
| * @deprecated This API is ICU internal only. |
| * @hide original deprecated declaration |
| * @hide draft / provisional / internal are hidden on Android |
| */ |
| @Override |
| @Deprecated |
| protected void setMatchNotFound() { |
| super.setMatchNotFound(); |
| // SearchIterator#setMatchNotFound() does following: |
| // search_.matchedIndex_ = DONE; |
| // search_.setMatchedLength(0); |
| if (search_.isForwardSearching_) { |
| textIter_.setOffset(search_.text().getEndIndex()); |
| } else { |
| textIter_.setOffset(0); |
| } |
| } |
| |
| /** |
| * Checks if the offset runs out of the text string range |
| * @param textstart offset of the first character in the range |
| * @param textlimit limit offset of the text string range |
| * @param offset to test |
| * @return true if offset is out of bounds, false otherwise |
| */ |
| private static final boolean isOutOfBounds(int textstart, int textlimit, int offset) { |
| return offset < textstart || offset > textlimit; |
| } |
| |
| /** |
| * Checks for identical match |
| * @param start offset of possible match |
| * @param end offset of possible match |
| * @return true if identical match is found |
| */ |
| private boolean checkIdentical(int start, int end) { |
| if (strength_ != Collator.IDENTICAL) { |
| return true; |
| } |
| // Note: We could use Normalizer::compare() or similar, but for short strings |
| // which may not be in FCD it might be faster to just NFD them. |
| String textstr = getString(targetText, start, end - start); |
| if (Normalizer.quickCheck(textstr, Normalizer.NFD, 0) == Normalizer.NO) { |
| textstr = Normalizer.decompose(textstr, false); |
| } |
| String patternstr = pattern_.text_; |
| if (Normalizer.quickCheck(patternstr, Normalizer.NFD, 0) == Normalizer.NO) { |
| patternstr = Normalizer.decompose(patternstr, false); |
| } |
| return textstr.equals(patternstr); |
| } |
| |
| private boolean initTextProcessedIter() { |
| if (textProcessedIter_ == null) { |
| textProcessedIter_ = new CollationPCE(textIter_); |
| } else { |
| textProcessedIter_.init(textIter_); |
| } |
| return true; |
| } |
| |
| /* |
| * Find the next break boundary after startIndex. If the UStringSearch object |
| * has an external break iterator, use that. Otherwise use the internal character |
| * break iterator. |
| */ |
| private int nextBoundaryAfter(int startIndex) { |
| BreakIterator breakiterator = search_.breakIter(); |
| |
| if (breakiterator == null) { |
| breakiterator = search_.internalBreakIter_; |
| } |
| |
| if (breakiterator != null) { |
| return breakiterator.following(startIndex); |
| } |
| |
| return startIndex; |
| } |
| |
| /* |
| * Returns true if index is on a break boundary. If the UStringSearch |
| * has an external break iterator, test using that, otherwise test |
| * using the internal character break iterator. |
| */ |
| private boolean isBreakBoundary(int index) { |
| BreakIterator breakiterator = search_.breakIter(); |
| |
| if (breakiterator == null) { |
| breakiterator = search_.internalBreakIter_; |
| } |
| |
| return (breakiterator != null && breakiterator.isBoundary(index)); |
| } |
| |
| |
| // Java porting note: Followings are corresponding to UCompareCEsResult enum |
| private static final int CE_MATCH = -1; |
| private static final int CE_NO_MATCH = 0; |
| private static final int CE_SKIP_TARG = 1; |
| private static final int CE_SKIP_PATN = 2; |
| |
| private static int CE_LEVEL2_BASE = 0x00000005; |
| private static int CE_LEVEL3_BASE = 0x00050000; |
| |
| private static int compareCE64s(long targCE, long patCE, ElementComparisonType compareType) { |
| if (targCE == patCE) { |
| return CE_MATCH; |
| } |
| if (compareType == ElementComparisonType.STANDARD_ELEMENT_COMPARISON) { |
| return CE_NO_MATCH; |
| } |
| |
| long targCEshifted = targCE >>> 32; |
| long patCEshifted = patCE >>> 32; |
| long mask; |
| |
| mask = 0xFFFF0000L; |
| int targLev1 = (int)(targCEshifted & mask); |
| int patLev1 = (int)(patCEshifted & mask); |
| if (targLev1 != patLev1) { |
| if (targLev1 == 0) { |
| return CE_SKIP_TARG; |
| } |
| if (patLev1 == 0 |
| && compareType == ElementComparisonType.ANY_BASE_WEIGHT_IS_WILDCARD) { |
| return CE_SKIP_PATN; |
| } |
| return CE_NO_MATCH; |
| } |
| |
| mask = 0x0000FFFFL; |
| int targLev2 = (int)(targCEshifted & mask); |
| int patLev2 = (int)(patCEshifted & mask); |
| if (targLev2 != patLev2) { |
| if (targLev2 == 0) { |
| return CE_SKIP_TARG; |
| } |
| if (patLev2 == 0 |
| && compareType == ElementComparisonType.ANY_BASE_WEIGHT_IS_WILDCARD) { |
| return CE_SKIP_PATN; |
| } |
| return (patLev2 == CE_LEVEL2_BASE || |
| (compareType == ElementComparisonType.ANY_BASE_WEIGHT_IS_WILDCARD && |
| targLev2 == CE_LEVEL2_BASE)) ? CE_MATCH : CE_NO_MATCH; |
| } |
| |
| mask = 0xFFFF0000L; |
| int targLev3 = (int)(targCE & mask); |
| int patLev3 = (int)(patCE & mask); |
| if (targLev3 != patLev3) { |
| return (patLev3 == CE_LEVEL3_BASE || |
| (compareType == ElementComparisonType.ANY_BASE_WEIGHT_IS_WILDCARD && |
| targLev3 == CE_LEVEL3_BASE) )? CE_MATCH: CE_NO_MATCH; |
| } |
| |
| return CE_MATCH; |
| } |
| |
| /** |
| * An object used for receiving matched index in search() and |
| * searchBackwards(). |
| */ |
| private static class Match { |
| int start_ = -1; |
| int limit_ = -1; |
| } |
| |
| private boolean search(int startIdx, Match m) { |
| // Input parameter sanity check. |
| if (pattern_.CELength_ == 0 |
| || startIdx < search_.beginIndex() |
| || startIdx > search_.endIndex()) { |
| throw new IllegalArgumentException("search(" + startIdx + ", m) - expected position to be between " + |
| search_.beginIndex() + " and " + search_.endIndex()); |
| } |
| |
| if (pattern_.PCE_ == null) { |
| initializePatternPCETable(); |
| } |
| |
| textIter_.setOffset(startIdx); |
| CEBuffer ceb = new CEBuffer(this); |
| |
| int targetIx = 0; |
| CEI targetCEI = null; |
| int patIx; |
| boolean found; |
| |
| int mStart = -1; |
| int mLimit = -1; |
| int minLimit; |
| int maxLimit; |
| |
| // Outer loop moves over match starting positions in the |
| // target CE space. |
| // Here we see the target as a sequence of collation elements, resulting from the following: |
| // 1. Target characters were decomposed, and (if appropriate) other compressions and expansions are applied |
| // (for example, digraphs such as IJ may be broken into two characters). |
| // 2. An int64_t CE weight is determined for each resulting unit (high 16 bits are primary strength, next |
| // 16 bits are secondary, next 16 (the high 16 bits of the low 32-bit half) are tertiary. Any of these |
| // fields that are for strengths below that of the collator are set to 0. If this makes the int64_t |
| // CE weight 0 (as for a combining diacritic with secondary weight when the collator strength is primary), |
| // then the CE is deleted, so the following code sees only CEs that are relevant. |
| // For each CE, the lowIndex and highIndex correspond to where this CE begins and ends in the original text. |
| // If lowIndex==highIndex, either the CE resulted from an expansion/decomposition of one of the original text |
| // characters, or the CE marks the limit of the target text (in which case the CE weight is UCOL_PROCESSED_NULLORDER). |
| for (targetIx = 0; ; targetIx++) { |
| found = true; |
| // Inner loop checks for a match beginning at each |
| // position from the outer loop. |
| int targetIxOffset = 0; |
| long patCE = 0; |
| // For targetIx > 0, this ceb.get gets a CE that is as far back in the ring buffer |
| // (compared to the last CE fetched for the previous targetIx value) as we need to go |
| // for this targetIx value, so if it is non-NULL then other ceb.get calls should be OK. |
| CEI firstCEI = ceb.get(targetIx); |
| if (firstCEI == null) { |
| throw new ICUException("CEBuffer.get(" + targetIx + ") returned null."); |
| } |
| |
| for (patIx = 0; patIx < pattern_.PCELength_; patIx++) { |
| patCE = pattern_.PCE_[patIx]; |
| targetCEI = ceb.get(targetIx + patIx + targetIxOffset); |
| // Compare CE from target string with CE from the pattern. |
| // Note that the target CE will be UCOL_PROCESSED_NULLORDER if we reach the end of input, |
| // which will fail the compare, below. |
| int ceMatch = compareCE64s(targetCEI.ce_, patCE, search_.elementComparisonType_); |
| if (ceMatch == CE_NO_MATCH) { |
| found = false; |
| break; |
| } else if (ceMatch > CE_NO_MATCH) { |
| if (ceMatch == CE_SKIP_TARG) { |
| // redo with same patCE, next targCE |
| patIx--; |
| targetIxOffset++; |
| } else { // ceMatch == CE_SKIP_PATN |
| // redo with same targCE, next patCE |
| targetIxOffset--; |
| } |
| } |
| } |
| targetIxOffset += pattern_.PCELength_; // this is now the offset in target CE space to end of the match so far |
| |
| if (!found && ((targetCEI == null) || (targetCEI.ce_ != CollationPCE.PROCESSED_NULLORDER))) { |
| // No match at this targetIx. Try again at the next. |
| continue; |
| } |
| |
| if (!found) { |
| // No match at all, we have run off the end of the target text. |
| break; |
| } |
| |
| // We have found a match in CE space. |
| // Now determine the bounds in string index space. |
| // There still is a chance of match failure if the CE range not correspond to |
| // an acceptable character range. |
| // |
| CEI lastCEI = ceb.get(targetIx + targetIxOffset -1); |
| |
| mStart = firstCEI.lowIndex_; |
| minLimit = lastCEI.lowIndex_; |
| |
| // Look at the CE following the match. If it is UCOL_NULLORDER the match |
| // extended to the end of input, and the match is good. |
| |
| // Look at the high and low indices of the CE following the match. If |
| // they are the same it means one of two things: |
| // 1. The match extended to the last CE from the target text, which is OK, or |
| // 2. The last CE that was part of the match is in an expansion that extends |
| // to the first CE after the match. In this case, we reject the match. |
| CEI nextCEI = null; |
| if (search_.elementComparisonType_ == ElementComparisonType.STANDARD_ELEMENT_COMPARISON) { |
| nextCEI = ceb.get(targetIx + targetIxOffset); |
| maxLimit = nextCEI.lowIndex_; |
| if (nextCEI.lowIndex_ == nextCEI.highIndex_ && nextCEI.ce_ != CollationPCE.PROCESSED_NULLORDER) { |
| found = false; |
| } |
| } else { |
| for (;; ++targetIxOffset) { |
| nextCEI = ceb.get(targetIx + targetIxOffset); |
| maxLimit = nextCEI.lowIndex_; |
| // If we are at the end of the target too, match succeeds |
| if (nextCEI.ce_ == CollationPCE.PROCESSED_NULLORDER) { |
| break; |
| } |
| // As long as the next CE has primary weight of 0, |
| // it is part of the last target element matched by the pattern; |
| // make sure it can be part of a match with the last patCE |
| if ((((nextCEI.ce_) >>> 32) & 0xFFFF0000L) == 0) { |
| int ceMatch = compareCE64s(nextCEI.ce_, patCE, search_.elementComparisonType_); |
| if (ceMatch == CE_NO_MATCH || ceMatch == CE_SKIP_PATN ) { |
| found = false; |
| break; |
| } |
| // If lowIndex == highIndex, this target CE is part of an expansion of the last matched |
| // target element, but it has non-zero primary weight => match fails |
| } else if ( nextCEI.lowIndex_ == nextCEI.highIndex_ ) { |
| found = false; |
| break; |
| // Else the target CE is not part of an expansion of the last matched element, match succeeds |
| } else { |
| break; |
| } |
| } |
| } |
| |
| // Check for the start of the match being within a combining sequence. |
| // This can happen if the pattern itself begins with a combining char, and |
| // the match found combining marks in the target text that were attached |
| // to something else. |
| // This type of match should be rejected for not completely consuming a |
| // combining sequence. |
| if (!isBreakBoundary(mStart)) { |
| found = false; |
| } |
| |
| // Check for the start of the match being within an Collation Element Expansion, |
| // meaning that the first char of the match is only partially matched. |
| // With expansions, the first CE will report the index of the source |
| // character, and all subsequent (expansions) CEs will report the source index of the |
| // _following_ character. |
| int secondIx = firstCEI.highIndex_; |
| if (mStart == secondIx) { |
| found = false; |
| } |
| |
| // Allow matches to end in the middle of a grapheme cluster if the following |
| // conditions are met; this is needed to make prefix search work properly in |
| // Indic, see #11750 |
| // * the default breakIter is being used |
| // * the next collation element after this combining sequence |
| // - has non-zero primary weight |
| // - corresponds to a separate character following the one at end of the current match |
| // (the second of these conditions, and perhaps both, may be redundant given the |
| // subsequent check for normalization boundary; however they are likely much faster |
| // tests in any case) |
| // * the match limit is a normalization boundary |
| boolean allowMidclusterMatch = |
| breakIterator == null && |
| (((nextCEI.ce_) >>> 32) & 0xFFFF0000L) != 0 && |
| maxLimit >= lastCEI.highIndex_ && nextCEI.highIndex_ > maxLimit && |
| (nfd_.hasBoundaryBefore(codePointAt(targetText, maxLimit)) || |
| nfd_.hasBoundaryAfter(codePointBefore(targetText, maxLimit))); |
| |
| // If those conditions are met, then: |
| // * do NOT advance the candidate match limit (mLimit) to a break boundary; however |
| // the match limit may be backed off to a previous break boundary. This handles |
| // cases in which mLimit includes target characters that are ignorable with current |
| // settings (such as space) and which extend beyond the pattern match. |
| // * do NOT require that end of the combining sequence not extend beyond the match in CE space |
| // * do NOT require that match limit be on a breakIter boundary |
| |
| // Advance the match end position to the first acceptable match boundary. |
| // This advances the index over any combining characters. |
| mLimit = maxLimit; |
| if (minLimit < maxLimit) { |
| // When the last CE's low index is same with its high index, the CE is likely |
| // a part of expansion. In this case, the index is located just after the |
| // character corresponding to the CEs compared above. If the index is right |
| // at the break boundary, move the position to the next boundary will result |
| // incorrect match length when there are ignorable characters exist between |
| // the position and the next character produces CE(s). See ticket#8482. |
| if (minLimit == lastCEI.highIndex_ && isBreakBoundary(minLimit)) { |
| mLimit = minLimit; |
| } else { |
| int nba = nextBoundaryAfter(minLimit); |
| // Note that we can have nba < maxLimit && nba >= minLImit, in which |
| // case we want to set mLimit to nba regardless of allowMidclusterMatch |
| // (i.e. we back off mLimit to the previous breakIterator boundary). |
| if (nba >= lastCEI.highIndex_ && (!allowMidclusterMatch || nba < maxLimit)) { |
| mLimit = nba; |
| } |
| } |
| } |
| |
| if (!allowMidclusterMatch) { |
| // If advancing to the end of a combining sequence in character indexing space |
| // advanced us beyond the end of the match in CE space, reject this match. |
| if (mLimit > maxLimit) { |
| found = false; |
| } |
| |
| if (!isBreakBoundary(mLimit)) { |
| found = false; |
| } |
| } |
| |
| if (!checkIdentical(mStart, mLimit)) { |
| found = false; |
| } |
| |
| if (found) { |
| break; |
| } |
| } |
| |
| // All Done. Store back the match bounds to the caller. |
| // |
| if (found == false) { |
| mLimit = -1; |
| mStart = -1; |
| } |
| |
| if (m != null) { |
| m.start_ = mStart; |
| m.limit_ = mLimit; |
| } |
| |
| return found; |
| } |
| |
| private static int codePointAt(CharacterIterator iter, int index) { |
| int currentIterIndex = iter.getIndex(); |
| char codeUnit = iter.setIndex(index); |
| int cp = codeUnit; |
| if (Character.isHighSurrogate(codeUnit)) { |
| char nextUnit = iter.next(); |
| if (Character.isLowSurrogate(nextUnit)) { |
| cp = Character.toCodePoint(codeUnit, nextUnit); |
| } |
| } |
| iter.setIndex(currentIterIndex); // restore iter position |
| return cp; |
| } |
| |
| private static int codePointBefore(CharacterIterator iter, int index) { |
| int currentIterIndex = iter.getIndex(); |
| iter.setIndex(index); |
| char codeUnit = iter.previous(); |
| int cp = codeUnit; |
| if (Character.isLowSurrogate(codeUnit)) { |
| char prevUnit = iter.previous(); |
| if (Character.isHighSurrogate(prevUnit)) { |
| cp = Character.toCodePoint(prevUnit, codeUnit); |
| } |
| } |
| iter.setIndex(currentIterIndex); // restore iter position |
| return cp; |
| } |
| |
| private boolean searchBackwards(int startIdx, Match m) { |
| //ICU4C_TODO comment: reject search patterns beginning with a combining char. |
| |
| // Input parameter sanity check. |
| if (pattern_.CELength_ == 0 |
| || startIdx < search_.beginIndex() |
| || startIdx > search_.endIndex()) { |
| throw new IllegalArgumentException("searchBackwards(" + startIdx + ", m) - expected position to be between " + |
| search_.beginIndex() + " and " + search_.endIndex()); |
| } |
| |
| if (pattern_.PCE_ == null) { |
| initializePatternPCETable(); |
| } |
| |
| CEBuffer ceb = new CEBuffer(this); |
| int targetIx = 0; |
| |
| /* |
| * Pre-load the buffer with the CE's for the grapheme |
| * after our starting position so that we're sure that |
| * we can look at the CE following the match when we |
| * check the match boundaries. |
| * |
| * This will also pre-fetch the first CE that we'll |
| * consider for the match. |
| */ |
| if (startIdx < search_.endIndex()) { |
| BreakIterator bi = search_.internalBreakIter_; |
| int next = bi.following(startIdx); |
| |
| textIter_.setOffset(next); |
| |
| for (targetIx = 0; ; targetIx++) { |
| if (ceb.getPrevious(targetIx).lowIndex_ < startIdx) { |
| break; |
| } |
| } |
| } else { |
| textIter_.setOffset(startIdx); |
| } |
| |
| CEI targetCEI = null; |
| int patIx; |
| boolean found; |
| |
| int limitIx = targetIx; |
| int mStart = -1; |
| int mLimit = -1; |
| int minLimit; |
| int maxLimit; |
| |
| // Outer loop moves over match starting positions in the |
| // target CE space. |
| // Here, targetIx values increase toward the beginning of the base text (i.e. we get the text CEs in reverse order). |
| // But patIx is 0 at the beginning of the pattern and increases toward the end. |
| // So this loop performs a comparison starting with the end of pattern, and prcessd toward the beginning of the pattern |
| // and the beginning of the base text. |
| for (targetIx = limitIx; ; targetIx++) { |
| found = true; |
| // For targetIx > limitIx, this ceb.getPrevious gets a CE that is as far back in the ring buffer |
| // (compared to the last CE fetched for the previous targetIx value) as we need to go |
| // for this targetIx value, so if it is non-NULL then other ceb.getPrevious calls should be OK. |
| CEI lastCEI = ceb.getPrevious(targetIx); |
| if (lastCEI == null) { |
| throw new ICUException("CEBuffer.getPrevious(" + targetIx + ") returned null."); |
| } |
| // Inner loop checks for a match beginning at each |
| // position from the outer loop. |
| int targetIxOffset = 0; |
| for (patIx = pattern_.PCELength_ - 1; patIx >= 0; patIx--) { |
| long patCE = pattern_.PCE_[patIx]; |
| |
| targetCEI = ceb.getPrevious(targetIx + pattern_.PCELength_ - 1 - patIx + targetIxOffset); |
| // Compare CE from target string with CE from the pattern. |
| // Note that the target CE will be UCOL_NULLORDER if we reach the end of input, |
| // which will fail the compare, below. |
| int ceMatch = compareCE64s(targetCEI.ce_, patCE, search_.elementComparisonType_); |
| if (ceMatch == CE_NO_MATCH) { |
| found = false; |
| break; |
| } else if (ceMatch > CE_NO_MATCH) { |
| if (ceMatch == CE_SKIP_TARG) { |
| // redo with same patCE, next targCE |
| patIx++; |
| targetIxOffset++; |
| } else { // ceMatch == CE_SKIP_PATN |
| // redo with same targCE, next patCE |
| targetIxOffset--; |
| } |
| } |
| } |
| |
| if (!found && ((targetCEI == null) || (targetCEI.ce_ != CollationPCE.PROCESSED_NULLORDER))) { |
| // No match at this targetIx. Try again at the next. |
| continue; |
| } |
| |
| if (!found) { |
| // No match at all, we have run off the end of the target text. |
| break; |
| } |
| |
| // We have found a match in CE space. |
| // Now determine the bounds in string index space. |
| // There still is a chance of match failure if the CE range not correspond to |
| // an acceptable character range. |
| // |
| CEI firstCEI = ceb.getPrevious(targetIx + pattern_.PCELength_ - 1 + targetIxOffset); |
| mStart = firstCEI.lowIndex_; |
| |
| // Check for the start of the match being within a combining sequence. |
| // This can happen if the pattern itself begins with a combining char, and |
| // the match found combining marks in the target text that were attached |
| // to something else. |
| // This type of match should be rejected for not completely consuming a |
| // combining sequence. |
| if (!isBreakBoundary(mStart)) { |
| found = false; |
| } |
| |
| // Look at the high index of the first CE in the match. If it's the same as the |
| // low index, the first CE in the match is in the middle of an expansion. |
| if (mStart == firstCEI.highIndex_) { |
| found = false; |
| } |
| |
| minLimit = lastCEI.lowIndex_; |
| |
| if (targetIx > 0) { |
| // Look at the CE following the match. If it is UCOL_NULLORDER the match |
| // extended to the end of input, and the match is good. |
| |
| // Look at the high and low indices of the CE following the match. If |
| // they are the same it means one of two things: |
| // 1. The match extended to the last CE from the target text, which is OK, or |
| // 2. The last CE that was part of the match is in an expansion that extends |
| // to the first CE after the match. In this case, we reject the match. |
| CEI nextCEI = ceb.getPrevious(targetIx - 1); |
| |
| if (nextCEI.lowIndex_ == nextCEI.highIndex_ && nextCEI.ce_ != CollationPCE.PROCESSED_NULLORDER) { |
| found = false; |
| } |
| |
| mLimit = maxLimit = nextCEI.lowIndex_; |
| |
| // Allow matches to end in the middle of a grapheme cluster if the following |
| // conditions are met; this is needed to make prefix search work properly in |
| // Indic, see #11750 |
| // * the default breakIter is being used |
| // * the next collation element after this combining sequence |
| // - has non-zero primary weight |
| // - corresponds to a separate character following the one at end of the current match |
| // (the second of these conditions, and perhaps both, may be redundant given the |
| // subsequent check for normalization boundary; however they are likely much faster |
| // tests in any case) |
| // * the match limit is a normalization boundary |
| boolean allowMidclusterMatch = |
| breakIterator == null && |
| (((nextCEI.ce_) >>> 32) & 0xFFFF0000L) != 0 && |
| maxLimit >= lastCEI.highIndex_ && nextCEI.highIndex_ > maxLimit && |
| (nfd_.hasBoundaryBefore(codePointAt(targetText, maxLimit)) || |
| nfd_.hasBoundaryAfter(codePointBefore(targetText, maxLimit))); |
| |
| // If those conditions are met, then: |
| // * do NOT advance the candidate match limit (mLimit) to a break boundary; however |
| // the match limit may be backed off to a previous break boundary. This handles |
| // cases in which mLimit includes target characters that are ignorable with current |
| // settings (such as space) and which extend beyond the pattern match. |
| // * do NOT require that end of the combining sequence not extend beyond the match in CE space |
| // * do NOT require that match limit be on a breakIter boundary |
| |
| // Advance the match end position to the first acceptable match boundary. |
| // This advances the index over any combining characters. |
| if (minLimit < maxLimit) { |
| int nba = nextBoundaryAfter(minLimit); |
| // Note that we can have nba < maxLimit && nba >= minLImit, in which |
| // case we want to set mLimit to nba regardless of allowMidclusterMatch |
| // (i.e. we back off mLimit to the previous breakIterator boundary). |
| if (nba >= lastCEI.highIndex_ && (!allowMidclusterMatch || nba < maxLimit)) { |
| mLimit = nba; |
| } |
| } |
| |
| if (!allowMidclusterMatch) { |
| // If advancing to the end of a combining sequence in character indexing space |
| // advanced us beyond the end of the match in CE space, reject this match. |
| if (mLimit > maxLimit) { |
| found = false; |
| } |
| |
| // Make sure the end of the match is on a break boundary |
| if (!isBreakBoundary(mLimit)) { |
| found = false; |
| } |
| } |
| |
| } else { |
| // No non-ignorable CEs after this point. |
| // The maximum position is detected by boundary after |
| // the last non-ignorable CE. Combining sequence |
| // across the start index will be truncated. |
| int nba = nextBoundaryAfter(minLimit); |
| mLimit = maxLimit = (nba > 0) && (startIdx > nba) ? nba : startIdx; |
| } |
| |
| if (!checkIdentical(mStart, mLimit)) { |
| found = false; |
| } |
| |
| if (found) { |
| break; |
| } |
| } |
| |
| // All Done. Store back the match bounds to the caller. |
| // |
| if (found == false) { |
| mLimit = -1; |
| mStart = -1; |
| } |
| |
| if (m != null) { |
| m.start_ = mStart; |
| m.limit_ = mLimit; |
| } |
| |
| return found; |
| } |
| |
| // Java porting note: |
| // |
| // ICU4C usearch_handleNextExact() is identical to usearch_handleNextCanonical() |
| // for the linear search implementation. The differences are addressed in search(). |
| // |
| private boolean handleNextExact() { |
| return handleNextCommonImpl(); |
| } |
| |
| private boolean handleNextCanonical() { |
| return handleNextCommonImpl(); |
| } |
| |
| private boolean handleNextCommonImpl() { |
| int textOffset = textIter_.getOffset(); |
| Match match = new Match(); |
| |
| if (search(textOffset, match)) { |
| search_.matchedIndex_ = match.start_; |
| search_.setMatchedLength(match.limit_ - match.start_); |
| return true; |
| } else { |
| setMatchNotFound(); |
| return false; |
| } |
| } |
| |
| // Java porting note: |
| // |
| // ICU4C usearch_handlePreviousExact() is identical to usearch_handlePreviousCanonical() |
| // for the linear search implementation. The differences are addressed in searchBackwards(). |
| // |
| private boolean handlePreviousExact() { |
| return handlePreviousCommonImpl(); |
| } |
| |
| private boolean handlePreviousCanonical() { |
| return handlePreviousCommonImpl(); |
| } |
| |
| private boolean handlePreviousCommonImpl() { |
| int textOffset; |
| |
| if (search_.isOverlap_) { |
| if (search_.matchedIndex_ != DONE) { |
| textOffset = search_.matchedIndex_ + search_.matchedLength() - 1; |
| } else { |
| // move the start position at the end of possible match |
| initializePatternPCETable(); |
| if (!initTextProcessedIter()) { |
| setMatchNotFound(); |
| return false; |
| } |
| for (int nPCEs = 0; nPCEs < pattern_.PCELength_ - 1; nPCEs++) { |
| long pce = textProcessedIter_.nextProcessed(null); |
| if (pce == CollationPCE.PROCESSED_NULLORDER) { |
| // at the end of the text |
| break; |
| } |
| } |
| textOffset = textIter_.getOffset(); |
| } |
| } else { |
| textOffset = textIter_.getOffset(); |
| } |
| |
| Match match = new Match(); |
| if (searchBackwards(textOffset, match)) { |
| search_.matchedIndex_ = match.start_; |
| search_.setMatchedLength(match.limit_ - match.start_); |
| return true; |
| } else { |
| setMatchNotFound(); |
| return false; |
| } |
| } |
| |
| /** |
| * Gets a substring out of a CharacterIterator |
| * |
| * Java porting note: Not available in ICU4C |
| * |
| * @param text CharacterIterator |
| * @param start start offset |
| * @param length of substring |
| * @return substring from text starting at start and length length |
| */ |
| private static final String getString(CharacterIterator text, int start, int length) { |
| StringBuilder result = new StringBuilder(length); |
| int offset = text.getIndex(); |
| text.setIndex(start); |
| for (int i = 0; i < length; i++) { |
| result.append(text.current()); |
| text.next(); |
| } |
| text.setIndex(offset); |
| return result.toString(); |
| } |
| |
| /** |
| * Java port of ICU4C struct UPattern (usrchimp.h) |
| */ |
| private static final class Pattern { |
| /** Pattern string */ |
| String text_; |
| |
| long[] PCE_; |
| int PCELength_ = 0; |
| |
| // TODO: We probably do not need CE_ / CELength_ |
| @SuppressWarnings("unused") |
| int[] CE_; |
| int CELength_ = 0; |
| |
| // *** Boyer-Moore *** |
| // boolean hasPrefixAccents_ = false; |
| // boolean hasSuffixAccents_ = false; |
| // int defaultShiftSize_; |
| // char[] shift_; |
| // char[] backShift_; |
| |
| protected Pattern(String pattern) { |
| text_ = pattern; |
| } |
| } |
| |
| /** |
| * Java port of ICU4C UCollationPCE (usrchimp.h) |
| */ |
| private static class CollationPCE { |
| public static final long PROCESSED_NULLORDER = -1; |
| |
| private static final int DEFAULT_BUFFER_SIZE = 16; |
| private static final int BUFFER_GROW = 8; |
| |
| // Note: PRIMARYORDERMASK is also duplicated in StringSearch class |
| private static final int PRIMARYORDERMASK = 0xffff0000; |
| private static final int CONTINUATION_MARKER = 0xc0; |
| |
| private PCEBuffer pceBuffer_ = new PCEBuffer(); |
| private CollationElementIterator cei_; |
| private int strength_; |
| private boolean toShift_; |
| private boolean isShifted_; |
| private int variableTop_; |
| |
| public CollationPCE(CollationElementIterator iter) { |
| init(iter); |
| } |
| |
| public void init(CollationElementIterator iter) { |
| cei_ = iter; |
| init(iter.getRuleBasedCollator()); |
| } |
| |
| private void init(RuleBasedCollator coll) { |
| strength_ = coll.getStrength(); |
| toShift_ = coll.isAlternateHandlingShifted(); |
| isShifted_ = false; |
| variableTop_ = coll.getVariableTop(); |
| } |
| |
| @SuppressWarnings("fallthrough") |
| private long processCE(int ce) { |
| long primary = 0, secondary = 0, tertiary = 0, quaternary = 0; |
| |
| // This is clean, but somewhat slow... |
| // We could apply the mask to ce and then |
| // just get all three orders... |
| switch (strength_) { |
| default: |
| tertiary = CollationElementIterator.tertiaryOrder(ce); |
| /* note fall-through */ |
| |
| case Collator.SECONDARY: |
| secondary = CollationElementIterator.secondaryOrder(ce); |
| /* note fall-through */ |
| |
| case Collator.PRIMARY: |
| primary = CollationElementIterator.primaryOrder(ce); |
| } |
| |
| // **** This should probably handle continuations too. **** |
| // **** That means that we need 24 bits for the primary **** |
| // **** instead of the 16 that we're currently using. **** |
| // **** So we can lay out the 64 bits as: 24.12.12.16. **** |
| // **** Another complication with continuations is that **** |
| // **** the *second* CE is marked as a continuation, so **** |
| // **** we always have to peek ahead to know how long **** |
| // **** the primary is... **** |
| if ((toShift_ && variableTop_ > ce && primary != 0) || (isShifted_ && primary == 0)) { |
| |
| if (primary == 0) { |
| return CollationElementIterator.IGNORABLE; |
| } |
| |
| if (strength_ >= Collator.QUATERNARY) { |
| quaternary = primary; |
| } |
| |
| primary = secondary = tertiary = 0; |
| isShifted_ = true; |
| } else { |
| if (strength_ >= Collator.QUATERNARY) { |
| quaternary = 0xFFFF; |
| } |
| |
| isShifted_ = false; |
| } |
| |
| return primary << 48 | secondary << 32 | tertiary << 16 | quaternary; |
| } |
| |
| /** |
| * Get the processed ordering priority of the next collation element in the text. |
| * A single character may contain more than one collation element. |
| * |
| * Note: This is equivalent to |
| * UCollationPCE::nextProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status); |
| * |
| * @param range receiving the iterator index before/after fetching the CE. |
| * @return The next collation elements ordering, otherwise returns PROCESSED_NULLORDER |
| * if an error has occurred or if the end of string has been reached |
| */ |
| public long nextProcessed(Range range) { |
| long result = CollationElementIterator.IGNORABLE; |
| int low = 0, high = 0; |
| |
| pceBuffer_.reset(); |
| |
| do { |
| low = cei_.getOffset(); |
| int ce = cei_.next(); |
| high = cei_.getOffset(); |
| |
| if (ce == CollationElementIterator.NULLORDER) { |
| result = PROCESSED_NULLORDER; |
| break; |
| } |
| |
| result = processCE(ce); |
| } while (result == CollationElementIterator.IGNORABLE); |
| |
| if (range != null) { |
| range.ixLow_ = low; |
| range.ixHigh_ = high; |
| } |
| |
| return result; |
| } |
| |
| /** |
| * Get the processed ordering priority of the previous collation element in the text. |
| * A single character may contain more than one collation element. |
| * |
| * Note: This is equivalent to |
| * UCollationPCE::previousProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status); |
| * |
| * @param range receiving the iterator index before/after fetching the CE. |
| * @return The previous collation elements ordering, otherwise returns |
| * PROCESSED_NULLORDER if an error has occurred or if the start of |
| * string has been reached. |
| */ |
| public long previousProcessed(Range range) { |
| long result = CollationElementIterator.IGNORABLE; |
| int low = 0, high = 0; |
| |
| // pceBuffer_.reset(); |
| |
| while (pceBuffer_.empty()) { |
| // buffer raw CEs up to non-ignorable primary |
| RCEBuffer rceb = new RCEBuffer(); |
| int ce; |
| |
| boolean finish = false; |
| |
| // **** do we need to reset rceb, or will it always be empty at this point **** |
| do { |
| high = cei_.getOffset(); |
| ce = cei_.previous(); |
| low = cei_.getOffset(); |
| |
| if (ce == CollationElementIterator.NULLORDER) { |
| if (!rceb.empty()) { |
| break; |
| } |
| |
| finish = true; |
| break; |
| } |
| |
| rceb.put(ce, low, high); |
| } while ((ce & PRIMARYORDERMASK) == 0 || isContinuation(ce)); |
| |
| if (finish) { |
| break; |
| } |
| |
| // process the raw CEs |
| while (!rceb.empty()) { |
| RCEI rcei = rceb.get(); |
| |
| result = processCE(rcei.ce_); |
| |
| if (result != CollationElementIterator.IGNORABLE) { |
| pceBuffer_.put(result, rcei.low_, rcei.high_); |
| } |
| } |
| } |
| |
| if (pceBuffer_.empty()) { |
| // **** Is -1 the right value for ixLow, ixHigh? **** |
| if (range != null) { |
| range.ixLow_ = -1; |
| range.ixHigh_ = -1; |
| } |
| return CollationElementIterator.NULLORDER; |
| } |
| |
| PCEI pcei = pceBuffer_.get(); |
| |
| if (range != null) { |
| range.ixLow_ = pcei.low_; |
| range.ixHigh_ = pcei.high_; |
| } |
| |
| return pcei.ce_; |
| } |
| |
| private static boolean isContinuation(int ce) { |
| return ((ce & CONTINUATION_MARKER) == CONTINUATION_MARKER); |
| } |
| |
| /** |
| * @hide Only a subset of ICU is exposed in Android |
| */ |
| public static final class Range { |
| int ixLow_; |
| int ixHigh_; |
| } |
| |
| /** Processed collation element buffer stuff ported from ICU4C ucoleitr.cpp */ |
| private static final class PCEI { |
| long ce_; |
| int low_; |
| int high_; |
| } |
| |
| private static final class PCEBuffer { |
| private PCEI[] buffer_ = new PCEI[DEFAULT_BUFFER_SIZE]; |
| private int bufferIndex_ = 0; |
| |
| void reset() { |
| bufferIndex_ = 0; |
| } |
| |
| boolean empty() { |
| return bufferIndex_ <= 0; |
| } |
| |
| void put(long ce, int ixLow, int ixHigh) |
| { |
| if (bufferIndex_ >= buffer_.length) { |
| PCEI[] newBuffer = new PCEI[buffer_.length + BUFFER_GROW]; |
| System.arraycopy(buffer_, 0, newBuffer, 0, buffer_.length); |
| buffer_ = newBuffer; |
| } |
| buffer_[bufferIndex_] = new PCEI(); |
| buffer_[bufferIndex_].ce_ = ce; |
| buffer_[bufferIndex_].low_ = ixLow; |
| buffer_[bufferIndex_].high_ = ixHigh; |
| |
| bufferIndex_ += 1; |
| } |
| |
| PCEI get() { |
| if (bufferIndex_ > 0) { |
| return buffer_[--bufferIndex_]; |
| } |
| return null; |
| } |
| } |
| |
| /** Raw collation element buffer stuff ported from ICU4C ucoleitr.cpp */ |
| private static final class RCEI { |
| int ce_; |
| int low_; |
| int high_; |
| } |
| |
| private static final class RCEBuffer { |
| private RCEI[] buffer_ = new RCEI[DEFAULT_BUFFER_SIZE]; |
| private int bufferIndex_ = 0; |
| |
| boolean empty() { |
| return bufferIndex_ <= 0; |
| } |
| |
| void put(int ce, int ixLow, int ixHigh) { |
| if (bufferIndex_ >= buffer_.length) { |
| RCEI[] newBuffer = new RCEI[buffer_.length + BUFFER_GROW]; |
| System.arraycopy(buffer_, 0, newBuffer, 0, buffer_.length); |
| buffer_ = newBuffer; |
| } |
| buffer_[bufferIndex_] = new RCEI(); |
| buffer_[bufferIndex_].ce_ = ce; |
| buffer_[bufferIndex_].low_ = ixLow; |
| buffer_[bufferIndex_].high_ = ixHigh; |
| |
| bufferIndex_ += 1; |
| } |
| |
| RCEI get() { |
| if (bufferIndex_ > 0) { |
| return buffer_[--bufferIndex_]; |
| } |
| return null; |
| } |
| } |
| } |
| |
| /** |
| * Java port of ICU4C CEI (usearch.cpp) |
| * |
| * CEI Collation Element + source text index. |
| * These structs are kept in the circular buffer. |
| */ |
| private static class CEI { |
| long ce_; |
| int lowIndex_; |
| int highIndex_; |
| } |
| |
| /** |
| * CEBuffer A circular buffer of CEs from the text being searched |
| */ |
| private static class CEBuffer { |
| // Java porting note: ICU4C uses the size for stack buffer |
| // static final int DEFAULT_CEBUFFER_SIZE = 96; |
| |
| static final int CEBUFFER_EXTRA = 32; |
| static final int MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L = 8; |
| static final int MAX_TARGET_IGNORABLES_PER_PAT_OTHER = 3; |
| |
| CEI[] buf_; |
| int bufSize_; |
| int firstIx_; |
| int limitIx_; |
| |
| // Java porting note: No references in ICU4C implementation |
| // CollationElementIterator ceIter_; |
| |
| StringSearch strSearch_; |
| |
| CEBuffer(StringSearch ss) { |
| strSearch_ = ss; |
| bufSize_ = ss.pattern_.PCELength_ + CEBUFFER_EXTRA; |
| if (ss.search_.elementComparisonType_ != ElementComparisonType.STANDARD_ELEMENT_COMPARISON) { |
| String patText = ss.pattern_.text_; |
| if (patText != null) { |
| for (int i = 0; i < patText.length(); i++) { |
| char c = patText.charAt(i); |
| if (MIGHT_BE_JAMO_L(c)) { |
| bufSize_ += MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L; |
| } else { |
| // No check for surrogates, we might allocate slightly more buffer than necessary. |
| bufSize_ += MAX_TARGET_IGNORABLES_PER_PAT_OTHER; |
| } |
| } |
| } |
| } |
| |
| // Not used - see above |
| // ceIter_ = ss.textIter_; |
| |
| firstIx_ = 0; |
| limitIx_ = 0; |
| |
| if (!ss.initTextProcessedIter()) { |
| return; |
| } |
| |
| buf_ = new CEI[bufSize_]; |
| } |
| |
| // Get the CE with the specified index. |
| // Index must be in the range |
| // n-history_size < index < n+1 |
| // where n is the largest index to have been fetched by some previous call to this function. |
| // The CE value will be UCOL__PROCESSED_NULLORDER at end of input. |
| // |
| CEI get(int index) { |
| int i = index % bufSize_; |
| |
| if (index >= firstIx_ && index < limitIx_) { |
| // The request was for an entry already in our buffer. |
| // Just return it. |
| return buf_[i]; |
| } |
| |
| // Caller is requesting a new, never accessed before, CE. |
| // Verify that it is the next one in sequence, which is all |
| // that is allowed. |
| if (index != limitIx_) { |
| assert(false); |
| return null; |
| } |
| |
| // Manage the circular CE buffer indexing |
| limitIx_++; |
| |
| if (limitIx_ - firstIx_ >= bufSize_) { |
| // The buffer is full, knock out the lowest-indexed entry. |
| firstIx_++; |
| } |
| |
| CollationPCE.Range range = new CollationPCE.Range(); |
| if (buf_[i] == null) { |
| buf_[i] = new CEI(); |
| } |
| buf_[i].ce_ = strSearch_.textProcessedIter_.nextProcessed(range); |
| buf_[i].lowIndex_ = range.ixLow_; |
| buf_[i].highIndex_ = range.ixHigh_; |
| |
| return buf_[i]; |
| } |
| |
| // Get the CE with the specified index. |
| // Index must be in the range |
| // n-history_size < index < n+1 |
| // where n is the largest index to have been fetched by some previous call to this function. |
| // The CE value will be UCOL__PROCESSED_NULLORDER at end of input. |
| // |
| CEI getPrevious(int index) { |
| int i = index % bufSize_; |
| |
| if (index >= firstIx_ && index < limitIx_) { |
| // The request was for an entry already in our buffer. |
| // Just return it. |
| return buf_[i]; |
| } |
| |
| // Caller is requesting a new, never accessed before, CE. |
| // Verify that it is the next one in sequence, which is all |
| // that is allowed. |
| if (index != limitIx_) { |
| assert(false); |
| return null; |
| } |
| |
| // Manage the circular CE buffer indexing |
| limitIx_++; |
| |
| if (limitIx_ - firstIx_ >= bufSize_) { |
| // The buffer is full, knock out the lowest-indexed entry. |
| firstIx_++; |
| } |
| |
| CollationPCE.Range range = new CollationPCE.Range(); |
| if (buf_[i] == null) { |
| buf_[i] = new CEI(); |
| } |
| buf_[i].ce_ = strSearch_.textProcessedIter_.previousProcessed(range); |
| buf_[i].lowIndex_ = range.ixLow_; |
| buf_[i].highIndex_ = range.ixHigh_; |
| |
| return buf_[i]; |
| } |
| |
| static boolean MIGHT_BE_JAMO_L(char c) { |
| return (c >= 0x1100 && c <= 0x115E) |
| || (c >= 0x3131 && c <= 0x314E) |
| || (c >= 0x3165 && c <= 0x3186); |
| } |
| } |
| } |