| /* |
| * Copyright (C) 2013 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package android.text; |
| |
| import static android.text.TextDirectionHeuristics.FIRSTSTRONG_LTR; |
| |
| import android.annotation.Nullable; |
| import android.view.View; |
| |
| import com.android.internal.annotations.VisibleForTesting; |
| |
| import java.util.Locale; |
| |
| /** |
| * Utility class for formatting text for display in a potentially opposite-directionality context |
| * without garbling. The directionality of the context is set at formatter creation and the |
| * directionality of the text can be either estimated or passed in when known. |
| * |
| * <p>To support versions lower than {@link android.os.Build.VERSION_CODES#JELLY_BEAN_MR2}, |
| * you can use the support library's {@link androidx.core.text.BidiFormatter} class. |
| * |
| * <p>These APIs provide the following functionality: |
| * <p> |
| * 1. Bidi Wrapping |
| * When text in one language is mixed into a document in another, opposite-directionality language, |
| * e.g. when an English business name is embedded in some Hebrew text, both the inserted string |
| * and the text surrounding it may be displayed incorrectly unless the inserted string is explicitly |
| * separated from the surrounding text in a "wrapper" that: |
| * <p> |
| * - Declares its directionality so that the string is displayed correctly. This can be done in |
| * Unicode bidi formatting codes by {@link #unicodeWrap} and similar methods. |
| * <p> |
| * - Isolates the string's directionality, so it does not unduly affect the surrounding content. |
| * Currently, this can only be done using invisible Unicode characters of the same direction as |
| * the context (LRM or RLM) in addition to the directionality declaration above, thus "resetting" |
| * the directionality to that of the context. The "reset" may need to be done at both ends of the |
| * string. Without "reset" after the string, the string will "stick" to a number or logically |
| * separate opposite-direction text that happens to follow it in-line (even if separated by |
| * neutral content like spaces and punctuation). Without "reset" before the string, the same can |
| * happen there, but only with more opposite-direction text, not a number. One approach is to |
| * "reset" the direction only after each string, on the theory that if the preceding opposite- |
| * direction text is itself bidi-wrapped, the "reset" after it will prevent the sticking. (Doing |
| * the "reset" only before each string definitely does not work because we do not want to require |
| * bidi-wrapping numbers, and a bidi-wrapped opposite-direction string could be followed by a |
| * number.) Still, the safest policy is to do the "reset" on both ends of each string, since RTL |
| * message translations often contain untranslated Latin-script brand names and technical terms, |
| * and one of these can be followed by a bidi-wrapped inserted value. On the other hand, when one |
| * has such a message, it is best to do the "reset" manually in the message translation itself, |
| * since the message's opposite-direction text could be followed by an inserted number, which we |
| * would not bidi-wrap anyway. Note, however, that this class "resets" on both ends by default; |
| * "reset" only after the string is selected with {@link Builder#stereoReset(boolean)}. In an |
| * alternative to "reset", recent additions to the HTML, CSS, and Unicode standards allow the |
| * isolation to be part of the directionality declaration. This form of isolation is better than |
| * "reset" because it takes less space, does not require knowing the context directionality, has a |
| * gentler effect than "reset", and protects both ends of the string. However, we do not yet allow |
| * using it because required platforms do not yet support it. |
| * <p> |
| * Providing these wrapping services is the basic purpose of the bidi formatter. |
| * <p> |
| * 2. Directionality estimation |
| * How does one know whether a string about to be inserted into surrounding text has the same |
| * directionality? Well, in many cases, one knows that this must be the case when writing the code |
| * doing the insertion, e.g. when a localized message is inserted into a localized page. In such |
| * cases there is no need to involve the bidi formatter at all. In some other cases, it need not be |
| * the same as the context, but is either constant (e.g. urls are always LTR) or otherwise known. |
| * In the remaining cases, e.g. when the string is user-entered or comes from a database, the |
| * language of the string (and thus its directionality) is not known a priori, and must be |
| * estimated at run-time. The bidi formatter can do this automatically using the default |
| * first-strong estimation algorithm. It can also be configured to use a custom directionality |
| * estimation object. |
| */ |
| public final class BidiFormatter { |
| |
| /** |
| * The default text direction heuristic. It is {@code final} because {@link Builder#build()} |
| * compares against it by identity and the shared default instances are created from it, so it |
| * must never be reassigned. |
| */ |
| private static final TextDirectionHeuristic DEFAULT_TEXT_DIRECTION_HEURISTIC = FIRSTSTRONG_LTR; |
| |
| /** |
| * Unicode "Left-To-Right Embedding" (LRE) character. |
| */ |
| private static final char LRE = '\u202A'; |
| |
| /** |
| * Unicode "Right-To-Left Embedding" (RLE) character. |
| */ |
| private static final char RLE = '\u202B'; |
| |
| /** |
| * Unicode "Pop Directional Formatting" (PDF) character. |
| */ |
| private static final char PDF = '\u202C'; |
| |
| /** |
| * Unicode "Left-To-Right Mark" (LRM) character. |
| */ |
| private static final char LRM = '\u200E'; |
| |
| /** |
| * Unicode "Right-To-Left Mark" (RLM) character. |
| */ |
| private static final char RLM = '\u200F'; |
| |
| /** |
| * String representation of LRM, returned by the mark methods for an LTR context. |
| */ |
| private static final String LRM_STRING = Character.toString(LRM); |
| |
| /** |
| * String representation of RLM, returned by the mark methods for an RTL context. |
| */ |
| private static final String RLM_STRING = Character.toString(RLM); |
| |
| /** |
| * Empty string constant, returned by the mark methods when no mark is needed. |
| */ |
| private static final String EMPTY_STRING = ""; |
| |
| /** |
| * Builder for {@link BidiFormatter} instances with non-default options. |
| */ |
| public static final class Builder { |
| private boolean mIsRtlContext; |
| private int mFlags; |
| private TextDirectionHeuristic mTextDirectionHeuristic; |
| |
| /** |
| * Creates a builder whose context directionality is taken from the default locale. |
| */ |
| public Builder() { |
| this(Locale.getDefault()); |
| } |
| |
| /** |
| * Creates a builder for an explicitly given context directionality. |
| * |
| * @param rtlContext Whether the context directionality is RTL. |
| */ |
| public Builder(boolean rtlContext) { |
| initialize(rtlContext); |
| } |
| |
| /** |
| * Creates a builder whose context directionality is taken from the given locale. |
| * |
| * @param locale The context locale. |
| */ |
| public Builder(Locale locale) { |
| this(isRtlLocale(locale)); |
| } |
| |
| /** |
| * Stores the context directionality and resets every option to its default value. |
| * |
| * @param isRtlContext Whether the context is RTL or not. |
| */ |
| private void initialize(boolean isRtlContext) { |
| mFlags = DEFAULT_FLAGS; |
| mTextDirectionHeuristic = DEFAULT_TEXT_DIRECTION_HEURISTIC; |
| mIsRtlContext = isRtlContext; |
| } |
| |
| /** |
| * Specifies whether the BidiFormatter to be built should also "reset" directionality before |
| * a string being bidi-wrapped, not just after it. The default is true. |
| * |
| * @param stereoReset {@code true} to set the stereo-reset flag, {@code false} to clear it. |
| * @return the builder itself. |
| */ |
| public Builder stereoReset(boolean stereoReset) { |
| mFlags = stereoReset ? (mFlags | FLAG_STEREO_RESET) : (mFlags & ~FLAG_STEREO_RESET); |
| return this; |
| } |
| |
| /** |
| * Specifies the default directionality estimation algorithm to be used by the BidiFormatter. |
| * By default, uses the first-strong heuristic. |
| * |
| * @param heuristic the {@code TextDirectionHeuristic} to use. |
| * @return the builder itself. |
| */ |
| public Builder setTextDirectionHeuristic(TextDirectionHeuristic heuristic) { |
| this.mTextDirectionHeuristic = heuristic; |
| return this; |
| } |
| |
| /** |
| * Builds the formatter. When neither the flags nor the heuristic differ from the defaults, |
| * the shared default instance for the context directionality is returned instead of a new |
| * object. |
| * |
| * @return A BidiFormatter with the specified options. |
| */ |
| public BidiFormatter build() { |
| final boolean hasCustomFlags = mFlags != DEFAULT_FLAGS; |
| final boolean hasCustomHeuristic = |
| mTextDirectionHeuristic != DEFAULT_TEXT_DIRECTION_HEURISTIC; |
| if (hasCustomFlags || hasCustomHeuristic) { |
| return new BidiFormatter(mIsRtlContext, mFlags, mTextDirectionHeuristic); |
| } |
| // Nothing was customized, so reuse the shared default instance. |
| return BidiFormatter.getDefaultInstanceFromContext(mIsRtlContext); |
| } |
| } |
| |
| /** Option flag: also "reset" the directionality before a bidi-wrapped string. */ |
| private static final int FLAG_STEREO_RESET = 1 << 1; |
| /** The option flags used when none are changed through {@link Builder}. */ |
| private static final int DEFAULT_FLAGS = FLAG_STEREO_RESET; |
| |
| /** Shared formatter for an LTR context with default flags and heuristic. */ |
| private static final BidiFormatter DEFAULT_LTR_INSTANCE = |
| new BidiFormatter(false /* LTR context */, DEFAULT_FLAGS, DEFAULT_TEXT_DIRECTION_HEURISTIC); |
| |
| /** Shared formatter for an RTL context with default flags and heuristic. */ |
| private static final BidiFormatter DEFAULT_RTL_INSTANCE = |
| new BidiFormatter(true /* RTL context */, DEFAULT_FLAGS, DEFAULT_TEXT_DIRECTION_HEURISTIC); |
| |
| /** Whether the context directionality is RTL. */ |
| private final boolean mIsRtlContext; |
| /** The option flags of this formatter. */ |
| private final int mFlags; |
| /** The heuristic used when a caller does not pass one explicitly. */ |
| private final TextDirectionHeuristic mDefaultTextDirectionHeuristic; |
| |
| /** |
| * Returns the BidiFormatter matching the directionality of the default locale. |
| * |
| * This does not create any new objects, and returns already existing static instances. |
| * |
| * @return the shared default instance for the default locale's directionality. |
| */ |
| public static BidiFormatter getInstance() { |
| final boolean defaultLocaleIsRtl = isRtlLocale(Locale.getDefault()); |
| return getDefaultInstanceFromContext(defaultLocaleIsRtl); |
| } |
| |
| /** |
| * Returns the BidiFormatter matching the given context directionality. |
| * |
| * This does not create any new objects, and returns already existing static instances. |
| * |
| * @param rtlContext Whether the context directionality is RTL. |
| * @return the shared default RTL instance if {@code rtlContext}, else the shared LTR one. |
| */ |
| public static BidiFormatter getInstance(boolean rtlContext) { |
| return rtlContext ? DEFAULT_RTL_INSTANCE : DEFAULT_LTR_INSTANCE; |
| } |
| |
| /** |
| * Returns the BidiFormatter matching the directionality of the given context locale. |
| * |
| * This does not create any new objects, and returns already existing static instances. |
| * |
| * @param locale The context locale. |
| * @return the shared default instance for the locale's directionality. |
| */ |
| public static BidiFormatter getInstance(Locale locale) { |
| final boolean localeIsRtl = isRtlLocale(locale); |
| return getDefaultInstanceFromContext(localeIsRtl); |
| } |
| |
| /** |
| * Creates a formatter; callers obtain instances through {@code getInstance} or {@link Builder}. |
| * |
| * @param isRtlContext Whether the context directionality is RTL or not. |
| * @param flags The option flags. |
| * @param heuristic The default text direction heuristic. |
| */ |
| private BidiFormatter(boolean isRtlContext, int flags, TextDirectionHeuristic heuristic) { |
| this.mDefaultTextDirectionHeuristic = heuristic; |
| this.mFlags = flags; |
| this.mIsRtlContext = isRtlContext; |
| } |
| |
| /** |
| * Reports the context directionality this formatter was created with. |
| * |
| * @return {@code true} if the context directionality is RTL, {@code false} if it is LTR. |
| */ |
| public boolean isRtlContext() { |
| return this.mIsRtlContext; |
| } |
| |
| /** |
| * Reports whether the stereo-reset option flag is set. |
| * |
| * @return Whether directionality "reset" should also be done before a string being |
| * bidi-wrapped, not just after it. |
| */ |
| public boolean getStereoReset() { |
| final int stereoBit = mFlags & FLAG_STEREO_RESET; |
| return stereoBit != 0; |
| } |
| |
| /** |
| * Returns a Unicode bidi mark matching the context directionality (LRM or RLM) if either the |
| * overall or the exit directionality of {@code str} is opposite to the context directionality; |
| * otherwise returns the empty string. Placed after the string (including its directionality |
| * declaration wrapping), the mark prevents it from "sticking" to other opposite-directionality |
| * text or a number appearing after it inline with only neutral content in between. The overall |
| * directionality comes from {@code heuristic}; the exit directionality is determined by |
| * scanning the end of the string. |
| * |
| * @param str CharSequence after which the mark may need to appear. |
| * @param heuristic The text direction heuristic that will be used to estimate the {@code str}'s |
| * directionality. |
| * @return LRM for RTL text in LTR context; RLM for LTR text in RTL context; |
| * else, the empty string. |
| * |
| * @hide |
| */ |
| public String markAfter(CharSequence str, TextDirectionHeuristic heuristic) { |
| final boolean textIsRtl = heuristic.isRtl(str, 0, str.length()); |
| if (mIsRtlContext) { |
| // RTL context: the exit direction is only scanned when the overall direction is RTL. |
| return (!textIsRtl || getExitDir(str) == DIR_LTR) ? RLM_STRING : EMPTY_STRING; |
| } |
| // LTR context: the exit direction is only scanned when the overall direction is LTR. |
| return (textIsRtl || getExitDir(str) == DIR_RTL) ? LRM_STRING : EMPTY_STRING; |
| } |
| |
| /** |
| * Returns a Unicode bidi mark matching the context directionality (LRM or RLM) if either the |
| * overall or the entry directionality of {@code str} is opposite to the context directionality; |
| * otherwise returns the empty string. Placed before the string (including its directionality |
| * declaration wrapping), the mark prevents it from "sticking" to other opposite-directionality |
| * text appearing before it inline with only neutral content in between. The overall |
| * directionality comes from {@code heuristic}; the entry directionality is determined by |
| * scanning the beginning of the string. |
| * |
| * @param str CharSequence before which the mark may need to appear. |
| * @param heuristic The text direction heuristic that will be used to estimate the {@code str}'s |
| * directionality. |
| * @return LRM for RTL text in LTR context; RLM for LTR text in RTL context; |
| * else, the empty string. |
| * |
| * @hide |
| */ |
| public String markBefore(CharSequence str, TextDirectionHeuristic heuristic) { |
| final boolean textIsRtl = heuristic.isRtl(str, 0, str.length()); |
| if (mIsRtlContext) { |
| // RTL context: the entry direction is only scanned when the overall direction is RTL. |
| return (!textIsRtl || getEntryDir(str) == DIR_LTR) ? RLM_STRING : EMPTY_STRING; |
| } |
| // LTR context: the entry direction is only scanned when the overall direction is LTR. |
| return (textIsRtl || getEntryDir(str) == DIR_RTL) ? LRM_STRING : EMPTY_STRING; |
| } |
| |
| /** |
| * Estimates the directionality of a string using the default text direction heuristic. |
| * Delegates to {@link #isRtl(CharSequence)}. |
| * |
| * @param str String whose directionality is to be estimated. |
| * @return true if {@code str}'s estimated overall directionality is RTL. Otherwise returns |
| * false. |
| */ |
| public boolean isRtl(String str) { |
| final CharSequence text = str; |
| return isRtl(text); |
| } |
| |
| /** |
| * Operates like {@link #isRtl(String)}, but takes a CharSequence instead of a string. The |
| * formatter's default heuristic is applied to the whole sequence. |
| * |
| * @param str CharSequence whose directionality is to be estimated. |
| * @return true if {@code str}'s estimated overall directionality is RTL. Otherwise returns |
| * false. |
| */ |
| public boolean isRtl(CharSequence str) { |
| final int end = str.length(); |
| return mDefaultTextDirectionHeuristic.isRtl(str, 0, end); |
| } |
| |
| /** |
| * Formats a string of given directionality for use in plain-text output of the context |
| * directionality, so an opposite-directionality string is neither garbled nor garbles its |
| * surroundings. This makes use of Unicode bidi formatting characters. |
| * <p> |
| * The algorithm: In case the given directionality doesn't match the context directionality, wraps |
| * the string with Unicode bidi formatting characters: RLE+{@code str}+PDF for RTL text, or |
| * LRE+{@code str}+PDF for LTR text. |
| * <p> |
| * If {@code isolate}, directionally isolates the string so that it does not garble its |
| * surroundings. Currently, this is done by "resetting" the directionality after the string by |
| * appending a trailing Unicode bidi mark matching the context directionality (LRM or RLM) when |
| * either the overall directionality or the exit directionality of the string is opposite to |
| * that of the context. Unless the formatter was built using |
| * {@link Builder#stereoReset(boolean)} with a {@code false} argument, also prepends a Unicode |
| * bidi mark matching the context directionality when either the overall directionality or the |
| * entry directionality of the string is opposite to that of the context. Note that as opposed |
| * to the overall directionality, the entry and exit directionalities are determined from the |
| * string itself. |
| * <p> |
| * Does *not* do HTML-escaping. |
| * |
| * @param str The input string. |
| * @param heuristic The algorithm to be used to estimate the string's overall direction. |
| * See {@link TextDirectionHeuristics} for pre-defined heuristics. |
| * @param isolate Whether to directionally isolate the string to prevent it from garbling the |
| * content around it |
| * @return Input string after applying the above processing. {@code null} if {@code str} is |
| * {@code null}. |
| */ |
| public @Nullable String unicodeWrap(@Nullable String str, TextDirectionHeuristic heuristic, |
| boolean isolate) { |
| if (str != null) { |
| // The CharSequence overload does the actual wrapping. |
| final CharSequence wrapped = unicodeWrap((CharSequence) str, heuristic, isolate); |
| return wrapped.toString(); |
| } |
| return null; |
| } |
| |
| /** |
| * Operates like {@link #unicodeWrap(String, TextDirectionHeuristic, boolean)}, but takes a |
| * CharSequence instead of a string |
| * |
| * @param str The input CharSequence. |
| * @param heuristic The algorithm to be used to estimate the CharSequence's overall direction. |
| * See {@link TextDirectionHeuristics} for pre-defined heuristics. |
| * @param isolate Whether to directionally isolate the CharSequence to prevent it from garbling |
| * the content around it |
| * @return Input CharSequence after applying the above processing. {@code null} if {@code str} |
| * is {@code null}. |
| */ |
| public @Nullable CharSequence unicodeWrap(@Nullable CharSequence str, |
| TextDirectionHeuristic heuristic, boolean isolate) { |
| if (str == null) return null; |
| final boolean isRtl = heuristic.isRtl(str, 0, str.length()); |
| SpannableStringBuilder result = new SpannableStringBuilder(); |
| if (getStereoReset() && isolate) { |
| result.append(markBefore(str, |
| isRtl ? TextDirectionHeuristics.RTL : TextDirectionHeuristics.LTR)); |
| } |
| if (isRtl != mIsRtlContext) { |
| result.append(isRtl ? RLE : LRE); |
| result.append(str); |
| result.append(PDF); |
| } else { |
| result.append(str); |
| } |
| if (isolate) { |
| result.append(markAfter(str, |
| isRtl ? TextDirectionHeuristics.RTL : TextDirectionHeuristics.LTR)); |
| } |
| return result; |
| } |
| |
| /** |
| * Operates like {@link #unicodeWrap(String, TextDirectionHeuristic, boolean)}, but assumes |
| * {@code isolate} is true. |
| * |
| * @param str The input string. |
| * @param heuristic The algorithm to be used to estimate the string's overall direction. |
| * See {@link TextDirectionHeuristics} for pre-defined heuristics. |
| * @return Input string after applying the above processing. {@code null} if {@code str} is |
| * {@code null}. |
| */ |
| public String unicodeWrap(String str, TextDirectionHeuristic heuristic) { |
| final boolean isolate = true; |
| return unicodeWrap(str, heuristic, isolate); |
| } |
| |
| /** |
| * Operates like {@link #unicodeWrap(CharSequence, TextDirectionHeuristic, boolean)}, but |
| * assumes {@code isolate} is true. |
| * |
| * @param str The input CharSequence. |
| * @param heuristic The algorithm to be used to estimate the CharSequence's overall direction. |
| * See {@link TextDirectionHeuristics} for pre-defined heuristics. |
| * @return Input CharSequence after applying the above processing. {@code null} if {@code str} |
| * is {@code null}. |
| */ |
| public CharSequence unicodeWrap(CharSequence str, TextDirectionHeuristic heuristic) { |
| final boolean isolate = true; |
| return unicodeWrap(str, heuristic, isolate); |
| } |
| |
| |
| /** |
| * Operates like {@link #unicodeWrap(String, TextDirectionHeuristic, boolean)}, but uses the |
| * formatter's default direction estimation algorithm. |
| * |
| * @param str The input string. |
| * @param isolate Whether to directionally isolate the string to prevent it from garbling the |
| * content around it |
| * @return Input string after applying the above processing. {@code null} if {@code str} is |
| * {@code null}. |
| */ |
| public String unicodeWrap(String str, boolean isolate) { |
| final TextDirectionHeuristic heuristic = mDefaultTextDirectionHeuristic; |
| return unicodeWrap(str, heuristic, isolate); |
| } |
| |
| /** |
| * Operates like {@link #unicodeWrap(CharSequence, TextDirectionHeuristic, boolean)}, but uses |
| * the formatter's default direction estimation algorithm. |
| * |
| * @param str The input CharSequence. |
| * @param isolate Whether to directionally isolate the CharSequence to prevent it from garbling |
| * the content around it |
| * @return Input CharSequence after applying the above processing. {@code null} if {@code str} |
| * is {@code null}. |
| */ |
| public CharSequence unicodeWrap(CharSequence str, boolean isolate) { |
| final TextDirectionHeuristic heuristic = mDefaultTextDirectionHeuristic; |
| return unicodeWrap(str, heuristic, isolate); |
| } |
| |
| /** |
| * Operates like {@link #unicodeWrap(String, TextDirectionHeuristic, boolean)}, but uses the |
| * formatter's default direction estimation algorithm and assumes {@code isolate} is true. |
| * |
| * @param str The input string. |
| * @return Input string after applying the above processing. {@code null} if {@code str} is |
| * {@code null}. |
| */ |
| public String unicodeWrap(String str) { |
| // The (String, boolean) overload supplies the default heuristic. |
| return unicodeWrap(str, true /* isolate */); |
| } |
| |
| /** |
| * Operates like {@link #unicodeWrap(CharSequence, TextDirectionHeuristic, boolean)}, but uses |
| * the formatter's default direction estimation algorithm and assumes {@code isolate} is true. |
| * |
| * @param str The input CharSequence. |
| * @return Input CharSequence after applying the above processing. {@code null} if {@code str} |
| * is {@code null}. |
| */ |
| public CharSequence unicodeWrap(CharSequence str) { |
| // The (CharSequence, boolean) overload supplies the default heuristic. |
| return unicodeWrap(str, true /* isolate */); |
| } |
| |
| /** |
| * Selects the shared default instance for the given context directionality. |
| * |
| * @param isRtlContext Whether the context directionality is RTL. |
| * @return the shared RTL instance if {@code isRtlContext}, else the shared LTR instance. |
| */ |
| private static BidiFormatter getDefaultInstanceFromContext(boolean isRtlContext) { |
| if (isRtlContext) { |
| return DEFAULT_RTL_INSTANCE; |
| } |
| return DEFAULT_LTR_INSTANCE; |
| } |
| |
| /** |
| * Helper method to return true if the Locale directionality is RTL. |
| * |
| * @param locale The Locale whose directionality will be checked to be RTL or LTR |
| * @return true if the {@code locale} directionality is RTL. False otherwise. |
| */ |
| private static boolean isRtlLocale(Locale locale) { |
| return (TextUtils.getLayoutDirectionFromLocale(locale) == View.LAYOUT_DIRECTION_RTL); |
| } |
| |
| /** Directionality result: left-to-right. */ |
| private static final int DIR_LTR = -1; |
| /** Directionality result: no directionality could be determined. */ |
| private static final int DIR_UNKNOWN = 0; |
| /** Directionality result: right-to-left. */ |
| private static final int DIR_RTL = 1; |
| |
| /** |
| * Returns the directionality of the last character with strong directionality in the string, or |
| * DIR_UNKNOWN if none was encountered. For efficiency, actually scans backwards from the end of |
| * the string. Treats a non-BN character between an LRE/RLE/LRO/RLO and its matching PDF as a |
| * strong character, LTR after LRE/LRO, and RTL after RLE/RLO. The results are undefined for a |
| * string containing unbalanced LRE/RLE/LRO/RLO/PDF characters. The intended use is to check |
| * whether a logically separate item that starts with a number or a character of the string's |
| * exit directionality and follows this string inline (not counting any neutral characters in |
| * between) would "stick" to it in an opposite-directionality context, thus being displayed in |
| * an incorrect position. An LRM or RLM character (the one of the context's directionality) |
| * between the two will prevent such sticking. |
| * |
| * @param str the string to check. |
| */ |
| private static int getExitDir(CharSequence str) { |
| return new DirectionalityEstimator(str, false /* isHtml */).getExitDir(); |
| } |
| |
| /** |
| * Returns the directionality of the first character with strong directionality in the string, |
| * or DIR_UNKNOWN if none was encountered. Treats a non-BN character between an |
| * LRE/RLE/LRO/RLO and its matching PDF as a strong character, LTR after LRE/LRO, and RTL after |
| * RLE/RLO. The results are undefined for a string containing unbalanced LRE/RLE/LRO/RLO/PDF |
| * characters. The intended use is to check whether a logically separate item that ends with a |
| * character of the string's entry directionality and precedes the string inline (not counting |
| * any neutral characters in between) would "stick" to it in an opposite-directionality context, |
| * thus being displayed in an incorrect position. An LRM or RLM character (the one of the |
| * context's directionality) between the two will prevent such sticking. |
| * |
| * @param str the string to check. |
| */ |
| private static int getEntryDir(CharSequence str) { |
| return new DirectionalityEstimator(str, false /* isHtml */).getEntryDir(); |
| } |
| |
| /** |
| * An object that estimates the directionality of a given string by various methods. |
| * |
| * @hide |
| */ |
| @VisibleForTesting |
| public static class DirectionalityEstimator { |
| |
| // Internal static variables and constants. |
| |
| /** |
| * Number of entries in the bidi character class cache: the Character.getDirectionality() |
| * results for the lowest DIR_TYPE_CACHE_SIZE codepoints are kept in an array for speed. |
| * The 0x700 value is designed to leave all the European and Near Eastern languages in the |
| * cache. It can be reduced to 0x180, restricting the cache to the Western European |
| * languages. |
| */ |
| private static final int DIR_TYPE_CACHE_SIZE = 0x700; |
| |
| /** |
| * The bidi character class cache, indexed by codepoint. |
| */ |
| private static final byte[] DIR_TYPE_CACHE; |
| |
| static { |
| final byte[] cache = new byte[DIR_TYPE_CACHE_SIZE]; |
| for (int codePoint = 0; codePoint < cache.length; codePoint++) { |
| // Calling Character.getDirectionality() is OK here, since new emojis start after |
| // the end of our cache. |
| cache[codePoint] = Character.getDirectionality(codePoint); |
| } |
| DIR_TYPE_CACHE = cache; |
| } |
| |
| /** |
| * Return Character directionality. This is the hook where values for newest emoji that are |
| * not covered by ICU can be overridden; as written it applies no override and returns exactly |
| * what {@link Character#getDirectionality(int)} returns. |
| * |
| * @param codePoint the codepoint to classify. |
| * @return the {@code Character.DIRECTIONALITY_...} value of {@code codePoint}. |
| */ |
| public static byte getDirectionality(int codePoint) { |
| return Character.getDirectionality(codePoint); |
| } |
| |
| // Internal instance variables. |
| |
| /** |
| * The text to be scanned, as passed to the constructor. |
| */ |
| private final CharSequence text; |
| |
| /** |
| * Whether the text to be scanned is to be treated as HTML, i.e. skipping over tags and |
| * entities when looking for the next / preceding dir type. |
| */ |
| private final boolean isHtml; |
| |
| /** |
| * The length of the text in chars, captured from {@code text.length()} at construction. |
| */ |
| private final int length; |
| |
| /** |
| * The current position in the text. Reset to 0 at the start of getEntryDir() and to |
| * {@code length} at the start of getExitDir(). |
| */ |
| private int charIndex; |
| |
| /** |
| * The char encountered by the last dirTypeForward or dirTypeBackward call. If it |
| * encountered a supplementary codepoint, this contains a char that is not a valid |
| * codepoint. This is ok, because this member is only used to detect some well-known ASCII |
| * syntax, e.g. "http://" and the beginning of an HTML tag or entity. |
| */ |
| private char lastChar; |
| |
| /** |
| * Creates an estimator for the given text and records the text's length. |
| * |
| * @param text The string to scan. |
| * @param isHtml Whether the text to be scanned is to be treated as HTML, i.e. skipping over |
| * tags and entities. |
| */ |
| DirectionalityEstimator(CharSequence text, boolean isHtml) { |
| this.length = text.length(); |
| this.isHtml = isHtml; |
| this.text = text; |
| } |
| |
| /** |
| * Returns the directionality of the first character with strong directionality in the |
| * string, or DIR_UNKNOWN if none was encountered. Treats a non-BN character between an |
| * LRE/RLE/LRO/RLO and its matching PDF as a strong character, LTR after LRE/LRO, and RTL |
| * after RLE/RLO. The results are undefined for a string containing unbalanced |
| * LRE/RLE/LRO/RLO/PDF characters. |
| * <p> |
| * Scans forward from the start of the text. If the first non-empty embedding is found at a |
| * point where its directionality is no longer remembered (a PDF was seen since the last |
| * opening character), scans backward from there to the opening character of that embedding |
| * level. |
| * |
| * @return DIR_LTR, DIR_RTL or DIR_UNKNOWN. |
| */ |
| int getEntryDir() { |
| // The reason for this method name, as opposed to getFirstStrongDir(), is that |
| // "first strong" is a commonly used description of Unicode's estimation algorithm, |
| // but the two must treat formatting characters quite differently. Thus, we are staying |
| // away from both "first" and "last" in these method names to avoid confusion. |
| charIndex = 0; |
| int embeddingLevel = 0; |
| int embeddingLevelDir = DIR_UNKNOWN; |
| int firstNonEmptyEmbeddingLevel = 0; |
| while (charIndex < length && firstNonEmptyEmbeddingLevel == 0) { |
| switch (dirTypeForward()) { |
| case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING: |
| case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE: |
| ++embeddingLevel; |
| embeddingLevelDir = DIR_LTR; |
| break; |
| case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING: |
| case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE: |
| ++embeddingLevel; |
| embeddingLevelDir = DIR_RTL; |
| break; |
| case Character.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT: |
| --embeddingLevel; |
| // To restore embeddingLevelDir to its previous value, we would need a |
| // stack, which we want to avoid. Thus, at this point we do not know the |
| // current embedding's directionality. |
| embeddingLevelDir = DIR_UNKNOWN; |
| break; |
| case Character.DIRECTIONALITY_BOUNDARY_NEUTRAL: |
| break; |
| case Character.DIRECTIONALITY_LEFT_TO_RIGHT: |
| if (embeddingLevel == 0) { |
| return DIR_LTR; |
| } |
| firstNonEmptyEmbeddingLevel = embeddingLevel; |
| break; |
| case Character.DIRECTIONALITY_RIGHT_TO_LEFT: |
| case Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC: |
| if (embeddingLevel == 0) { |
| return DIR_RTL; |
| } |
| firstNonEmptyEmbeddingLevel = embeddingLevel; |
| break; |
| default: |
| firstNonEmptyEmbeddingLevel = embeddingLevel; |
| break; |
| } |
| } |
| |
| // We have either found a non-empty embedding or scanned the entire string finding |
| // neither a non-empty embedding nor a strong character outside of an embedding. |
| if (firstNonEmptyEmbeddingLevel == 0) { |
| // We have not found a non-empty embedding. Thus, the string contains neither a |
| // non-empty embedding nor a strong character outside of an embedding. |
| return DIR_UNKNOWN; |
| } |
| |
| // We have found a non-empty embedding. |
| if (embeddingLevelDir != DIR_UNKNOWN) { |
| // We know the directionality of the non-empty embedding. |
| return embeddingLevelDir; |
| } |
| |
| // We do not remember the directionality of the non-empty embedding we found. So, we go |
| // backwards to find the start of the non-empty embedding and get its directionality. |
| while (charIndex > 0) { |
| switch (dirTypeBackward()) { |
| case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING: |
| case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE: |
| if (firstNonEmptyEmbeddingLevel == embeddingLevel) { |
| return DIR_LTR; |
| } |
| --embeddingLevel; |
| break; |
| case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING: |
| case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE: |
| if (firstNonEmptyEmbeddingLevel == embeddingLevel) { |
| return DIR_RTL; |
| } |
| --embeddingLevel; |
| break; |
| case Character.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT: |
| ++embeddingLevel; |
| break; |
| } |
| } |
| // We should never get here. |
| return DIR_UNKNOWN; |
| } |
| |
| /** |
| * Returns the directionality of the last character with strong directionality in the |
| * string, or DIR_UNKNOWN if none was encountered. For efficiency, actually scans backwards |
| * from the end of the string. Treats a non-BN character between an LRE/RLE/LRO/RLO and its |
| * matching PDF as a strong character, LTR after LRE/LRO, and RTL after RLE/RLO. The results |
| * are undefined for a string containing unbalanced LRE/RLE/LRO/RLO/PDF characters. |
| * <p> |
| * Because the scan runs backwards, a PDF raises the embedding level and an opening |
| * LRE/RLE/LRO/RLO lowers it; the opening character of the last non-empty embedding level |
| * decides the result. |
| * |
| * @return DIR_LTR, DIR_RTL or DIR_UNKNOWN. |
| */ |
| int getExitDir() { |
| // The reason for this method name, as opposed to getLastStrongDir(), is that "last |
| // strong" sounds like the exact opposite of "first strong", which is a commonly used |
| // description of Unicode's estimation algorithm, but the two must treat formatting |
| // characters quite differently. Thus, we are staying away from both "first" and |
| // "last" in these method names to avoid confusion. |
| charIndex = length; |
| int embeddingLevel = 0; |
| int lastNonEmptyEmbeddingLevel = 0; |
| while (charIndex > 0) { |
| switch (dirTypeBackward()) { |
| case Character.DIRECTIONALITY_LEFT_TO_RIGHT: |
| if (embeddingLevel == 0) { |
| return DIR_LTR; |
| } |
| if (lastNonEmptyEmbeddingLevel == 0) { |
| lastNonEmptyEmbeddingLevel = embeddingLevel; |
| } |
| break; |
| case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING: |
| case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE: |
| if (lastNonEmptyEmbeddingLevel == embeddingLevel) { |
| return DIR_LTR; |
| } |
| --embeddingLevel; |
| break; |
| case Character.DIRECTIONALITY_RIGHT_TO_LEFT: |
| case Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC: |
| if (embeddingLevel == 0) { |
| return DIR_RTL; |
| } |
| if (lastNonEmptyEmbeddingLevel == 0) { |
| lastNonEmptyEmbeddingLevel = embeddingLevel; |
| } |
| break; |
| case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING: |
| case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE: |
| if (lastNonEmptyEmbeddingLevel == embeddingLevel) { |
| return DIR_RTL; |
| } |
| --embeddingLevel; |
| break; |
| case Character.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT: |
| ++embeddingLevel; |
| break; |
| case Character.DIRECTIONALITY_BOUNDARY_NEUTRAL: |
| break; |
| default: |
| if (lastNonEmptyEmbeddingLevel == 0) { |
| lastNonEmptyEmbeddingLevel = embeddingLevel; |
| } |
| break; |
| } |
| } |
| return DIR_UNKNOWN; |
| } |
| |
| // Internal methods |
| |
| /** |
| * Gets the bidi character class, i.e. Character.getDirectionality(), of a given char, using |
| * a cache for speed. Not designed for supplementary codepoints, whose results we do not |
| * cache. |
| */ |
| private static byte getCachedDirectionality(char c) { |
| return c < DIR_TYPE_CACHE_SIZE ? DIR_TYPE_CACHE[c] : getDirectionality(c); |
| } |
| |
| /** |
| * Returns the Character.DIRECTIONALITY_... value of the next codepoint and advances |
| * charIndex. If isHtml, and the codepoint is '<' or '&', advances through the tag/entity, |
| * and returns Character.DIRECTIONALITY_WHITESPACE. For an entity, it would be best to |
| * figure out the actual character, and return its dirtype, but treating it as whitespace is |
| * good enough for our purposes. |
| * |
| * @throws java.lang.IndexOutOfBoundsException if called when charIndex >= length or < 0. |
| */ |
| byte dirTypeForward() { |
| lastChar = text.charAt(charIndex); |
| if (Character.isHighSurrogate(lastChar)) { |
| int codePoint = Character.codePointAt(text, charIndex); |
| charIndex += Character.charCount(codePoint); |
| return getDirectionality(codePoint); |
| } |
| charIndex++; |
| byte dirType = getCachedDirectionality(lastChar); |
| if (isHtml) { |
| // Process tags and entities. |
| if (lastChar == '<') { |
| dirType = skipTagForward(); |
| } else if (lastChar == '&') { |
| dirType = skipEntityForward(); |
| } |
| } |
| return dirType; |
| } |
| |
| /** |
| * Returns the Character.DIRECTIONALITY_... value of the preceding codepoint and advances |
| * charIndex backwards. If isHtml, and the codepoint is the end of a complete HTML tag or |
| * entity, advances over the whole tag/entity and returns |
| * Character.DIRECTIONALITY_WHITESPACE. For an entity, it would be best to figure out the |
| * actual character, and return its dirtype, but treating it as whitespace is good enough |
| * for our purposes. |
| * |
| * @throws java.lang.IndexOutOfBoundsException if called when charIndex > length or <= 0. |
| */ |
| byte dirTypeBackward() { |
| lastChar = text.charAt(charIndex - 1); |
| if (Character.isLowSurrogate(lastChar)) { |
| int codePoint = Character.codePointBefore(text, charIndex); |
| charIndex -= Character.charCount(codePoint); |
| return getDirectionality(codePoint); |
| } |
| charIndex--; |
| byte dirType = getCachedDirectionality(lastChar); |
| if (isHtml) { |
| // Process tags and entities. |
| if (lastChar == '>') { |
| dirType = skipTagBackward(); |
| } else if (lastChar == ';') { |
| dirType = skipEntityBackward(); |
| } |
| } |
| return dirType; |
| } |
| |
| /** |
| * Advances charIndex forward through an HTML tag (after the opening < has already been |
| * read) and returns Character.DIRECTIONALITY_WHITESPACE. If there is no matching >, |
| * does not change charIndex and returns Character.DIRECTIONALITY_OTHER_NEUTRALS (for the |
| * < that hadn't been part of a tag after all). |
| */ |
| private byte skipTagForward() { |
| int initialCharIndex = charIndex; |
| while (charIndex < length) { |
| lastChar = text.charAt(charIndex++); |
| if (lastChar == '>') { |
| // The end of the tag. |
| return Character.DIRECTIONALITY_WHITESPACE; |
| } |
| if (lastChar == '"' || lastChar == '\'') { |
| // Skip over a quoted attribute value inside the tag. |
| char quote = lastChar; |
| while (charIndex < length && (lastChar = text.charAt(charIndex++)) != quote) {} |
| } |
| } |
| // The original '<' wasn't the start of a tag after all. |
| charIndex = initialCharIndex; |
| lastChar = '<'; |
| return Character.DIRECTIONALITY_OTHER_NEUTRALS; |
| } |
| |
| /** |
| * Advances charIndex backward through an HTML tag (after the closing > has already been |
| * read) and returns Character.DIRECTIONALITY_WHITESPACE. If there is no matching <, does |
| * not change charIndex and returns Character.DIRECTIONALITY_OTHER_NEUTRALS (for the > |
| * that hadn't been part of a tag after all). Nevertheless, the running time for calling |
| * skipTagBackward() in a loop remains linear in the size of the text, even for a text like |
| * ">>>>", because skipTagBackward() also stops looking for a matching < |
| * when it encounters another >. |
| */ |
| private byte skipTagBackward() { |
| int initialCharIndex = charIndex; |
| while (charIndex > 0) { |
| lastChar = text.charAt(--charIndex); |
| if (lastChar == '<') { |
| // The start of the tag. |
| return Character.DIRECTIONALITY_WHITESPACE; |
| } |
| if (lastChar == '>') { |
| break; |
| } |
| if (lastChar == '"' || lastChar == '\'') { |
| // Skip over a quoted attribute value inside the tag. |
| char quote = lastChar; |
| while (charIndex > 0 && (lastChar = text.charAt(--charIndex)) != quote) {} |
| } |
| } |
| // The original '>' wasn't the end of a tag after all. |
| charIndex = initialCharIndex; |
| lastChar = '>'; |
| return Character.DIRECTIONALITY_OTHER_NEUTRALS; |
| } |
| |
| /** |
| * Advances charIndex forward through an HTML character entity tag (after the opening |
| * & has already been read) and returns Character.DIRECTIONALITY_WHITESPACE. It would be |
| * best to figure out the actual character and return its dirtype, but this is good enough. |
| */ |
| private byte skipEntityForward() { |
| while (charIndex < length && (lastChar = text.charAt(charIndex++)) != ';') {} |
| return Character.DIRECTIONALITY_WHITESPACE; |
| } |
| |
| /** |
| * Advances charIndex backward through an HTML character entity tag (after the closing ; |
| * has already been read) and returns Character.DIRECTIONALITY_WHITESPACE. It would be best |
| * to figure out the actual character and return its dirtype, but this is good enough. |
| * If there is no matching &, does not change charIndex and returns |
| * Character.DIRECTIONALITY_OTHER_NEUTRALS (for the ';' that did not start an entity after |
| * all). Nevertheless, the running time for calling skipEntityBackward() in a loop remains |
| * linear in the size of the text, even for a text like ";;;;;;;", because skipTagBackward() |
| * also stops looking for a matching & when it encounters another ;. |
| */ |
| private byte skipEntityBackward() { |
| int initialCharIndex = charIndex; |
| while (charIndex > 0) { |
| lastChar = text.charAt(--charIndex); |
| if (lastChar == '&') { |
| return Character.DIRECTIONALITY_WHITESPACE; |
| } |
| if (lastChar == ';') { |
| break; |
| } |
| } |
| charIndex = initialCharIndex; |
| lastChar = ';'; |
| return Character.DIRECTIONALITY_OTHER_NEUTRALS; |
| } |
| } |
| } |