1/*
2 * Copyright (C) 2013 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package android.text;
18
19import static android.text.TextDirectionHeuristics.FIRSTSTRONG_LTR;
20
21import android.annotation.Nullable;
22import android.view.View;
23
24import com.android.internal.annotations.VisibleForTesting;
25
26import java.util.Locale;
27
28/**
29 * Utility class for formatting text for display in a potentially opposite-directionality context
30 * without garbling. The directionality of the context is set at formatter creation and the
31 * directionality of the text can be either estimated or passed in when known.
32 *
33 * <p>To support versions lower than {@link android.os.Build.VERSION_CODES#JELLY_BEAN_MR2},
34 * you can use the support library's {@link android.support.v4.text.BidiFormatter} class.
35 *
 * <p>These APIs provide the following functionality:
37 * <p>
38 * 1. Bidi Wrapping
39 * When text in one language is mixed into a document in another, opposite-directionality language,
40 * e.g. when an English business name is embedded in some Hebrew text, both the inserted string
41 * and the text surrounding it may be displayed incorrectly unless the inserted string is explicitly
42 * separated from the surrounding text in a "wrapper" that:
43 * <p>
44 * - Declares its directionality so that the string is displayed correctly. This can be done in
45 *   Unicode bidi formatting codes by {@link #unicodeWrap} and similar methods.
46 * <p>
47 * - Isolates the string's directionality, so it does not unduly affect the surrounding content.
48 *   Currently, this can only be done using invisible Unicode characters of the same direction as
49 *   the context (LRM or RLM) in addition to the directionality declaration above, thus "resetting"
50 *   the directionality to that of the context. The "reset" may need to be done at both ends of the
51 *   string. Without "reset" after the string, the string will "stick" to a number or logically
52 *   separate opposite-direction text that happens to follow it in-line (even if separated by
53 *   neutral content like spaces and punctuation). Without "reset" before the string, the same can
54 *   happen there, but only with more opposite-direction text, not a number. One approach is to
55 *   "reset" the direction only after each string, on the theory that if the preceding opposite-
56 *   direction text is itself bidi-wrapped, the "reset" after it will prevent the sticking. (Doing
57 *   the "reset" only before each string definitely does not work because we do not want to require
58 *   bidi-wrapping numbers, and a bidi-wrapped opposite-direction string could be followed by a
59 *   number.) Still, the safest policy is to do the "reset" on both ends of each string, since RTL
60 *   message translations often contain untranslated Latin-script brand names and technical terms,
61 *   and one of these can be followed by a bidi-wrapped inserted value. On the other hand, when one
62 *   has such a message, it is best to do the "reset" manually in the message translation itself,
63 *   since the message's opposite-direction text could be followed by an inserted number, which we
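 *   would not bidi-wrap anyway. Note, however, that a formatter built with default options does
 *   the "reset" on both ends of the string; pass {@code false} to
 *   {@link Builder#stereoReset(boolean)} to "reset" only after the string. In an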
65 *   alternative to "reset", recent additions to the HTML, CSS, and Unicode standards allow the
66 *   isolation to be part of the directionality declaration. This form of isolation is better than
67 *   "reset" because it takes less space, does not require knowing the context directionality, has a
68 *   gentler effect than "reset", and protects both ends of the string. However, we do not yet allow
69 *   using it because required platforms do not yet support it.
70 * <p>
71 * Providing these wrapping services is the basic purpose of the bidi formatter.
72 * <p>
73 * 2. Directionality estimation
74 * How does one know whether a string about to be inserted into surrounding text has the same
75 * directionality? Well, in many cases, one knows that this must be the case when writing the code
76 * doing the insertion, e.g. when a localized message is inserted into a localized page. In such
77 * cases there is no need to involve the bidi formatter at all. In some other cases, it need not be
78 * the same as the context, but is either constant (e.g. urls are always LTR) or otherwise known.
79 * In the remaining cases, e.g. when the string is user-entered or comes from a database, the
80 * language of the string (and thus its directionality) is not known a priori, and must be
81 * estimated at run-time. The bidi formatter can do this automatically using the default
82 * first-strong estimation algorithm. It can also be configured to use a custom directionality
83 * estimation object.
84 */
85public final class BidiFormatter {
86
87    /**
88     * The default text direction heuristic.
89     */
90    private static TextDirectionHeuristic DEFAULT_TEXT_DIRECTION_HEURISTIC = FIRSTSTRONG_LTR;
91
92    /**
93     * Unicode "Left-To-Right Embedding" (LRE) character.
94     */
95    private static final char LRE = '\u202A';
96
97    /**
98     * Unicode "Right-To-Left Embedding" (RLE) character.
99     */
100    private static final char RLE = '\u202B';
101
102    /**
103     * Unicode "Pop Directional Formatting" (PDF) character.
104     */
105    private static final char PDF = '\u202C';
106
107    /**
108     *  Unicode "Left-To-Right Mark" (LRM) character.
109     */
110    private static final char LRM = '\u200E';
111
112    /*
113     * Unicode "Right-To-Left Mark" (RLM) character.
114     */
115    private static final char RLM = '\u200F';
116
117    /*
118     * String representation of LRM
119     */
120    private static final String LRM_STRING = Character.toString(LRM);
121
122    /*
123     * String representation of RLM
124     */
125    private static final String RLM_STRING = Character.toString(RLM);
126
127    /**
128     * Empty string constant.
129     */
130    private static final String EMPTY_STRING = "";
131
132    /**
133     * A class for building a BidiFormatter with non-default options.
134     */
135    public static final class Builder {
136        private boolean mIsRtlContext;
137        private int mFlags;
138        private TextDirectionHeuristic mTextDirectionHeuristic;
139
140        /**
141         * Constructor.
142         *
143         */
144        public Builder() {
145            initialize(isRtlLocale(Locale.getDefault()));
146        }
147
148        /**
149         * Constructor.
150         *
151         * @param rtlContext Whether the context directionality is RTL.
152         */
153        public Builder(boolean rtlContext) {
154            initialize(rtlContext);
155        }
156
157        /**
158         * Constructor.
159         *
160         * @param locale The context locale.
161         */
162        public Builder(Locale locale) {
163            initialize(isRtlLocale(locale));
164        }
165
166        /**
167         * Initializes the builder with the given context directionality and default options.
168         *
169         * @param isRtlContext Whether the context is RTL or not.
170         */
171        private void initialize(boolean isRtlContext) {
172            mIsRtlContext = isRtlContext;
173            mTextDirectionHeuristic = DEFAULT_TEXT_DIRECTION_HEURISTIC;
174            mFlags = DEFAULT_FLAGS;
175        }
176
177        /**
178         * Specifies whether the BidiFormatter to be built should also "reset" directionality before
179         * a string being bidi-wrapped, not just after it. The default is true.
180         */
181        public Builder stereoReset(boolean stereoReset) {
182            if (stereoReset) {
183                mFlags |= FLAG_STEREO_RESET;
184            } else {
185                mFlags &= ~FLAG_STEREO_RESET;
186            }
187            return this;
188        }
189
190        /**
191         * Specifies the default directionality estimation algorithm to be used by the BidiFormatter.
192         * By default, uses the first-strong heuristic.
193         *
194         * @param heuristic the {@code TextDirectionHeuristic} to use.
195         * @return the builder itself.
196         */
197        public Builder setTextDirectionHeuristic(TextDirectionHeuristic heuristic) {
198            mTextDirectionHeuristic = heuristic;
199            return this;
200        }
201
202        /**
203         * @return A BidiFormatter with the specified options.
204         */
205        public BidiFormatter build() {
206            if (mFlags == DEFAULT_FLAGS &&
207                    mTextDirectionHeuristic == DEFAULT_TEXT_DIRECTION_HEURISTIC) {
208                return BidiFormatter.getDefaultInstanceFromContext(mIsRtlContext);
209            }
210            return new BidiFormatter(mIsRtlContext, mFlags, mTextDirectionHeuristic);
211        }
212    }
213
    // Option flags stored in mFlags.

    /** Flag bit: also "reset" directionality before a bidi-wrapped string, not just after it. */
    private static final int FLAG_STEREO_RESET = 2;

    /** Flags of a formatter built with default options; the stereo reset bit is set. */
    private static final int DEFAULT_FLAGS = FLAG_STEREO_RESET;

    /** Shared formatter for an LTR context with the default flags and heuristic. */
    private static final BidiFormatter DEFAULT_LTR_INSTANCE = new BidiFormatter(
            false /* LTR context */,
            DEFAULT_FLAGS,
            DEFAULT_TEXT_DIRECTION_HEURISTIC);

    /** Shared formatter for an RTL context with the default flags and heuristic. */
    private static final BidiFormatter DEFAULT_RTL_INSTANCE = new BidiFormatter(
            true /* RTL context */,
            DEFAULT_FLAGS,
            DEFAULT_TEXT_DIRECTION_HEURISTIC);

    /** Whether the context this formatter formats text for is RTL. */
    private final boolean mIsRtlContext;
    /** Option bits; see FLAG_STEREO_RESET. */
    private final int mFlags;
    /** Heuristic used by the methods that do not take an explicit heuristic argument. */
    private final TextDirectionHeuristic mDefaultTextDirectionHeuristic;
231
232    /**
233     * Factory for creating an instance of BidiFormatter for the default locale directionality.
234     *
235     * This does not create any new objects, and returns already existing static instances.
236     *
237     */
238    public static BidiFormatter getInstance() {
239        return getDefaultInstanceFromContext(isRtlLocale(Locale.getDefault()));
240    }
241
242    /**
243     * Factory for creating an instance of BidiFormatter given the context directionality.
244     *
245     * This does not create any new objects, and returns already existing static instances.
246     *
247     * @param rtlContext Whether the context directionality is RTL.
248     */
249    public static BidiFormatter getInstance(boolean rtlContext) {
250        return getDefaultInstanceFromContext(rtlContext);
251    }
252
253    /**
254     * Factory for creating an instance of BidiFormatter given the context locale.
255     *
256     * This does not create any new objects, and returns already existing static instances.
257     *
258     * @param locale The context locale.
259     */
260    public static BidiFormatter getInstance(Locale locale) {
261        return getDefaultInstanceFromContext(isRtlLocale(locale));
262    }
263
264    /**
265     * @param isRtlContext Whether the context directionality is RTL or not.
266     * @param flags The option flags.
267     * @param heuristic The default text direction heuristic.
268     */
269    private BidiFormatter(boolean isRtlContext, int flags, TextDirectionHeuristic heuristic) {
270        mIsRtlContext = isRtlContext;
271        mFlags = flags;
272        mDefaultTextDirectionHeuristic = heuristic;
273    }
274
275    /**
276     * @return Whether the context directionality is RTL
277     */
278    public boolean isRtlContext() {
279        return mIsRtlContext;
280    }
281
282    /**
283     * @return Whether directionality "reset" should also be done before a string being
284     * bidi-wrapped, not just after it.
285     */
286    public boolean getStereoReset() {
287        return (mFlags & FLAG_STEREO_RESET) != 0;
288    }
289
290    /**
291     * Returns a Unicode bidi mark matching the context directionality (LRM or RLM) if either the
292     * overall or the exit directionality of a given string is opposite to the context directionality.
293     * Putting this after the string (including its directionality declaration wrapping) prevents it
294     * from "sticking" to other opposite-directionality text or a number appearing after it inline
295     * with only neutral content in between. Otherwise returns the empty string. While the exit
296     * directionality is determined by scanning the end of the string, the overall directionality is
297     * given explicitly by a heuristic to estimate the {@code str}'s directionality.
298     *
299     * @param str CharSequence after which the mark may need to appear.
300     * @param heuristic The text direction heuristic that will be used to estimate the {@code str}'s
301     *                  directionality.
302     * @return LRM for RTL text in LTR context; RLM for LTR text in RTL context;
303     *     else, the empty string.
304     *
305     * @hide
306     */
307    public String markAfter(CharSequence str, TextDirectionHeuristic heuristic) {
308        final boolean isRtl = heuristic.isRtl(str, 0, str.length());
309        // getExitDir() is called only if needed (short-circuit).
310        if (!mIsRtlContext && (isRtl || getExitDir(str) == DIR_RTL)) {
311            return LRM_STRING;
312        }
313        if (mIsRtlContext && (!isRtl || getExitDir(str) == DIR_LTR)) {
314            return RLM_STRING;
315        }
316        return EMPTY_STRING;
317    }
318
319    /**
320     * Returns a Unicode bidi mark matching the context directionality (LRM or RLM) if either the
321     * overall or the entry directionality of a given string is opposite to the context
322     * directionality. Putting this before the string (including its directionality declaration
323     * wrapping) prevents it from "sticking" to other opposite-directionality text appearing before
324     * it inline with only neutral content in between. Otherwise returns the empty string. While the
325     * entry directionality is determined by scanning the beginning of the string, the overall
326     * directionality is given explicitly by a heuristic to estimate the {@code str}'s directionality.
327     *
328     * @param str CharSequence before which the mark may need to appear.
329     * @param heuristic The text direction heuristic that will be used to estimate the {@code str}'s
330     *                  directionality.
331     * @return LRM for RTL text in LTR context; RLM for LTR text in RTL context;
332     *     else, the empty string.
333     *
334     * @hide
335     */
336    public String markBefore(CharSequence str, TextDirectionHeuristic heuristic) {
337        final boolean isRtl = heuristic.isRtl(str, 0, str.length());
338        // getEntryDir() is called only if needed (short-circuit).
339        if (!mIsRtlContext && (isRtl || getEntryDir(str) == DIR_RTL)) {
340            return LRM_STRING;
341        }
342        if (mIsRtlContext && (!isRtl || getEntryDir(str) == DIR_LTR)) {
343            return RLM_STRING;
344        }
345        return EMPTY_STRING;
346    }
347
348    /**
349     * Estimates the directionality of a string using the default text direction heuristic.
350     *
351     * @param str String whose directionality is to be estimated.
352     * @return true if {@code str}'s estimated overall directionality is RTL. Otherwise returns
353     *          false.
354     */
355    public boolean isRtl(String str) {
356        return isRtl((CharSequence) str);
357    }
358
359    /**
360     * Operates like {@link #isRtl(String)}, but takes a CharSequence instead of a string
361     *
362     * @param str CharSequence whose directionality is to be estimated.
363     * @return true if {@code str}'s estimated overall directionality is RTL. Otherwise returns
364     *          false.
365     */
366    public boolean isRtl(CharSequence str) {
367        return mDefaultTextDirectionHeuristic.isRtl(str, 0, str.length());
368    }
369
370    /**
371     * Formats a string of given directionality for use in plain-text output of the context
372     * directionality, so an opposite-directionality string is neither garbled nor garbles its
373     * surroundings. This makes use of Unicode bidi formatting characters.
374     * <p>
375     * The algorithm: In case the given directionality doesn't match the context directionality, wraps
376     * the string with Unicode bidi formatting characters: RLE+{@code str}+PDF for RTL text, or
377     * LRE+{@code str}+PDF for LTR text.
378     * <p>
379     * If {@code isolate}, directionally isolates the string so that it does not garble its
380     * surroundings. Currently, this is done by "resetting" the directionality after the string by
381     * appending a trailing Unicode bidi mark matching the context directionality (LRM or RLM) when
382     * either the overall directionality or the exit directionality of the string is opposite to
383     * that of the context. Unless the formatter was built using
384     * {@link Builder#stereoReset(boolean)} with a {@code false} argument, also prepends a Unicode
385     * bidi mark matching the context directionality when either the overall directionality or the
386     * entry directionality of the string is opposite to that of the context. Note that as opposed
387     * to the overall directionality, the entry and exit directionalities are determined from the
388     * string itself.
389     * <p>
390     * Does *not* do HTML-escaping.
391     *
392     * @param str The input string.
393     * @param heuristic The algorithm to be used to estimate the string's overall direction.
394     *        See {@link TextDirectionHeuristics} for pre-defined heuristics.
395     * @param isolate Whether to directionally isolate the string to prevent it from garbling the
396     *     content around it
397     * @return Input string after applying the above processing. {@code null} if {@code str} is
398     *     {@code null}.
399     */
400    public @Nullable String unicodeWrap(@Nullable String str, TextDirectionHeuristic heuristic,
401            boolean isolate) {
402        if (str == null) return null;
403        return unicodeWrap((CharSequence) str, heuristic, isolate).toString();
404    }
405
406    /**
407     * Operates like {@link #unicodeWrap(String, TextDirectionHeuristic, boolean)}, but takes a
408     * CharSequence instead of a string
409     *
410     * @param str The input CharSequence.
411     * @param heuristic The algorithm to be used to estimate the CharSequence's overall direction.
412     *        See {@link TextDirectionHeuristics} for pre-defined heuristics.
413     * @param isolate Whether to directionally isolate the CharSequence to prevent it from garbling
414     *     the content around it
415     * @return Input CharSequence after applying the above processing. {@code null} if {@code str}
416     *     is {@code null}.
417     */
418    public @Nullable CharSequence unicodeWrap(@Nullable CharSequence str,
419            TextDirectionHeuristic heuristic, boolean isolate) {
420        if (str == null) return null;
421        final boolean isRtl = heuristic.isRtl(str, 0, str.length());
422        SpannableStringBuilder result = new SpannableStringBuilder();
423        if (getStereoReset() && isolate) {
424            result.append(markBefore(str,
425                    isRtl ? TextDirectionHeuristics.RTL : TextDirectionHeuristics.LTR));
426        }
427        if (isRtl != mIsRtlContext) {
428            result.append(isRtl ? RLE : LRE);
429            result.append(str);
430            result.append(PDF);
431        } else {
432            result.append(str);
433        }
434        if (isolate) {
435            result.append(markAfter(str,
436                    isRtl ? TextDirectionHeuristics.RTL : TextDirectionHeuristics.LTR));
437        }
438        return result;
439    }
440
441    /**
442     * Operates like {@link #unicodeWrap(String, TextDirectionHeuristic, boolean)}, but assumes
443     * {@code isolate} is true.
444     *
445     * @param str The input string.
446     * @param heuristic The algorithm to be used to estimate the string's overall direction.
447     *        See {@link TextDirectionHeuristics} for pre-defined heuristics.
448     * @return Input string after applying the above processing.
449     */
450    public String unicodeWrap(String str, TextDirectionHeuristic heuristic) {
451        return unicodeWrap(str, heuristic, true /* isolate */);
452    }
453
454    /**
455     * Operates like {@link #unicodeWrap(CharSequence, TextDirectionHeuristic, boolean)}, but
456     * assumes {@code isolate} is true.
457     *
458     * @param str The input CharSequence.
459     * @param heuristic The algorithm to be used to estimate the CharSequence's overall direction.
460     *        See {@link TextDirectionHeuristics} for pre-defined heuristics.
461     * @return Input CharSequence after applying the above processing.
462     */
463    public CharSequence unicodeWrap(CharSequence str, TextDirectionHeuristic heuristic) {
464        return unicodeWrap(str, heuristic, true /* isolate */);
465    }
466
467
468    /**
469     * Operates like {@link #unicodeWrap(String, TextDirectionHeuristic, boolean)}, but uses the
470     * formatter's default direction estimation algorithm.
471     *
472     * @param str The input string.
473     * @param isolate Whether to directionally isolate the string to prevent it from garbling the
474     *     content around it
475     * @return Input string after applying the above processing.
476     */
477    public String unicodeWrap(String str, boolean isolate) {
478        return unicodeWrap(str, mDefaultTextDirectionHeuristic, isolate);
479    }
480
481    /**
482     * Operates like {@link #unicodeWrap(CharSequence, TextDirectionHeuristic, boolean)}, but uses
483     * the formatter's default direction estimation algorithm.
484     *
485     * @param str The input CharSequence.
486     * @param isolate Whether to directionally isolate the CharSequence to prevent it from garbling
487     *     the content around it
488     * @return Input CharSequence after applying the above processing.
489     */
490    public CharSequence unicodeWrap(CharSequence str, boolean isolate) {
491        return unicodeWrap(str, mDefaultTextDirectionHeuristic, isolate);
492    }
493
494    /**
495     * Operates like {@link #unicodeWrap(String, TextDirectionHeuristic, boolean)}, but uses the
496     * formatter's default direction estimation algorithm and assumes {@code isolate} is true.
497     *
498     * @param str The input string.
499     * @return Input string after applying the above processing.
500     */
501    public String unicodeWrap(String str) {
502        return unicodeWrap(str, mDefaultTextDirectionHeuristic, true /* isolate */);
503    }
504
505    /**
506     * Operates like {@link #unicodeWrap(CharSequence, TextDirectionHeuristic, boolean)}, but uses
507     * the formatter's default direction estimation algorithm and assumes {@code isolate} is true.
508     *
509     * @param str The input CharSequence.
510     * @return Input CharSequence after applying the above processing.
511     */
512    public CharSequence unicodeWrap(CharSequence str) {
513        return unicodeWrap(str, mDefaultTextDirectionHeuristic, true /* isolate */);
514    }
515
516    private static BidiFormatter getDefaultInstanceFromContext(boolean isRtlContext) {
517        return isRtlContext ? DEFAULT_RTL_INSTANCE : DEFAULT_LTR_INSTANCE;
518    }
519
520    /**
521     * Helper method to return true if the Locale directionality is RTL.
522     *
523     * @param locale The Locale whose directionality will be checked to be RTL or LTR
524     * @return true if the {@code locale} directionality is RTL. False otherwise.
525     */
526    private static boolean isRtlLocale(Locale locale) {
527        return (TextUtils.getLayoutDirectionFromLocale(locale) == View.LAYOUT_DIRECTION_RTL);
528    }
529
    // Directionality values; plain int constants are used in place of an enum.

    /** Left-to-right directionality. */
    private static final int DIR_LTR = -1;
    /** Returned by the entry/exit scans when no strong directionality was encountered. */
    private static final int DIR_UNKNOWN = 0;
    /** Right-to-left directionality. */
    private static final int DIR_RTL = +1;
536
537    /**
538     * Returns the directionality of the last character with strong directionality in the string, or
539     * DIR_UNKNOWN if none was encountered. For efficiency, actually scans backwards from the end of
540     * the string. Treats a non-BN character between an LRE/RLE/LRO/RLO and its matching PDF as a
541     * strong character, LTR after LRE/LRO, and RTL after RLE/RLO. The results are undefined for a
542     * string containing unbalanced LRE/RLE/LRO/RLO/PDF characters. The intended use is to check
543     * whether a logically separate item that starts with a number or a character of the string's
544     * exit directionality and follows this string inline (not counting any neutral characters in
545     * between) would "stick" to it in an opposite-directionality context, thus being displayed in
546     * an incorrect position. An LRM or RLM character (the one of the context's directionality)
547     * between the two will prevent such sticking.
548     *
549     * @param str the string to check.
550     */
551    private static int getExitDir(CharSequence str) {
552        return new DirectionalityEstimator(str, false /* isHtml */).getExitDir();
553    }
554
555    /**
556     * Returns the directionality of the first character with strong directionality in the string,
557     * or DIR_UNKNOWN if none was encountered. Treats a non-BN character between an
558     * LRE/RLE/LRO/RLO and its matching PDF as a strong character, LTR after LRE/LRO, and RTL after
559     * RLE/RLO. The results are undefined for a string containing unbalanced LRE/RLE/LRO/RLO/PDF
560     * characters. The intended use is to check whether a logically separate item that ends with a
561     * character of the string's entry directionality and precedes the string inline (not counting
562     * any neutral characters in between) would "stick" to it in an opposite-directionality context,
563     * thus being displayed in an incorrect position. An LRM or RLM character (the one of the
564     * context's directionality) between the two will prevent such sticking.
565     *
566     * @param str the string to check.
567     */
568    private static int getEntryDir(CharSequence str) {
569        return new DirectionalityEstimator(str, false /* isHtml */).getEntryDir();
570    }
571
572    /**
573     * An object that estimates the directionality of a given string by various methods.
574     *
575     * @hide
576     */
577    @VisibleForTesting
578    public static class DirectionalityEstimator {
579
580        // Internal static variables and constants.
581
582        /**
583         * Size of the bidi character class cache. The results of the Character.getDirectionality()
584         * calls on the lowest DIR_TYPE_CACHE_SIZE codepoints are kept in an array for speed.
585         * The 0x700 value is designed to leave all the European and Near Eastern languages in the
586         * cache. It can be reduced to 0x180, restricting the cache to the Western European
587         * languages.
588         */
589        private static final int DIR_TYPE_CACHE_SIZE = 0x700;
590
591        /**
592         * The bidi character class cache.
593         */
594        private static final byte DIR_TYPE_CACHE[];
595
596        static {
597            DIR_TYPE_CACHE = new byte[DIR_TYPE_CACHE_SIZE];
598            for (int i = 0; i < DIR_TYPE_CACHE_SIZE; i++) {
599                // Calling Character.getDirectionality() is OK here, since new emojis start after
600                // the end of our cache.
601                DIR_TYPE_CACHE[i] = Character.getDirectionality(i);
602            }
603        }
604
605        /**
606         * Return Character directionality. Same as {@link Character#getDirectionality(int)} except
607         * it overrides values for newest emoji that are not covered by ICU.
608         */
609        public static byte getDirectionality(int codePoint) {
610            if (Emoji.isNewEmoji(codePoint)) {
611                // TODO: Fix or remove once emoji-data.text 5.0 is in ICU or update to 6.0.
612                return Character.DIRECTIONALITY_OTHER_NEUTRALS;
613            } else {
614                return Character.getDirectionality(codePoint);
615            }
616        }
617
        // Internal instance variables.

        /**
         * The text to be scanned. Set once in the constructor.
         */
        private final CharSequence text;

        /**
         * Whether the text to be scanned is to be treated as HTML, i.e. skipping over tags and
         * entities when looking for the next / preceding dir type.
         */
        private final boolean isHtml;

        /**
         * The length of the text in chars, captured from {@code text.length()} at construction.
         */
        private final int length;

        /**
         * The current position in the text. getEntryDir() resets it to 0 before scanning.
         */
        private int charIndex;

        /**
         * The char encountered by the last dirTypeForward or dirTypeBackward call. If it
         * encountered a supplementary codepoint, this contains a char that is not a valid
         * codepoint. This is ok, because this member is only used to detect some well-known ASCII
         * syntax, e.g. "http://" and the beginning of an HTML tag or entity.
         */
        private char lastChar;
648
649        /**
650         * Constructor.
651         *
652         * @param text The string to scan.
653         * @param isHtml Whether the text to be scanned is to be treated as HTML, i.e. skipping over
654         *     tags and entities.
655         */
656        DirectionalityEstimator(CharSequence text, boolean isHtml) {
657            this.text = text;
658            this.isHtml = isHtml;
659            length = text.length();
660        }
661
662        /**
663         * Returns the directionality of the first character with strong directionality in the
664         * string, or DIR_UNKNOWN if none was encountered. Treats a non-BN character between an
665         * LRE/RLE/LRO/RLO and its matching PDF as a strong character, LTR after LRE/LRO, and RTL
666         * after RLE/RLO. The results are undefined for a string containing unbalanced
667         * LRE/RLE/LRO/RLO/PDF characters.
668         */
669        int getEntryDir() {
670            // The reason for this method name, as opposed to getFirstStrongDir(), is that
671            // "first strong" is a commonly used description of Unicode's estimation algorithm,
672            // but the two must treat formatting characters quite differently. Thus, we are staying
673            // away from both "first" and "last" in these method names to avoid confusion.
674            charIndex = 0;
675            int embeddingLevel = 0;
676            int embeddingLevelDir = DIR_UNKNOWN;
677            int firstNonEmptyEmbeddingLevel = 0;
678            while (charIndex < length && firstNonEmptyEmbeddingLevel == 0) {
679                switch (dirTypeForward()) {
680                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING:
681                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE:
682                        ++embeddingLevel;
683                        embeddingLevelDir = DIR_LTR;
684                        break;
685                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING:
686                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE:
687                        ++embeddingLevel;
688                        embeddingLevelDir = DIR_RTL;
689                        break;
690                    case Character.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT:
691                        --embeddingLevel;
692                        // To restore embeddingLevelDir to its previous value, we would need a
693                        // stack, which we want to avoid. Thus, at this point we do not know the
694                        // current embedding's directionality.
695                        embeddingLevelDir = DIR_UNKNOWN;
696                        break;
697                    case Character.DIRECTIONALITY_BOUNDARY_NEUTRAL:
698                        break;
699                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT:
700                        if (embeddingLevel == 0) {
701                            return DIR_LTR;
702                        }
703                        firstNonEmptyEmbeddingLevel = embeddingLevel;
704                        break;
705                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT:
706                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC:
707                        if (embeddingLevel == 0) {
708                            return DIR_RTL;
709                        }
710                        firstNonEmptyEmbeddingLevel = embeddingLevel;
711                        break;
712                    default:
713                        firstNonEmptyEmbeddingLevel = embeddingLevel;
714                        break;
715                }
716            }
717
718            // We have either found a non-empty embedding or scanned the entire string finding
719            // neither a non-empty embedding nor a strong character outside of an embedding.
720            if (firstNonEmptyEmbeddingLevel == 0) {
721                // We have not found a non-empty embedding. Thus, the string contains neither a
722                // non-empty embedding nor a strong character outside of an embedding.
723                return DIR_UNKNOWN;
724            }
725
726            // We have found a non-empty embedding.
727            if (embeddingLevelDir != DIR_UNKNOWN) {
728                // We know the directionality of the non-empty embedding.
729                return embeddingLevelDir;
730            }
731
732            // We do not remember the directionality of the non-empty embedding we found. So, we go
733            // backwards to find the start of the non-empty embedding and get its directionality.
734            while (charIndex > 0) {
735                switch (dirTypeBackward()) {
736                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING:
737                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE:
738                        if (firstNonEmptyEmbeddingLevel == embeddingLevel) {
739                            return DIR_LTR;
740                        }
741                        --embeddingLevel;
742                        break;
743                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING:
744                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE:
745                        if (firstNonEmptyEmbeddingLevel == embeddingLevel) {
746                            return DIR_RTL;
747                        }
748                        --embeddingLevel;
749                        break;
750                    case Character.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT:
751                        ++embeddingLevel;
752                        break;
753                }
754            }
755            // We should never get here.
756            return DIR_UNKNOWN;
757        }
758
759        /**
760         * Returns the directionality of the last character with strong directionality in the
761         * string, or DIR_UNKNOWN if none was encountered. For efficiency, actually scans backwards
762         * from the end of the string. Treats a non-BN character between an LRE/RLE/LRO/RLO and its
763         * matching PDF as a strong character, LTR after LRE/LRO, and RTL after RLE/RLO. The results
764         * are undefined for a string containing unbalanced LRE/RLE/LRO/RLO/PDF characters.
765         */
766        int getExitDir() {
767            // The reason for this method name, as opposed to getLastStrongDir(), is that "last
768            // strong" sounds like the exact opposite of "first strong", which is a commonly used
769            // description of Unicode's estimation algorithm (getUnicodeDir() above), but the two
770            // must treat formatting characters quite differently. Thus, we are staying away from
771            // both "first" and "last" in these method names to avoid confusion.
772            charIndex = length;
773            int embeddingLevel = 0;
774            int lastNonEmptyEmbeddingLevel = 0;
775            while (charIndex > 0) {
776                switch (dirTypeBackward()) {
777                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT:
778                        if (embeddingLevel == 0) {
779                            return DIR_LTR;
780                        }
781                        if (lastNonEmptyEmbeddingLevel == 0) {
782                            lastNonEmptyEmbeddingLevel = embeddingLevel;
783                        }
784                        break;
785                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING:
786                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE:
787                        if (lastNonEmptyEmbeddingLevel == embeddingLevel) {
788                            return DIR_LTR;
789                        }
790                        --embeddingLevel;
791                        break;
792                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT:
793                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC:
794                        if (embeddingLevel == 0) {
795                            return DIR_RTL;
796                        }
797                        if (lastNonEmptyEmbeddingLevel == 0) {
798                            lastNonEmptyEmbeddingLevel = embeddingLevel;
799                        }
800                        break;
801                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING:
802                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE:
803                        if (lastNonEmptyEmbeddingLevel == embeddingLevel) {
804                            return DIR_RTL;
805                        }
806                        --embeddingLevel;
807                        break;
808                    case Character.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT:
809                        ++embeddingLevel;
810                        break;
811                    case Character.DIRECTIONALITY_BOUNDARY_NEUTRAL:
812                        break;
813                    default:
814                        if (lastNonEmptyEmbeddingLevel == 0) {
815                            lastNonEmptyEmbeddingLevel = embeddingLevel;
816                        }
817                        break;
818                }
819            }
820            return DIR_UNKNOWN;
821        }
822
823        // Internal methods
824
825        /**
826         * Gets the bidi character class, i.e. Character.getDirectionality(), of a given char, using
827         * a cache for speed. Not designed for supplementary codepoints, whose results we do not
828         * cache.
829         */
830        private static byte getCachedDirectionality(char c) {
831            return c < DIR_TYPE_CACHE_SIZE ? DIR_TYPE_CACHE[c] : getDirectionality(c);
832        }
833
834        /**
835         * Returns the Character.DIRECTIONALITY_... value of the next codepoint and advances
836         * charIndex. If isHtml, and the codepoint is '<' or '&', advances through the tag/entity,
837         * and returns Character.DIRECTIONALITY_WHITESPACE. For an entity, it would be best to
838         * figure out the actual character, and return its dirtype, but treating it as whitespace is
839         * good enough for our purposes.
840         *
841         * @throws java.lang.IndexOutOfBoundsException if called when charIndex >= length or < 0.
842         */
843        byte dirTypeForward() {
844            lastChar = text.charAt(charIndex);
845            if (Character.isHighSurrogate(lastChar)) {
846                int codePoint = Character.codePointAt(text, charIndex);
847                charIndex += Character.charCount(codePoint);
848                return getDirectionality(codePoint);
849            }
850            charIndex++;
851            byte dirType = getCachedDirectionality(lastChar);
852            if (isHtml) {
853                // Process tags and entities.
854                if (lastChar == '<') {
855                    dirType = skipTagForward();
856                } else if (lastChar == '&') {
857                    dirType = skipEntityForward();
858                }
859            }
860            return dirType;
861        }
862
863        /**
864         * Returns the Character.DIRECTIONALITY_... value of the preceding codepoint and advances
865         * charIndex backwards. If isHtml, and the codepoint is the end of a complete HTML tag or
866         * entity, advances over the whole tag/entity and returns
867         * Character.DIRECTIONALITY_WHITESPACE. For an entity, it would be best to figure out the
868         * actual character, and return its dirtype, but treating it as whitespace is good enough
869         * for our purposes.
870         *
871         * @throws java.lang.IndexOutOfBoundsException if called when charIndex > length or <= 0.
872         */
873        byte dirTypeBackward() {
874            lastChar = text.charAt(charIndex - 1);
875            if (Character.isLowSurrogate(lastChar)) {
876                int codePoint = Character.codePointBefore(text, charIndex);
877                charIndex -= Character.charCount(codePoint);
878                return getDirectionality(codePoint);
879            }
880            charIndex--;
881            byte dirType = getCachedDirectionality(lastChar);
882            if (isHtml) {
883                // Process tags and entities.
884                if (lastChar == '>') {
885                    dirType = skipTagBackward();
886                } else if (lastChar == ';') {
887                    dirType = skipEntityBackward();
888                }
889            }
890            return dirType;
891        }
892
893        /**
894         * Advances charIndex forward through an HTML tag (after the opening &lt; has already been
895         * read) and returns Character.DIRECTIONALITY_WHITESPACE. If there is no matching &gt;,
896         * does not change charIndex and returns Character.DIRECTIONALITY_OTHER_NEUTRALS (for the
897         * &lt; that hadn't been part of a tag after all).
898         */
899        private byte skipTagForward() {
900            int initialCharIndex = charIndex;
901            while (charIndex < length) {
902                lastChar = text.charAt(charIndex++);
903                if (lastChar == '>') {
904                    // The end of the tag.
905                    return Character.DIRECTIONALITY_WHITESPACE;
906                }
907                if (lastChar == '"' || lastChar == '\'') {
908                    // Skip over a quoted attribute value inside the tag.
909                    char quote = lastChar;
910                    while (charIndex < length && (lastChar = text.charAt(charIndex++)) != quote) {}
911                }
912            }
913            // The original '<' wasn't the start of a tag after all.
914            charIndex = initialCharIndex;
915            lastChar = '<';
916            return Character.DIRECTIONALITY_OTHER_NEUTRALS;
917        }
918
919        /**
920         * Advances charIndex backward through an HTML tag (after the closing &gt; has already been
921         * read) and returns Character.DIRECTIONALITY_WHITESPACE. If there is no matching &lt;, does
922         * not change charIndex and returns Character.DIRECTIONALITY_OTHER_NEUTRALS (for the &gt;
923         * that hadn't been part of a tag after all). Nevertheless, the running time for calling
924         * skipTagBackward() in a loop remains linear in the size of the text, even for a text like
925         * "&gt;&gt;&gt;&gt;", because skipTagBackward() also stops looking for a matching &lt;
926         * when it encounters another &gt;.
927         */
928        private byte skipTagBackward() {
929            int initialCharIndex = charIndex;
930            while (charIndex > 0) {
931                lastChar = text.charAt(--charIndex);
932                if (lastChar == '<') {
933                    // The start of the tag.
934                    return Character.DIRECTIONALITY_WHITESPACE;
935                }
936                if (lastChar == '>') {
937                    break;
938                }
939                if (lastChar == '"' || lastChar == '\'') {
940                    // Skip over a quoted attribute value inside the tag.
941                    char quote = lastChar;
942                    while (charIndex > 0 && (lastChar = text.charAt(--charIndex)) != quote) {}
943                }
944            }
945            // The original '>' wasn't the end of a tag after all.
946            charIndex = initialCharIndex;
947            lastChar = '>';
948            return Character.DIRECTIONALITY_OTHER_NEUTRALS;
949        }
950
951        /**
         * Advances charIndex forward through an HTML character entity (after the opening &amp; has
         * already been read) and returns Character.DIRECTIONALITY_WHITESPACE. If there is no
         * closing ;, charIndex ends up at the end of the text. It would be best to figure out the
         * actual character and return its dirtype, but this is good enough.
955         */
956        private byte skipEntityForward() {
957            while (charIndex < length && (lastChar = text.charAt(charIndex++)) != ';') {}
958            return Character.DIRECTIONALITY_WHITESPACE;
959        }
960
961        /**
         * Advances charIndex backward through an HTML character entity (after the closing ; has
         * already been read) and returns Character.DIRECTIONALITY_WHITESPACE. It would be best to
         * figure out the actual character and return its dirtype, but this is good enough.
         * If there is no matching &amp;, does not change charIndex and returns
         * Character.DIRECTIONALITY_OTHER_NEUTRALS (for the ';' that did not end an entity after
         * all). Nevertheless, the running time for calling skipEntityBackward() in a loop remains
         * linear in the size of the text, even for a text like ";;;;;;;", because
         * skipEntityBackward() also stops looking for a matching &amp; when it encounters
         * another ;.
970         */
971        private byte skipEntityBackward() {
972            int initialCharIndex = charIndex;
973            while (charIndex > 0) {
974                lastChar = text.charAt(--charIndex);
975                if (lastChar == '&') {
976                    return Character.DIRECTIONALITY_WHITESPACE;
977                }
978                if (lastChar == ';') {
979                    break;
980                }
981            }
982            charIndex = initialCharIndex;
983            lastChar = ';';
984            return Character.DIRECTIONALITY_OTHER_NEUTRALS;
985        }
986    }
987}
988