1/*
2 * Copyright (C) 2013 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package android.text;
18
19import android.view.View;
20
21import static android.text.TextDirectionHeuristics.FIRSTSTRONG_LTR;
22
23import java.util.Locale;
24
25/**
26 * Utility class for formatting text for display in a potentially opposite-directionality context
27 * without garbling. The directionality of the context is set at formatter creation and the
28 * directionality of the text can be either estimated or passed in when known.
29 *
30 * <p>To support versions lower than {@link android.os.Build.VERSION_CODES#JELLY_BEAN_MR2},
31 * you can use the support library's {@link android.support.v4.text.BidiFormatter} class.
32 *
 * <p>These APIs provide the following functionality:
34 * <p>
35 * 1. Bidi Wrapping
36 * When text in one language is mixed into a document in another, opposite-directionality language,
37 * e.g. when an English business name is embedded in some Hebrew text, both the inserted string
38 * and the text surrounding it may be displayed incorrectly unless the inserted string is explicitly
39 * separated from the surrounding text in a "wrapper" that:
40 * <p>
41 * - Declares its directionality so that the string is displayed correctly. This can be done in
42 *   Unicode bidi formatting codes by {@link #unicodeWrap} and similar methods.
43 * <p>
44 * - Isolates the string's directionality, so it does not unduly affect the surrounding content.
45 *   Currently, this can only be done using invisible Unicode characters of the same direction as
46 *   the context (LRM or RLM) in addition to the directionality declaration above, thus "resetting"
47 *   the directionality to that of the context. The "reset" may need to be done at both ends of the
48 *   string. Without "reset" after the string, the string will "stick" to a number or logically
49 *   separate opposite-direction text that happens to follow it in-line (even if separated by
50 *   neutral content like spaces and punctuation). Without "reset" before the string, the same can
51 *   happen there, but only with more opposite-direction text, not a number. One approach is to
52 *   "reset" the direction only after each string, on the theory that if the preceding opposite-
53 *   direction text is itself bidi-wrapped, the "reset" after it will prevent the sticking. (Doing
54 *   the "reset" only before each string definitely does not work because we do not want to require
55 *   bidi-wrapping numbers, and a bidi-wrapped opposite-direction string could be followed by a
56 *   number.) Still, the safest policy is to do the "reset" on both ends of each string, since RTL
57 *   message translations often contain untranslated Latin-script brand names and technical terms,
58 *   and one of these can be followed by a bidi-wrapped inserted value. On the other hand, when one
59 *   has such a message, it is best to do the "reset" manually in the message translation itself,
60 *   since the message's opposite-direction text could be followed by an inserted number, which we
61 *   would not bidi-wrap anyway. Thus, "reset" only after the string is the current default. In an
62 *   alternative to "reset", recent additions to the HTML, CSS, and Unicode standards allow the
63 *   isolation to be part of the directionality declaration. This form of isolation is better than
64 *   "reset" because it takes less space, does not require knowing the context directionality, has a
65 *   gentler effect than "reset", and protects both ends of the string. However, we do not yet allow
66 *   using it because required platforms do not yet support it.
67 * <p>
68 * Providing these wrapping services is the basic purpose of the bidi formatter.
69 * <p>
70 * 2. Directionality estimation
71 * How does one know whether a string about to be inserted into surrounding text has the same
72 * directionality? Well, in many cases, one knows that this must be the case when writing the code
73 * doing the insertion, e.g. when a localized message is inserted into a localized page. In such
74 * cases there is no need to involve the bidi formatter at all. In some other cases, it need not be
75 * the same as the context, but is either constant (e.g. urls are always LTR) or otherwise known.
76 * In the remaining cases, e.g. when the string is user-entered or comes from a database, the
77 * language of the string (and thus its directionality) is not known a priori, and must be
78 * estimated at run-time. The bidi formatter can do this automatically using the default
79 * first-strong estimation algorithm. It can also be configured to use a custom directionality
80 * estimation object.
81 */
82public final class BidiFormatter {
83
84    /**
85     * The default text direction heuristic.
86     */
87    private static TextDirectionHeuristic DEFAULT_TEXT_DIRECTION_HEURISTIC = FIRSTSTRONG_LTR;
88
89    /**
90     * Unicode "Left-To-Right Embedding" (LRE) character.
91     */
92    private static final char LRE = '\u202A';
93
94    /**
95     * Unicode "Right-To-Left Embedding" (RLE) character.
96     */
97    private static final char RLE = '\u202B';
98
99    /**
100     * Unicode "Pop Directional Formatting" (PDF) character.
101     */
102    private static final char PDF = '\u202C';
103
104    /**
105     *  Unicode "Left-To-Right Mark" (LRM) character.
106     */
107    private static final char LRM = '\u200E';
108
109    /*
110     * Unicode "Right-To-Left Mark" (RLM) character.
111     */
112    private static final char RLM = '\u200F';
113
114    /*
115     * String representation of LRM
116     */
117    private static final String LRM_STRING = Character.toString(LRM);
118
119    /*
120     * String representation of RLM
121     */
122    private static final String RLM_STRING = Character.toString(RLM);
123
124    /**
125     * Empty string constant.
126     */
127    private static final String EMPTY_STRING = "";
128
129    /**
130     * A class for building a BidiFormatter with non-default options.
131     */
132    public static final class Builder {
133        private boolean mIsRtlContext;
134        private int mFlags;
135        private TextDirectionHeuristic mTextDirectionHeuristic;
136
137        /**
138         * Constructor.
139         *
140         */
141        public Builder() {
142            initialize(isRtlLocale(Locale.getDefault()));
143        }
144
145        /**
146         * Constructor.
147         *
148         * @param rtlContext Whether the context directionality is RTL.
149         */
150        public Builder(boolean rtlContext) {
151            initialize(rtlContext);
152        }
153
154        /**
155         * Constructor.
156         *
157         * @param locale The context locale.
158         */
159        public Builder(Locale locale) {
160            initialize(isRtlLocale(locale));
161        }
162
163        /**
164         * Initializes the builder with the given context directionality and default options.
165         *
166         * @param isRtlContext Whether the context is RTL or not.
167         */
168        private void initialize(boolean isRtlContext) {
169            mIsRtlContext = isRtlContext;
170            mTextDirectionHeuristic = DEFAULT_TEXT_DIRECTION_HEURISTIC;
171            mFlags = DEFAULT_FLAGS;
172        }
173
174        /**
175         * Specifies whether the BidiFormatter to be built should also "reset" directionality before
176         * a string being bidi-wrapped, not just after it. The default is false.
177         */
178        public Builder stereoReset(boolean stereoReset) {
179            if (stereoReset) {
180                mFlags |= FLAG_STEREO_RESET;
181            } else {
182                mFlags &= ~FLAG_STEREO_RESET;
183            }
184            return this;
185        }
186
187        /**
188         * Specifies the default directionality estimation algorithm to be used by the BidiFormatter.
189         * By default, uses the first-strong heuristic.
190         *
191         * @param heuristic the {@code TextDirectionHeuristic} to use.
192         * @return the builder itself.
193         */
194        public Builder setTextDirectionHeuristic(TextDirectionHeuristic heuristic) {
195            mTextDirectionHeuristic = heuristic;
196            return this;
197        }
198
199        private static BidiFormatter getDefaultInstanceFromContext(boolean isRtlContext) {
200            return isRtlContext ? DEFAULT_RTL_INSTANCE : DEFAULT_LTR_INSTANCE;
201        }
202
203        /**
204         * @return A BidiFormatter with the specified options.
205         */
206        public BidiFormatter build() {
207            if (mFlags == DEFAULT_FLAGS &&
208                    mTextDirectionHeuristic == DEFAULT_TEXT_DIRECTION_HEURISTIC) {
209                return getDefaultInstanceFromContext(mIsRtlContext);
210            }
211            return new BidiFormatter(mIsRtlContext, mFlags, mTextDirectionHeuristic);
212        }
213    }
214
215    //
216    private static final int FLAG_STEREO_RESET = 2;
217    private static final int DEFAULT_FLAGS = FLAG_STEREO_RESET;
218
219    private static final BidiFormatter DEFAULT_LTR_INSTANCE = new BidiFormatter(
220            false /* LTR context */,
221            DEFAULT_FLAGS,
222            DEFAULT_TEXT_DIRECTION_HEURISTIC);
223
224    private static final BidiFormatter DEFAULT_RTL_INSTANCE = new BidiFormatter(
225            true /* RTL context */,
226            DEFAULT_FLAGS,
227            DEFAULT_TEXT_DIRECTION_HEURISTIC);
228
229    private final boolean mIsRtlContext;
230    private final int mFlags;
231    private final TextDirectionHeuristic mDefaultTextDirectionHeuristic;
232
233    /**
234     * Factory for creating an instance of BidiFormatter for the default locale directionality.
235     *
236     */
237    public static BidiFormatter getInstance() {
238        return new Builder().build();
239    }
240
241    /**
242     * Factory for creating an instance of BidiFormatter given the context directionality.
243     *
244     * @param rtlContext Whether the context directionality is RTL.
245     */
246    public static BidiFormatter getInstance(boolean rtlContext) {
247        return new Builder(rtlContext).build();
248    }
249
250    /**
251     * Factory for creating an instance of BidiFormatter given the context locale.
252     *
253     * @param locale The context locale.
254     */
255    public static BidiFormatter getInstance(Locale locale) {
256        return new Builder(locale).build();
257    }
258
259    /**
260     * @param isRtlContext Whether the context directionality is RTL or not.
261     * @param flags The option flags.
262     * @param heuristic The default text direction heuristic.
263     */
264    private BidiFormatter(boolean isRtlContext, int flags, TextDirectionHeuristic heuristic) {
265        mIsRtlContext = isRtlContext;
266        mFlags = flags;
267        mDefaultTextDirectionHeuristic = heuristic;
268    }
269
270    /**
271     * @return Whether the context directionality is RTL
272     */
273    public boolean isRtlContext() {
274        return mIsRtlContext;
275    }
276
277    /**
278     * @return Whether directionality "reset" should also be done before a string being
279     * bidi-wrapped, not just after it.
280     */
281    public boolean getStereoReset() {
282        return (mFlags & FLAG_STEREO_RESET) != 0;
283    }
284
285    /**
286     * Returns a Unicode bidi mark matching the context directionality (LRM or RLM) if either the
287     * overall or the exit directionality of a given string is opposite to the context directionality.
288     * Putting this after the string (including its directionality declaration wrapping) prevents it
289     * from "sticking" to other opposite-directionality text or a number appearing after it inline
290     * with only neutral content in between. Otherwise returns the empty string. While the exit
291     * directionality is determined by scanning the end of the string, the overall directionality is
292     * given explicitly by a heuristic to estimate the {@code str}'s directionality.
293     *
294     * @param str String after which the mark may need to appear.
295     * @param heuristic The text direction heuristic that will be used to estimate the {@code str}'s
296     *                  directionality.
297     * @return LRM for RTL text in LTR context; RLM for LTR text in RTL context;
298     *     else, the empty string.
299     *
300     * @hide
301     */
302    public String markAfter(String str, TextDirectionHeuristic heuristic) {
303        final boolean isRtl = heuristic.isRtl(str, 0, str.length());
304        // getExitDir() is called only if needed (short-circuit).
305        if (!mIsRtlContext && (isRtl || getExitDir(str) == DIR_RTL)) {
306            return LRM_STRING;
307        }
308        if (mIsRtlContext && (!isRtl || getExitDir(str) == DIR_LTR)) {
309            return RLM_STRING;
310        }
311        return EMPTY_STRING;
312    }
313
314    /**
315     * Returns a Unicode bidi mark matching the context directionality (LRM or RLM) if either the
316     * overall or the entry directionality of a given string is opposite to the context
317     * directionality. Putting this before the string (including its directionality declaration
318     * wrapping) prevents it from "sticking" to other opposite-directionality text appearing before
319     * it inline with only neutral content in between. Otherwise returns the empty string. While the
320     * entry directionality is determined by scanning the beginning of the string, the overall
321     * directionality is given explicitly by a heuristic to estimate the {@code str}'s directionality.
322     *
323     * @param str String before which the mark may need to appear.
324     * @param heuristic The text direction heuristic that will be used to estimate the {@code str}'s
325     *                  directionality.
326     * @return LRM for RTL text in LTR context; RLM for LTR text in RTL context;
327     *     else, the empty string.
328     *
329     * @hide
330     */
331    public String markBefore(String str, TextDirectionHeuristic heuristic) {
332        final boolean isRtl = heuristic.isRtl(str, 0, str.length());
333        // getEntryDir() is called only if needed (short-circuit).
334        if (!mIsRtlContext && (isRtl || getEntryDir(str) == DIR_RTL)) {
335            return LRM_STRING;
336        }
337        if (mIsRtlContext && (!isRtl || getEntryDir(str) == DIR_LTR)) {
338            return RLM_STRING;
339        }
340        return EMPTY_STRING;
341    }
342
343    /**
344     * Estimates the directionality of a string using the default text direction heuristic.
345     *
346     * @param str String whose directionality is to be estimated.
347     * @return true if {@code str}'s estimated overall directionality is RTL. Otherwise returns
348     *          false.
349     */
350    public boolean isRtl(String str) {
351        return mDefaultTextDirectionHeuristic.isRtl(str, 0, str.length());
352    }
353
354    /**
355     * Formats a string of given directionality for use in plain-text output of the context
356     * directionality, so an opposite-directionality string is neither garbled nor garbles its
357     * surroundings. This makes use of Unicode bidi formatting characters.
358     * <p>
359     * The algorithm: In case the given directionality doesn't match the context directionality, wraps
360     * the string with Unicode bidi formatting characters: RLE+{@code str}+PDF for RTL text, or
361     * LRE+{@code str}+PDF for LTR text.
362     * <p>
363     * If {@code isolate}, directionally isolates the string so that it does not garble its
364     * surroundings. Currently, this is done by "resetting" the directionality after the string by
365     * appending a trailing Unicode bidi mark matching the context directionality (LRM or RLM) when
366     * either the overall directionality or the exit directionality of the string is opposite to that
367     * of the context. If the formatter was built using {@link Builder#stereoReset(boolean)} and
368     * passing "true" as an argument, also prepends a Unicode bidi mark matching the context
369     * directionality when either the overall directionality or the entry directionality of the
370     * string is opposite to that of the context. Note that as opposed to the overall
371     * directionality, the entry and exit directionalities are determined from the string itself.
372     * <p>
373     * Does *not* do HTML-escaping.
374     *
375     * @param str The input string.
376     * @param heuristic The algorithm to be used to estimate the string's overall direction.
377     *        See {@link TextDirectionHeuristics} for pre-defined heuristics.
378     * @param isolate Whether to directionally isolate the string to prevent it from garbling the
379     *     content around it
380     * @return Input string after applying the above processing.
381     */
382    public String unicodeWrap(String str, TextDirectionHeuristic heuristic, boolean isolate) {
383        final boolean isRtl = heuristic.isRtl(str, 0, str.length());
384        StringBuilder result = new StringBuilder();
385        if (getStereoReset() && isolate) {
386            result.append(markBefore(str,
387                    isRtl ? TextDirectionHeuristics.RTL : TextDirectionHeuristics.LTR));
388        }
389        if (isRtl != mIsRtlContext) {
390            result.append(isRtl ? RLE : LRE);
391            result.append(str);
392            result.append(PDF);
393        } else {
394            result.append(str);
395        }
396        if (isolate) {
397            result.append(markAfter(str,
398                    isRtl ? TextDirectionHeuristics.RTL : TextDirectionHeuristics.LTR));
399        }
400        return result.toString();
401    }
402
403    /**
404     * Operates like {@link #unicodeWrap(String, TextDirectionHeuristic, boolean)}, but assumes
405     * {@code isolate} is true.
406     *
407     * @param str The input string.
408     * @param heuristic The algorithm to be used to estimate the string's overall direction.
409     *        See {@link TextDirectionHeuristics} for pre-defined heuristics.
410     * @return Input string after applying the above processing.
411     */
412    public String unicodeWrap(String str, TextDirectionHeuristic heuristic) {
413        return unicodeWrap(str, heuristic, true /* isolate */);
414    }
415
416    /**
417     * Operates like {@link #unicodeWrap(String, TextDirectionHeuristic, boolean)}, but uses the
418     * formatter's default direction estimation algorithm.
419     *
420     * @param str The input string.
421     * @param isolate Whether to directionally isolate the string to prevent it from garbling the
422     *     content around it
423     * @return Input string after applying the above processing.
424     */
425    public String unicodeWrap(String str, boolean isolate) {
426        return unicodeWrap(str, mDefaultTextDirectionHeuristic, isolate);
427    }
428
429    /**
430     * Operates like {@link #unicodeWrap(String, TextDirectionHeuristic, boolean)}, but uses the
431     * formatter's default direction estimation algorithm and assumes {@code isolate} is true.
432     *
433     * @param str The input string.
434     * @return Input string after applying the above processing.
435     */
436    public String unicodeWrap(String str) {
437        return unicodeWrap(str, mDefaultTextDirectionHeuristic, true /* isolate */);
438    }
439
440    /**
441     * Helper method to return true if the Locale directionality is RTL.
442     *
443     * @param locale The Locale whose directionality will be checked to be RTL or LTR
444     * @return true if the {@code locale} directionality is RTL. False otherwise.
445     */
446    private static boolean isRtlLocale(Locale locale) {
447        return (TextUtils.getLayoutDirectionFromLocale(locale) == View.LAYOUT_DIRECTION_RTL);
448    }
449
450    /**
451     * Enum for directionality type.
452     */
453    private static final int DIR_LTR = -1;
454    private static final int DIR_UNKNOWN = 0;
455    private static final int DIR_RTL = +1;
456
457    /**
458     * Returns the directionality of the last character with strong directionality in the string, or
459     * DIR_UNKNOWN if none was encountered. For efficiency, actually scans backwards from the end of
460     * the string. Treats a non-BN character between an LRE/RLE/LRO/RLO and its matching PDF as a
461     * strong character, LTR after LRE/LRO, and RTL after RLE/RLO. The results are undefined for a
462     * string containing unbalanced LRE/RLE/LRO/RLO/PDF characters. The intended use is to check
463     * whether a logically separate item that starts with a number or a character of the string's
464     * exit directionality and follows this string inline (not counting any neutral characters in
465     * between) would "stick" to it in an opposite-directionality context, thus being displayed in
466     * an incorrect position. An LRM or RLM character (the one of the context's directionality)
467     * between the two will prevent such sticking.
468     *
469     * @param str the string to check.
470     */
471    private static int getExitDir(String str) {
472        return new DirectionalityEstimator(str, false /* isHtml */).getExitDir();
473    }
474
475    /**
476     * Returns the directionality of the first character with strong directionality in the string,
477     * or DIR_UNKNOWN if none was encountered. Treats a non-BN character between an
478     * LRE/RLE/LRO/RLO and its matching PDF as a strong character, LTR after LRE/LRO, and RTL after
479     * RLE/RLO. The results are undefined for a string containing unbalanced LRE/RLE/LRO/RLO/PDF
480     * characters. The intended use is to check whether a logically separate item that ends with a
481     * character of the string's entry directionality and precedes the string inline (not counting
482     * any neutral characters in between) would "stick" to it in an opposite-directionality context,
483     * thus being displayed in an incorrect position. An LRM or RLM character (the one of the
484     * context's directionality) between the two will prevent such sticking.
485     *
486     * @param str the string to check.
487     */
488    private static int getEntryDir(String str) {
489        return new DirectionalityEstimator(str, false /* isHtml */).getEntryDir();
490    }
491
492    /**
493     * An object that estimates the directionality of a given string by various methods.
494     *
495     */
496    private static class DirectionalityEstimator {
497
498        // Internal static variables and constants.
499
500        /**
501         * Size of the bidi character class cache. The results of the Character.getDirectionality()
502         * calls on the lowest DIR_TYPE_CACHE_SIZE codepoints are kept in an array for speed.
503         * The 0x700 value is designed to leave all the European and Near Eastern languages in the
504         * cache. It can be reduced to 0x180, restricting the cache to the Western European
505         * languages.
506         */
507        private static final int DIR_TYPE_CACHE_SIZE = 0x700;
508
509        /**
510         * The bidi character class cache.
511         */
512        private static final byte DIR_TYPE_CACHE[];
513
514        static {
515            DIR_TYPE_CACHE = new byte[DIR_TYPE_CACHE_SIZE];
516            for (int i = 0; i < DIR_TYPE_CACHE_SIZE; i++) {
517                DIR_TYPE_CACHE[i] = Character.getDirectionality(i);
518            }
519        }
520
521        // Internal instance variables.
522
523        /**
524         * The text to be scanned.
525         */
526        private final String text;
527
528        /**
529         * Whether the text to be scanned is to be treated as HTML, i.e. skipping over tags and
530         * entities when looking for the next / preceding dir type.
531         */
532        private final boolean isHtml;
533
534        /**
535         * The length of the text in chars.
536         */
537        private final int length;
538
539        /**
540         * The current position in the text.
541         */
542        private int charIndex;
543
544        /**
545         * The char encountered by the last dirTypeForward or dirTypeBackward call. If it
546         * encountered a supplementary codepoint, this contains a char that is not a valid
547         * codepoint. This is ok, because this member is only used to detect some well-known ASCII
548         * syntax, e.g. "http://" and the beginning of an HTML tag or entity.
549         */
550        private char lastChar;
551
552        /**
553         * Constructor.
554         *
555         * @param text The string to scan.
556         * @param isHtml Whether the text to be scanned is to be treated as HTML, i.e. skipping over
557         *     tags and entities.
558         */
559        DirectionalityEstimator(String text, boolean isHtml) {
560            this.text = text;
561            this.isHtml = isHtml;
562            length = text.length();
563        }
564
565        /**
566         * Returns the directionality of the first character with strong directionality in the
567         * string, or DIR_UNKNOWN if none was encountered. Treats a non-BN character between an
568         * LRE/RLE/LRO/RLO and its matching PDF as a strong character, LTR after LRE/LRO, and RTL
569         * after RLE/RLO. The results are undefined for a string containing unbalanced
570         * LRE/RLE/LRO/RLO/PDF characters.
571         */
572        int getEntryDir() {
573            // The reason for this method name, as opposed to getFirstStrongDir(), is that
574            // "first strong" is a commonly used description of Unicode's estimation algorithm,
575            // but the two must treat formatting characters quite differently. Thus, we are staying
576            // away from both "first" and "last" in these method names to avoid confusion.
577            charIndex = 0;
578            int embeddingLevel = 0;
579            int embeddingLevelDir = DIR_UNKNOWN;
580            int firstNonEmptyEmbeddingLevel = 0;
581            while (charIndex < length && firstNonEmptyEmbeddingLevel == 0) {
582                switch (dirTypeForward()) {
583                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING:
584                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE:
585                        ++embeddingLevel;
586                        embeddingLevelDir = DIR_LTR;
587                        break;
588                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING:
589                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE:
590                        ++embeddingLevel;
591                        embeddingLevelDir = DIR_RTL;
592                        break;
593                    case Character.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT:
594                        --embeddingLevel;
595                        // To restore embeddingLevelDir to its previous value, we would need a
596                        // stack, which we want to avoid. Thus, at this point we do not know the
597                        // current embedding's directionality.
598                        embeddingLevelDir = DIR_UNKNOWN;
599                        break;
600                    case Character.DIRECTIONALITY_BOUNDARY_NEUTRAL:
601                        break;
602                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT:
603                        if (embeddingLevel == 0) {
604                            return DIR_LTR;
605                        }
606                        firstNonEmptyEmbeddingLevel = embeddingLevel;
607                        break;
608                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT:
609                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC:
610                        if (embeddingLevel == 0) {
611                            return DIR_RTL;
612                        }
613                        firstNonEmptyEmbeddingLevel = embeddingLevel;
614                        break;
615                    default:
616                        firstNonEmptyEmbeddingLevel = embeddingLevel;
617                        break;
618                }
619            }
620
621            // We have either found a non-empty embedding or scanned the entire string finding
622            // neither a non-empty embedding nor a strong character outside of an embedding.
623            if (firstNonEmptyEmbeddingLevel == 0) {
624                // We have not found a non-empty embedding. Thus, the string contains neither a
625                // non-empty embedding nor a strong character outside of an embedding.
626                return DIR_UNKNOWN;
627            }
628
629            // We have found a non-empty embedding.
630            if (embeddingLevelDir != DIR_UNKNOWN) {
631                // We know the directionality of the non-empty embedding.
632                return embeddingLevelDir;
633            }
634
635            // We do not remember the directionality of the non-empty embedding we found. So, we go
636            // backwards to find the start of the non-empty embedding and get its directionality.
637            while (charIndex > 0) {
638                switch (dirTypeBackward()) {
639                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING:
640                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE:
641                        if (firstNonEmptyEmbeddingLevel == embeddingLevel) {
642                            return DIR_LTR;
643                        }
644                        --embeddingLevel;
645                        break;
646                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING:
647                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE:
648                        if (firstNonEmptyEmbeddingLevel == embeddingLevel) {
649                            return DIR_RTL;
650                        }
651                        --embeddingLevel;
652                        break;
653                    case Character.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT:
654                        ++embeddingLevel;
655                        break;
656                }
657            }
658            // We should never get here.
659            return DIR_UNKNOWN;
660        }
661
662        /**
663         * Returns the directionality of the last character with strong directionality in the
664         * string, or DIR_UNKNOWN if none was encountered. For efficiency, actually scans backwards
665         * from the end of the string. Treats a non-BN character between an LRE/RLE/LRO/RLO and its
666         * matching PDF as a strong character, LTR after LRE/LRO, and RTL after RLE/RLO. The results
667         * are undefined for a string containing unbalanced LRE/RLE/LRO/RLO/PDF characters.
668         */
669        int getExitDir() {
670            // The reason for this method name, as opposed to getLastStrongDir(), is that "last
671            // strong" sounds like the exact opposite of "first strong", which is a commonly used
672            // description of Unicode's estimation algorithm (getUnicodeDir() above), but the two
673            // must treat formatting characters quite differently. Thus, we are staying away from
674            // both "first" and "last" in these method names to avoid confusion.
675            charIndex = length;
676            int embeddingLevel = 0;
677            int lastNonEmptyEmbeddingLevel = 0;
678            while (charIndex > 0) {
679                switch (dirTypeBackward()) {
680                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT:
681                        if (embeddingLevel == 0) {
682                            return DIR_LTR;
683                        }
684                        if (lastNonEmptyEmbeddingLevel == 0) {
685                            lastNonEmptyEmbeddingLevel = embeddingLevel;
686                        }
687                        break;
688                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING:
689                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE:
690                        if (lastNonEmptyEmbeddingLevel == embeddingLevel) {
691                            return DIR_LTR;
692                        }
693                        --embeddingLevel;
694                        break;
695                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT:
696                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC:
697                        if (embeddingLevel == 0) {
698                            return DIR_RTL;
699                        }
700                        if (lastNonEmptyEmbeddingLevel == 0) {
701                            lastNonEmptyEmbeddingLevel = embeddingLevel;
702                        }
703                        break;
704                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING:
705                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE:
706                        if (lastNonEmptyEmbeddingLevel == embeddingLevel) {
707                            return DIR_RTL;
708                        }
709                        --embeddingLevel;
710                        break;
711                    case Character.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT:
712                        ++embeddingLevel;
713                        break;
714                    case Character.DIRECTIONALITY_BOUNDARY_NEUTRAL:
715                        break;
716                    default:
717                        if (lastNonEmptyEmbeddingLevel == 0) {
718                            lastNonEmptyEmbeddingLevel = embeddingLevel;
719                        }
720                        break;
721                }
722            }
723            return DIR_UNKNOWN;
724        }
725
726        // Internal methods
727
728        /**
729         * Gets the bidi character class, i.e. Character.getDirectionality(), of a given char, using
730         * a cache for speed. Not designed for supplementary codepoints, whose results we do not
731         * cache.
732         */
733        private static byte getCachedDirectionality(char c) {
734            return c < DIR_TYPE_CACHE_SIZE ? DIR_TYPE_CACHE[c] : Character.getDirectionality(c);
735        }
736
737        /**
738         * Returns the Character.DIRECTIONALITY_... value of the next codepoint and advances
739         * charIndex. If isHtml, and the codepoint is '<' or '&', advances through the tag/entity,
740         * and returns Character.DIRECTIONALITY_WHITESPACE. For an entity, it would be best to
741         * figure out the actual character, and return its dirtype, but treating it as whitespace is
742         * good enough for our purposes.
743         *
744         * @throws java.lang.IndexOutOfBoundsException if called when charIndex >= length or < 0.
745         */
746        byte dirTypeForward() {
747            lastChar = text.charAt(charIndex);
748            if (Character.isHighSurrogate(lastChar)) {
749                int codePoint = Character.codePointAt(text, charIndex);
750                charIndex += Character.charCount(codePoint);
751                return Character.getDirectionality(codePoint);
752            }
753            charIndex++;
754            byte dirType = getCachedDirectionality(lastChar);
755            if (isHtml) {
756                // Process tags and entities.
757                if (lastChar == '<') {
758                    dirType = skipTagForward();
759                } else if (lastChar == '&') {
760                    dirType = skipEntityForward();
761                }
762            }
763            return dirType;
764        }
765
766        /**
767         * Returns the Character.DIRECTIONALITY_... value of the preceding codepoint and advances
768         * charIndex backwards. If isHtml, and the codepoint is the end of a complete HTML tag or
769         * entity, advances over the whole tag/entity and returns
770         * Character.DIRECTIONALITY_WHITESPACE. For an entity, it would be best to figure out the
771         * actual character, and return its dirtype, but treating it as whitespace is good enough
772         * for our purposes.
773         *
774         * @throws java.lang.IndexOutOfBoundsException if called when charIndex > length or <= 0.
775         */
776        byte dirTypeBackward() {
777            lastChar = text.charAt(charIndex - 1);
778            if (Character.isLowSurrogate(lastChar)) {
779                int codePoint = Character.codePointBefore(text, charIndex);
780                charIndex -= Character.charCount(codePoint);
781                return Character.getDirectionality(codePoint);
782            }
783            charIndex--;
784            byte dirType = getCachedDirectionality(lastChar);
785            if (isHtml) {
786                // Process tags and entities.
787                if (lastChar == '>') {
788                    dirType = skipTagBackward();
789                } else if (lastChar == ';') {
790                    dirType = skipEntityBackward();
791                }
792            }
793            return dirType;
794        }
795
796        /**
797         * Advances charIndex forward through an HTML tag (after the opening &lt; has already been
798         * read) and returns Character.DIRECTIONALITY_WHITESPACE. If there is no matching &gt;,
799         * does not change charIndex and returns Character.DIRECTIONALITY_OTHER_NEUTRALS (for the
800         * &lt; that hadn't been part of a tag after all).
801         */
802        private byte skipTagForward() {
803            int initialCharIndex = charIndex;
804            while (charIndex < length) {
805                lastChar = text.charAt(charIndex++);
806                if (lastChar == '>') {
807                    // The end of the tag.
808                    return Character.DIRECTIONALITY_WHITESPACE;
809                }
810                if (lastChar == '"' || lastChar == '\'') {
811                    // Skip over a quoted attribute value inside the tag.
812                    char quote = lastChar;
813                    while (charIndex < length && (lastChar = text.charAt(charIndex++)) != quote) {}
814                }
815            }
816            // The original '<' wasn't the start of a tag after all.
817            charIndex = initialCharIndex;
818            lastChar = '<';
819            return Character.DIRECTIONALITY_OTHER_NEUTRALS;
820        }
821
822        /**
823         * Advances charIndex backward through an HTML tag (after the closing &gt; has already been
824         * read) and returns Character.DIRECTIONALITY_WHITESPACE. If there is no matching &lt;, does
825         * not change charIndex and returns Character.DIRECTIONALITY_OTHER_NEUTRALS (for the &gt;
826         * that hadn't been part of a tag after all). Nevertheless, the running time for calling
827         * skipTagBackward() in a loop remains linear in the size of the text, even for a text like
828         * "&gt;&gt;&gt;&gt;", because skipTagBackward() also stops looking for a matching &lt;
829         * when it encounters another &gt;.
830         */
831        private byte skipTagBackward() {
832            int initialCharIndex = charIndex;
833            while (charIndex > 0) {
834                lastChar = text.charAt(--charIndex);
835                if (lastChar == '<') {
836                    // The start of the tag.
837                    return Character.DIRECTIONALITY_WHITESPACE;
838                }
839                if (lastChar == '>') {
840                    break;
841                }
842                if (lastChar == '"' || lastChar == '\'') {
843                    // Skip over a quoted attribute value inside the tag.
844                    char quote = lastChar;
845                    while (charIndex > 0 && (lastChar = text.charAt(--charIndex)) != quote) {}
846                }
847            }
848            // The original '>' wasn't the end of a tag after all.
849            charIndex = initialCharIndex;
850            lastChar = '>';
851            return Character.DIRECTIONALITY_OTHER_NEUTRALS;
852        }
853
        /**
         * Advances charIndex forward through an HTML character entity (after the opening
         * &amp; has already been read) and returns Character.DIRECTIONALITY_WHITESPACE. It would be
         * best to figure out the actual character and return its dirtype, but this is good enough.
         * If there is no closing ;, charIndex ends up at the end of the text.
         */
859        private byte skipEntityForward() {
860            while (charIndex < length && (lastChar = text.charAt(charIndex++)) != ';') {}
861            return Character.DIRECTIONALITY_WHITESPACE;
862        }
863
        /**
         * Advances charIndex backward through an HTML character entity (after the closing ; has
         * already been read) and returns Character.DIRECTIONALITY_WHITESPACE. It would be best to
         * figure out the actual character and return its dirtype, but this is good enough.
         * If there is no matching &amp;, does not change charIndex and returns
         * Character.DIRECTIONALITY_OTHER_NEUTRALS (for the ';' that did not end an entity after
         * all). Nevertheless, the running time for calling skipEntityBackward() in a loop remains
         * linear in the size of the text, even for a text like ";;;;;;;", because
         * skipEntityBackward() also stops looking for a matching &amp; when it encounters
         * another ;.
         */
874        private byte skipEntityBackward() {
875            int initialCharIndex = charIndex;
876            while (charIndex > 0) {
877                lastChar = text.charAt(--charIndex);
878                if (lastChar == '&') {
879                    return Character.DIRECTIONALITY_WHITESPACE;
880                }
881                if (lastChar == ';') {
882                    break;
883                }
884            }
885            charIndex = initialCharIndex;
886            lastChar = ';';
887            return Character.DIRECTIONALITY_OTHER_NEUTRALS;
888        }
889    }
890}