BidiFormatter.java revision c42363ad309d523d65fe8b66d16786a1d372805e
1/*
2 * Copyright (C) 2013 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package android.text;
18
19import android.view.View;
20
21import static android.text.TextDirectionHeuristics.FIRSTSTRONG_LTR;
22
23import java.util.Locale;
24
25/**
26 * Utility class for formatting text for display in a potentially opposite-directionality context
27 * without garbling. The directionality of the context is set at formatter creation and the
28 * directionality of the text can be either estimated or passed in when known.
29 *
30 * <p>To support versions lower than {@link android.os.Build.VERSION_CODES#JELLY_BEAN_MR2},
31 * you can use the support library's {@link android.support.v4.text.BidiFormatter} class.
32 *
 * <p>These APIs provide the following functionality:
34 * <p>
35 * 1. Bidi Wrapping
36 * When text in one language is mixed into a document in another, opposite-directionality language,
37 * e.g. when an English business name is embedded in some Hebrew text, both the inserted string
38 * and the text surrounding it may be displayed incorrectly unless the inserted string is explicitly
39 * separated from the surrounding text in a "wrapper" that:
40 * <p>
41 * - Declares its directionality so that the string is displayed correctly. This can be done in
42 *   Unicode bidi formatting codes by {@link #unicodeWrap} and similar methods.
43 * <p>
44 * - Isolates the string's directionality, so it does not unduly affect the surrounding content.
45 *   Currently, this can only be done using invisible Unicode characters of the same direction as
46 *   the context (LRM or RLM) in addition to the directionality declaration above, thus "resetting"
47 *   the directionality to that of the context. The "reset" may need to be done at both ends of the
48 *   string. Without "reset" after the string, the string will "stick" to a number or logically
49 *   separate opposite-direction text that happens to follow it in-line (even if separated by
50 *   neutral content like spaces and punctuation). Without "reset" before the string, the same can
51 *   happen there, but only with more opposite-direction text, not a number. One approach is to
52 *   "reset" the direction only after each string, on the theory that if the preceding opposite-
53 *   direction text is itself bidi-wrapped, the "reset" after it will prevent the sticking. (Doing
54 *   the "reset" only before each string definitely does not work because we do not want to require
55 *   bidi-wrapping numbers, and a bidi-wrapped opposite-direction string could be followed by a
56 *   number.) Still, the safest policy is to do the "reset" on both ends of each string, since RTL
57 *   message translations often contain untranslated Latin-script brand names and technical terms,
58 *   and one of these can be followed by a bidi-wrapped inserted value. On the other hand, when one
59 *   has such a message, it is best to do the "reset" manually in the message translation itself,
60 *   since the message's opposite-direction text could be followed by an inserted number, which we
 *   would not bidi-wrap anyway. Nevertheless, "reset" on both ends is the current default. In an
62 *   alternative to "reset", recent additions to the HTML, CSS, and Unicode standards allow the
63 *   isolation to be part of the directionality declaration. This form of isolation is better than
64 *   "reset" because it takes less space, does not require knowing the context directionality, has a
65 *   gentler effect than "reset", and protects both ends of the string. However, we do not yet allow
66 *   using it because required platforms do not yet support it.
67 * <p>
68 * Providing these wrapping services is the basic purpose of the bidi formatter.
69 * <p>
70 * 2. Directionality estimation
71 * How does one know whether a string about to be inserted into surrounding text has the same
72 * directionality? Well, in many cases, one knows that this must be the case when writing the code
73 * doing the insertion, e.g. when a localized message is inserted into a localized page. In such
74 * cases there is no need to involve the bidi formatter at all. In some other cases, it need not be
75 * the same as the context, but is either constant (e.g. urls are always LTR) or otherwise known.
76 * In the remaining cases, e.g. when the string is user-entered or comes from a database, the
77 * language of the string (and thus its directionality) is not known a priori, and must be
78 * estimated at run-time. The bidi formatter can do this automatically using the default
79 * first-strong estimation algorithm. It can also be configured to use a custom directionality
80 * estimation object.
81 */
82public final class BidiFormatter {
83
84    /**
85     * The default text direction heuristic.
86     */
87    private static TextDirectionHeuristic DEFAULT_TEXT_DIRECTION_HEURISTIC = FIRSTSTRONG_LTR;
88
89    /**
90     * Unicode "Left-To-Right Embedding" (LRE) character.
91     */
92    private static final char LRE = '\u202A';
93
94    /**
95     * Unicode "Right-To-Left Embedding" (RLE) character.
96     */
97    private static final char RLE = '\u202B';
98
99    /**
100     * Unicode "Pop Directional Formatting" (PDF) character.
101     */
102    private static final char PDF = '\u202C';
103
104    /**
105     *  Unicode "Left-To-Right Mark" (LRM) character.
106     */
107    private static final char LRM = '\u200E';
108
109    /*
110     * Unicode "Right-To-Left Mark" (RLM) character.
111     */
112    private static final char RLM = '\u200F';
113
114    /*
115     * String representation of LRM
116     */
117    private static final String LRM_STRING = Character.toString(LRM);
118
119    /*
120     * String representation of RLM
121     */
122    private static final String RLM_STRING = Character.toString(RLM);
123
124    /**
125     * Empty string constant.
126     */
127    private static final String EMPTY_STRING = "";
128
129    /**
130     * A class for building a BidiFormatter with non-default options.
131     */
132    public static final class Builder {
133        private boolean mIsRtlContext;
134        private int mFlags;
135        private TextDirectionHeuristic mTextDirectionHeuristic;
136
137        /**
138         * Constructor.
139         *
140         */
141        public Builder() {
142            initialize(isRtlLocale(Locale.getDefault()));
143        }
144
145        /**
146         * Constructor.
147         *
148         * @param rtlContext Whether the context directionality is RTL.
149         */
150        public Builder(boolean rtlContext) {
151            initialize(rtlContext);
152        }
153
154        /**
155         * Constructor.
156         *
157         * @param locale The context locale.
158         */
159        public Builder(Locale locale) {
160            initialize(isRtlLocale(locale));
161        }
162
163        /**
164         * Initializes the builder with the given context directionality and default options.
165         *
166         * @param isRtlContext Whether the context is RTL or not.
167         */
168        private void initialize(boolean isRtlContext) {
169            mIsRtlContext = isRtlContext;
170            mTextDirectionHeuristic = DEFAULT_TEXT_DIRECTION_HEURISTIC;
171            mFlags = DEFAULT_FLAGS;
172        }
173
174        /**
175         * Specifies whether the BidiFormatter to be built should also "reset" directionality before
176         * a string being bidi-wrapped, not just after it. The default is true.
177         */
178        public Builder stereoReset(boolean stereoReset) {
179            if (stereoReset) {
180                mFlags |= FLAG_STEREO_RESET;
181            } else {
182                mFlags &= ~FLAG_STEREO_RESET;
183            }
184            return this;
185        }
186
187        /**
188         * Specifies the default directionality estimation algorithm to be used by the BidiFormatter.
189         * By default, uses the first-strong heuristic.
190         *
191         * @param heuristic the {@code TextDirectionHeuristic} to use.
192         * @return the builder itself.
193         */
194        public Builder setTextDirectionHeuristic(TextDirectionHeuristic heuristic) {
195            mTextDirectionHeuristic = heuristic;
196            return this;
197        }
198
199        /**
200         * @return A BidiFormatter with the specified options.
201         */
202        public BidiFormatter build() {
203            if (mFlags == DEFAULT_FLAGS &&
204                    mTextDirectionHeuristic == DEFAULT_TEXT_DIRECTION_HEURISTIC) {
205                return BidiFormatter.getDefaultInstanceFromContext(mIsRtlContext);
206            }
207            return new BidiFormatter(mIsRtlContext, mFlags, mTextDirectionHeuristic);
208        }
209    }
210
    // Option flags, the shared default instances, and the per-formatter state.

    /** Flag bit: also "reset" directionality before a bidi-wrapped string, not just after it. */
    private static final int FLAG_STEREO_RESET = 2;
    /** Flags used when no option is specified: stereo reset is enabled. */
    private static final int DEFAULT_FLAGS = FLAG_STEREO_RESET;

    /** Shared formatter with default options for an LTR context. */
    private static final BidiFormatter DEFAULT_LTR_INSTANCE = new BidiFormatter(
            false /* LTR context */,
            DEFAULT_FLAGS,
            DEFAULT_TEXT_DIRECTION_HEURISTIC);

    /** Shared formatter with default options for an RTL context. */
    private static final BidiFormatter DEFAULT_RTL_INSTANCE = new BidiFormatter(
            true /* RTL context */,
            DEFAULT_FLAGS,
            DEFAULT_TEXT_DIRECTION_HEURISTIC);

    /** Whether the context this formatter produces text for is RTL. */
    private final boolean mIsRtlContext;
    /** Bit set of option flags (see FLAG_STEREO_RESET). */
    private final int mFlags;
    /** Heuristic used to estimate directionality when the caller does not supply one. */
    private final TextDirectionHeuristic mDefaultTextDirectionHeuristic;
228
229    /**
230     * Factory for creating an instance of BidiFormatter for the default locale directionality.
231     *
232     * This does not create any new objects, and returns already existing static instances.
233     *
234     */
235    public static BidiFormatter getInstance() {
236        return getDefaultInstanceFromContext(isRtlLocale(Locale.getDefault()));
237    }
238
239    /**
240     * Factory for creating an instance of BidiFormatter given the context directionality.
241     *
242     * This does not create any new objects, and returns already existing static instances.
243     *
244     * @param rtlContext Whether the context directionality is RTL.
245     */
246    public static BidiFormatter getInstance(boolean rtlContext) {
247        return getDefaultInstanceFromContext(rtlContext);
248    }
249
250    /**
251     * Factory for creating an instance of BidiFormatter given the context locale.
252     *
253     * This does not create any new objects, and returns already existing static instances.
254     *
255     * @param locale The context locale.
256     */
257    public static BidiFormatter getInstance(Locale locale) {
258        return getDefaultInstanceFromContext(isRtlLocale(locale));
259    }
260
261    /**
262     * @param isRtlContext Whether the context directionality is RTL or not.
263     * @param flags The option flags.
264     * @param heuristic The default text direction heuristic.
265     */
266    private BidiFormatter(boolean isRtlContext, int flags, TextDirectionHeuristic heuristic) {
267        mIsRtlContext = isRtlContext;
268        mFlags = flags;
269        mDefaultTextDirectionHeuristic = heuristic;
270    }
271
272    /**
273     * @return Whether the context directionality is RTL
274     */
275    public boolean isRtlContext() {
276        return mIsRtlContext;
277    }
278
279    /**
280     * @return Whether directionality "reset" should also be done before a string being
281     * bidi-wrapped, not just after it.
282     */
283    public boolean getStereoReset() {
284        return (mFlags & FLAG_STEREO_RESET) != 0;
285    }
286
287    /**
288     * Returns a Unicode bidi mark matching the context directionality (LRM or RLM) if either the
289     * overall or the exit directionality of a given string is opposite to the context directionality.
290     * Putting this after the string (including its directionality declaration wrapping) prevents it
291     * from "sticking" to other opposite-directionality text or a number appearing after it inline
292     * with only neutral content in between. Otherwise returns the empty string. While the exit
293     * directionality is determined by scanning the end of the string, the overall directionality is
294     * given explicitly by a heuristic to estimate the {@code str}'s directionality.
295     *
296     * @param str String after which the mark may need to appear.
297     * @param heuristic The text direction heuristic that will be used to estimate the {@code str}'s
298     *                  directionality.
299     * @return LRM for RTL text in LTR context; RLM for LTR text in RTL context;
300     *     else, the empty string.
301     *
302     * @hide
303     */
304    public String markAfter(String str, TextDirectionHeuristic heuristic) {
305        final boolean isRtl = heuristic.isRtl(str, 0, str.length());
306        // getExitDir() is called only if needed (short-circuit).
307        if (!mIsRtlContext && (isRtl || getExitDir(str) == DIR_RTL)) {
308            return LRM_STRING;
309        }
310        if (mIsRtlContext && (!isRtl || getExitDir(str) == DIR_LTR)) {
311            return RLM_STRING;
312        }
313        return EMPTY_STRING;
314    }
315
316    /**
317     * Returns a Unicode bidi mark matching the context directionality (LRM or RLM) if either the
318     * overall or the entry directionality of a given string is opposite to the context
319     * directionality. Putting this before the string (including its directionality declaration
320     * wrapping) prevents it from "sticking" to other opposite-directionality text appearing before
321     * it inline with only neutral content in between. Otherwise returns the empty string. While the
322     * entry directionality is determined by scanning the beginning of the string, the overall
323     * directionality is given explicitly by a heuristic to estimate the {@code str}'s directionality.
324     *
325     * @param str String before which the mark may need to appear.
326     * @param heuristic The text direction heuristic that will be used to estimate the {@code str}'s
327     *                  directionality.
328     * @return LRM for RTL text in LTR context; RLM for LTR text in RTL context;
329     *     else, the empty string.
330     *
331     * @hide
332     */
333    public String markBefore(String str, TextDirectionHeuristic heuristic) {
334        final boolean isRtl = heuristic.isRtl(str, 0, str.length());
335        // getEntryDir() is called only if needed (short-circuit).
336        if (!mIsRtlContext && (isRtl || getEntryDir(str) == DIR_RTL)) {
337            return LRM_STRING;
338        }
339        if (mIsRtlContext && (!isRtl || getEntryDir(str) == DIR_LTR)) {
340            return RLM_STRING;
341        }
342        return EMPTY_STRING;
343    }
344
345    /**
346     * Estimates the directionality of a string using the default text direction heuristic.
347     *
348     * @param str String whose directionality is to be estimated.
349     * @return true if {@code str}'s estimated overall directionality is RTL. Otherwise returns
350     *          false.
351     */
352    public boolean isRtl(String str) {
353        return mDefaultTextDirectionHeuristic.isRtl(str, 0, str.length());
354    }
355
356    /**
357     * Formats a string of given directionality for use in plain-text output of the context
358     * directionality, so an opposite-directionality string is neither garbled nor garbles its
359     * surroundings. This makes use of Unicode bidi formatting characters.
360     * <p>
361     * The algorithm: In case the given directionality doesn't match the context directionality, wraps
362     * the string with Unicode bidi formatting characters: RLE+{@code str}+PDF for RTL text, or
363     * LRE+{@code str}+PDF for LTR text.
364     * <p>
365     * If {@code isolate}, directionally isolates the string so that it does not garble its
366     * surroundings. Currently, this is done by "resetting" the directionality after the string by
367     * appending a trailing Unicode bidi mark matching the context directionality (LRM or RLM) when
368     * either the overall directionality or the exit directionality of the string is opposite to
369     * that of the context. Unless the formatter was built using
370     * {@link Builder#stereoReset(boolean)} with a {@code false} argument, also prepends a Unicode
371     * bidi mark matching the context directionality when either the overall directionality or the
372     * entry directionality of the string is opposite to that of the context. Note that as opposed
373     * to the overall directionality, the entry and exit directionalities are determined from the
374     * string itself.
375     * <p>
376     * Does *not* do HTML-escaping.
377     *
378     * @param str The input string.
379     * @param heuristic The algorithm to be used to estimate the string's overall direction.
380     *        See {@link TextDirectionHeuristics} for pre-defined heuristics.
381     * @param isolate Whether to directionally isolate the string to prevent it from garbling the
382     *     content around it
383     * @return Input string after applying the above processing. {@code null} if {@code str} is
384     *     {@code null}.
385     */
386    public String unicodeWrap(String str, TextDirectionHeuristic heuristic, boolean isolate) {
387        if (str == null) return null;
388        final boolean isRtl = heuristic.isRtl(str, 0, str.length());
389        StringBuilder result = new StringBuilder();
390        if (getStereoReset() && isolate) {
391            result.append(markBefore(str,
392                    isRtl ? TextDirectionHeuristics.RTL : TextDirectionHeuristics.LTR));
393        }
394        if (isRtl != mIsRtlContext) {
395            result.append(isRtl ? RLE : LRE);
396            result.append(str);
397            result.append(PDF);
398        } else {
399            result.append(str);
400        }
401        if (isolate) {
402            result.append(markAfter(str,
403                    isRtl ? TextDirectionHeuristics.RTL : TextDirectionHeuristics.LTR));
404        }
405        return result.toString();
406    }
407
408    /**
409     * Operates like {@link #unicodeWrap(String, TextDirectionHeuristic, boolean)}, but assumes
410     * {@code isolate} is true.
411     *
412     * @param str The input string.
413     * @param heuristic The algorithm to be used to estimate the string's overall direction.
414     *        See {@link TextDirectionHeuristics} for pre-defined heuristics.
415     * @return Input string after applying the above processing.
416     */
417    public String unicodeWrap(String str, TextDirectionHeuristic heuristic) {
418        return unicodeWrap(str, heuristic, true /* isolate */);
419    }
420
421    /**
422     * Operates like {@link #unicodeWrap(String, TextDirectionHeuristic, boolean)}, but uses the
423     * formatter's default direction estimation algorithm.
424     *
425     * @param str The input string.
426     * @param isolate Whether to directionally isolate the string to prevent it from garbling the
427     *     content around it
428     * @return Input string after applying the above processing.
429     */
430    public String unicodeWrap(String str, boolean isolate) {
431        return unicodeWrap(str, mDefaultTextDirectionHeuristic, isolate);
432    }
433
434    /**
435     * Operates like {@link #unicodeWrap(String, TextDirectionHeuristic, boolean)}, but uses the
436     * formatter's default direction estimation algorithm and assumes {@code isolate} is true.
437     *
438     * @param str The input string.
439     * @return Input string after applying the above processing.
440     */
441    public String unicodeWrap(String str) {
442        return unicodeWrap(str, mDefaultTextDirectionHeuristic, true /* isolate */);
443    }
444
445    private static BidiFormatter getDefaultInstanceFromContext(boolean isRtlContext) {
446        return isRtlContext ? DEFAULT_RTL_INSTANCE : DEFAULT_LTR_INSTANCE;
447    }
448
449    /**
450     * Helper method to return true if the Locale directionality is RTL.
451     *
452     * @param locale The Locale whose directionality will be checked to be RTL or LTR
453     * @return true if the {@code locale} directionality is RTL. False otherwise.
454     */
455    private static boolean isRtlLocale(Locale locale) {
456        return (TextUtils.getLayoutDirectionFromLocale(locale) == View.LAYOUT_DIRECTION_RTL);
457    }
458
    /**
     * Enum for directionality type: the int values returned by the entry/exit directionality
     * scans. This one denotes left-to-right.
     */
    private static final int DIR_LTR = -1;
    /** No directionality could be determined. */
    private static final int DIR_UNKNOWN = 0;
    /** Right-to-left. */
    private static final int DIR_RTL = +1;
465
466    /**
467     * Returns the directionality of the last character with strong directionality in the string, or
468     * DIR_UNKNOWN if none was encountered. For efficiency, actually scans backwards from the end of
469     * the string. Treats a non-BN character between an LRE/RLE/LRO/RLO and its matching PDF as a
470     * strong character, LTR after LRE/LRO, and RTL after RLE/RLO. The results are undefined for a
471     * string containing unbalanced LRE/RLE/LRO/RLO/PDF characters. The intended use is to check
472     * whether a logically separate item that starts with a number or a character of the string's
473     * exit directionality and follows this string inline (not counting any neutral characters in
474     * between) would "stick" to it in an opposite-directionality context, thus being displayed in
475     * an incorrect position. An LRM or RLM character (the one of the context's directionality)
476     * between the two will prevent such sticking.
477     *
478     * @param str the string to check.
479     */
480    private static int getExitDir(String str) {
481        return new DirectionalityEstimator(str, false /* isHtml */).getExitDir();
482    }
483
484    /**
485     * Returns the directionality of the first character with strong directionality in the string,
486     * or DIR_UNKNOWN if none was encountered. Treats a non-BN character between an
487     * LRE/RLE/LRO/RLO and its matching PDF as a strong character, LTR after LRE/LRO, and RTL after
488     * RLE/RLO. The results are undefined for a string containing unbalanced LRE/RLE/LRO/RLO/PDF
489     * characters. The intended use is to check whether a logically separate item that ends with a
490     * character of the string's entry directionality and precedes the string inline (not counting
491     * any neutral characters in between) would "stick" to it in an opposite-directionality context,
492     * thus being displayed in an incorrect position. An LRM or RLM character (the one of the
493     * context's directionality) between the two will prevent such sticking.
494     *
495     * @param str the string to check.
496     */
497    private static int getEntryDir(String str) {
498        return new DirectionalityEstimator(str, false /* isHtml */).getEntryDir();
499    }
500
501    /**
502     * An object that estimates the directionality of a given string by various methods.
503     *
504     */
505    private static class DirectionalityEstimator {
506
507        // Internal static variables and constants.
508
509        /**
510         * Size of the bidi character class cache. The results of the Character.getDirectionality()
511         * calls on the lowest DIR_TYPE_CACHE_SIZE codepoints are kept in an array for speed.
512         * The 0x700 value is designed to leave all the European and Near Eastern languages in the
513         * cache. It can be reduced to 0x180, restricting the cache to the Western European
514         * languages.
515         */
516        private static final int DIR_TYPE_CACHE_SIZE = 0x700;
517
518        /**
519         * The bidi character class cache.
520         */
521        private static final byte DIR_TYPE_CACHE[];
522
523        static {
524            DIR_TYPE_CACHE = new byte[DIR_TYPE_CACHE_SIZE];
525            for (int i = 0; i < DIR_TYPE_CACHE_SIZE; i++) {
526                DIR_TYPE_CACHE[i] = Character.getDirectionality(i);
527            }
528        }
529
530        // Internal instance variables.
531
532        /**
533         * The text to be scanned.
534         */
535        private final String text;
536
537        /**
538         * Whether the text to be scanned is to be treated as HTML, i.e. skipping over tags and
539         * entities when looking for the next / preceding dir type.
540         */
541        private final boolean isHtml;
542
543        /**
544         * The length of the text in chars.
545         */
546        private final int length;
547
548        /**
549         * The current position in the text.
550         */
551        private int charIndex;
552
553        /**
554         * The char encountered by the last dirTypeForward or dirTypeBackward call. If it
555         * encountered a supplementary codepoint, this contains a char that is not a valid
556         * codepoint. This is ok, because this member is only used to detect some well-known ASCII
557         * syntax, e.g. "http://" and the beginning of an HTML tag or entity.
558         */
559        private char lastChar;
560
561        /**
562         * Constructor.
563         *
564         * @param text The string to scan.
565         * @param isHtml Whether the text to be scanned is to be treated as HTML, i.e. skipping over
566         *     tags and entities.
567         */
568        DirectionalityEstimator(String text, boolean isHtml) {
569            this.text = text;
570            this.isHtml = isHtml;
571            length = text.length();
572        }
573
        /**
         * Returns the directionality of the first character with strong directionality in the
         * string, or DIR_UNKNOWN if none was encountered. Treats a non-BN character between an
         * LRE/RLE/LRO/RLO and its matching PDF as a strong character, LTR after LRE/LRO, and RTL
         * after RLE/RLO. The results are undefined for a string containing unbalanced
         * LRE/RLE/LRO/RLO/PDF characters.
         *
         * <p>Works in two passes: a forward scan that stops at the first strong character outside
         * any embedding or at the first non-empty embedding, and, only when the direction of that
         * embedding is no longer remembered, a backward scan to the embedding's opening character.
         *
         * @return DIR_LTR, DIR_RTL or DIR_UNKNOWN.
         */
        int getEntryDir() {
            // The reason for this method name, as opposed to getFirstStrongDir(), is that
            // "first strong" is a commonly used description of Unicode's estimation algorithm,
            // but the two must treat formatting characters quite differently. Thus, we are staying
            // away from both "first" and "last" in these method names to avoid confusion.
            charIndex = 0;
            // Current nesting depth of LRE/RLE/LRO/RLO embeddings.
            int embeddingLevel = 0;
            // Direction of the most recently opened embedding; DIR_UNKNOWN once a PDF is seen.
            int embeddingLevelDir = DIR_UNKNOWN;
            // Depth at which the first non-BN, non-formatting character inside an embedding was
            // found; 0 while none has been found.
            int firstNonEmptyEmbeddingLevel = 0;
            // NOTE(review): dirTypeForward() is not visible here; presumably it advances
            // charIndex, which is what lets this loop terminate - confirm.
            while (charIndex < length && firstNonEmptyEmbeddingLevel == 0) {
                switch (dirTypeForward()) {
                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING:
                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE:
                        ++embeddingLevel;
                        embeddingLevelDir = DIR_LTR;
                        break;
                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING:
                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE:
                        ++embeddingLevel;
                        embeddingLevelDir = DIR_RTL;
                        break;
                    case Character.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT:
                        --embeddingLevel;
                        // To restore embeddingLevelDir to its previous value, we would need a
                        // stack, which we want to avoid. Thus, at this point we do not know the
                        // current embedding's directionality.
                        embeddingLevelDir = DIR_UNKNOWN;
                        break;
                    case Character.DIRECTIONALITY_BOUNDARY_NEUTRAL:
                        // BN characters are ignored entirely.
                        break;
                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT:
                        if (embeddingLevel == 0) {
                            return DIR_LTR;
                        }
                        firstNonEmptyEmbeddingLevel = embeddingLevel;
                        break;
                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT:
                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC:
                        if (embeddingLevel == 0) {
                            return DIR_RTL;
                        }
                        firstNonEmptyEmbeddingLevel = embeddingLevel;
                        break;
                    default:
                        // Any other character type: at level 0 this assigns 0 and scanning goes
                        // on; inside an embedding it marks that embedding as non-empty.
                        firstNonEmptyEmbeddingLevel = embeddingLevel;
                        break;
                }
            }

            // We have either found a non-empty embedding or scanned the entire string finding
            // neither a non-empty embedding nor a strong character outside of an embedding.
            if (firstNonEmptyEmbeddingLevel == 0) {
                // We have not found a non-empty embedding. Thus, the string contains neither a
                // non-empty embedding nor a strong character outside of an embedding.
                return DIR_UNKNOWN;
            }

            // We have found a non-empty embedding.
            if (embeddingLevelDir != DIR_UNKNOWN) {
                // We know the directionality of the non-empty embedding.
                return embeddingLevelDir;
            }

            // We do not remember the directionality of the non-empty embedding we found. So, we go
            // backwards to find the start of the non-empty embedding and get its directionality.
            // NOTE(review): dirTypeBackward() is not visible here; presumably it moves charIndex
            // towards 0 - confirm.
            while (charIndex > 0) {
                switch (dirTypeBackward()) {
                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING:
                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE:
                        if (firstNonEmptyEmbeddingLevel == embeddingLevel) {
                            return DIR_LTR;
                        }
                        --embeddingLevel;
                        break;
                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING:
                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE:
                        if (firstNonEmptyEmbeddingLevel == embeddingLevel) {
                            return DIR_RTL;
                        }
                        --embeddingLevel;
                        break;
                    case Character.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT:
                        // Going backwards, a PDF means an embedding is being re-entered.
                        ++embeddingLevel;
                        break;
                }
            }
            // We should never get here.
            return DIR_UNKNOWN;
        }
670
671        /**
672         * Returns the directionality of the last character with strong directionality in the
673         * string, or DIR_UNKNOWN if none was encountered. For efficiency, actually scans backwards
674         * from the end of the string. Treats a non-BN character between an LRE/RLE/LRO/RLO and its
675         * matching PDF as a strong character, LTR after LRE/LRO, and RTL after RLE/RLO. The results
676         * are undefined for a string containing unbalanced LRE/RLE/LRO/RLO/PDF characters.
677         */
678        int getExitDir() {
679            // The reason for this method name, as opposed to getLastStrongDir(), is that "last
680            // strong" sounds like the exact opposite of "first strong", which is a commonly used
681            // description of Unicode's estimation algorithm (getUnicodeDir() above), but the two
682            // must treat formatting characters quite differently. Thus, we are staying away from
683            // both "first" and "last" in these method names to avoid confusion.
684            charIndex = length;
685            int embeddingLevel = 0;
686            int lastNonEmptyEmbeddingLevel = 0;
687            while (charIndex > 0) {
688                switch (dirTypeBackward()) {
689                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT:
690                        if (embeddingLevel == 0) {
691                            return DIR_LTR;
692                        }
693                        if (lastNonEmptyEmbeddingLevel == 0) {
694                            lastNonEmptyEmbeddingLevel = embeddingLevel;
695                        }
696                        break;
697                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING:
698                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE:
699                        if (lastNonEmptyEmbeddingLevel == embeddingLevel) {
700                            return DIR_LTR;
701                        }
702                        --embeddingLevel;
703                        break;
704                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT:
705                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC:
706                        if (embeddingLevel == 0) {
707                            return DIR_RTL;
708                        }
709                        if (lastNonEmptyEmbeddingLevel == 0) {
710                            lastNonEmptyEmbeddingLevel = embeddingLevel;
711                        }
712                        break;
713                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING:
714                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE:
715                        if (lastNonEmptyEmbeddingLevel == embeddingLevel) {
716                            return DIR_RTL;
717                        }
718                        --embeddingLevel;
719                        break;
720                    case Character.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT:
721                        ++embeddingLevel;
722                        break;
723                    case Character.DIRECTIONALITY_BOUNDARY_NEUTRAL:
724                        break;
725                    default:
726                        if (lastNonEmptyEmbeddingLevel == 0) {
727                            lastNonEmptyEmbeddingLevel = embeddingLevel;
728                        }
729                        break;
730                }
731            }
732            return DIR_UNKNOWN;
733        }
734
735        // Internal methods
736
737        /**
738         * Gets the bidi character class, i.e. Character.getDirectionality(), of a given char, using
739         * a cache for speed. Not designed for supplementary codepoints, whose results we do not
740         * cache.
741         */
742        private static byte getCachedDirectionality(char c) {
743            return c < DIR_TYPE_CACHE_SIZE ? DIR_TYPE_CACHE[c] : Character.getDirectionality(c);
744        }
745
746        /**
747         * Returns the Character.DIRECTIONALITY_... value of the next codepoint and advances
748         * charIndex. If isHtml, and the codepoint is '<' or '&', advances through the tag/entity,
749         * and returns Character.DIRECTIONALITY_WHITESPACE. For an entity, it would be best to
750         * figure out the actual character, and return its dirtype, but treating it as whitespace is
751         * good enough for our purposes.
752         *
753         * @throws java.lang.IndexOutOfBoundsException if called when charIndex >= length or < 0.
754         */
755        byte dirTypeForward() {
756            lastChar = text.charAt(charIndex);
757            if (Character.isHighSurrogate(lastChar)) {
758                int codePoint = Character.codePointAt(text, charIndex);
759                charIndex += Character.charCount(codePoint);
760                return Character.getDirectionality(codePoint);
761            }
762            charIndex++;
763            byte dirType = getCachedDirectionality(lastChar);
764            if (isHtml) {
765                // Process tags and entities.
766                if (lastChar == '<') {
767                    dirType = skipTagForward();
768                } else if (lastChar == '&') {
769                    dirType = skipEntityForward();
770                }
771            }
772            return dirType;
773        }
774
775        /**
776         * Returns the Character.DIRECTIONALITY_... value of the preceding codepoint and advances
777         * charIndex backwards. If isHtml, and the codepoint is the end of a complete HTML tag or
778         * entity, advances over the whole tag/entity and returns
779         * Character.DIRECTIONALITY_WHITESPACE. For an entity, it would be best to figure out the
780         * actual character, and return its dirtype, but treating it as whitespace is good enough
781         * for our purposes.
782         *
783         * @throws java.lang.IndexOutOfBoundsException if called when charIndex > length or <= 0.
784         */
785        byte dirTypeBackward() {
786            lastChar = text.charAt(charIndex - 1);
787            if (Character.isLowSurrogate(lastChar)) {
788                int codePoint = Character.codePointBefore(text, charIndex);
789                charIndex -= Character.charCount(codePoint);
790                return Character.getDirectionality(codePoint);
791            }
792            charIndex--;
793            byte dirType = getCachedDirectionality(lastChar);
794            if (isHtml) {
795                // Process tags and entities.
796                if (lastChar == '>') {
797                    dirType = skipTagBackward();
798                } else if (lastChar == ';') {
799                    dirType = skipEntityBackward();
800                }
801            }
802            return dirType;
803        }
804
805        /**
806         * Advances charIndex forward through an HTML tag (after the opening &lt; has already been
807         * read) and returns Character.DIRECTIONALITY_WHITESPACE. If there is no matching &gt;,
808         * does not change charIndex and returns Character.DIRECTIONALITY_OTHER_NEUTRALS (for the
809         * &lt; that hadn't been part of a tag after all).
810         */
811        private byte skipTagForward() {
812            int initialCharIndex = charIndex;
813            while (charIndex < length) {
814                lastChar = text.charAt(charIndex++);
815                if (lastChar == '>') {
816                    // The end of the tag.
817                    return Character.DIRECTIONALITY_WHITESPACE;
818                }
819                if (lastChar == '"' || lastChar == '\'') {
820                    // Skip over a quoted attribute value inside the tag.
821                    char quote = lastChar;
822                    while (charIndex < length && (lastChar = text.charAt(charIndex++)) != quote) {}
823                }
824            }
825            // The original '<' wasn't the start of a tag after all.
826            charIndex = initialCharIndex;
827            lastChar = '<';
828            return Character.DIRECTIONALITY_OTHER_NEUTRALS;
829        }
830
831        /**
832         * Advances charIndex backward through an HTML tag (after the closing &gt; has already been
833         * read) and returns Character.DIRECTIONALITY_WHITESPACE. If there is no matching &lt;, does
834         * not change charIndex and returns Character.DIRECTIONALITY_OTHER_NEUTRALS (for the &gt;
835         * that hadn't been part of a tag after all). Nevertheless, the running time for calling
836         * skipTagBackward() in a loop remains linear in the size of the text, even for a text like
837         * "&gt;&gt;&gt;&gt;", because skipTagBackward() also stops looking for a matching &lt;
838         * when it encounters another &gt;.
839         */
840        private byte skipTagBackward() {
841            int initialCharIndex = charIndex;
842            while (charIndex > 0) {
843                lastChar = text.charAt(--charIndex);
844                if (lastChar == '<') {
845                    // The start of the tag.
846                    return Character.DIRECTIONALITY_WHITESPACE;
847                }
848                if (lastChar == '>') {
849                    break;
850                }
851                if (lastChar == '"' || lastChar == '\'') {
852                    // Skip over a quoted attribute value inside the tag.
853                    char quote = lastChar;
854                    while (charIndex > 0 && (lastChar = text.charAt(--charIndex)) != quote) {}
855                }
856            }
857            // The original '>' wasn't the end of a tag after all.
858            charIndex = initialCharIndex;
859            lastChar = '>';
860            return Character.DIRECTIONALITY_OTHER_NEUTRALS;
861        }
862
863        /**
864         * Advances charIndex forward through an HTML character entity tag (after the opening
865         * &amp; has already been read) and returns Character.DIRECTIONALITY_WHITESPACE. It would be
866         * best to figure out the actual character and return its dirtype, but this is good enough.
867         */
868        private byte skipEntityForward() {
869            while (charIndex < length && (lastChar = text.charAt(charIndex++)) != ';') {}
870            return Character.DIRECTIONALITY_WHITESPACE;
871        }
872
873        /**
874         * Advances charIndex backward through an HTML character entity tag (after the closing ;
875         * has already been read) and returns Character.DIRECTIONALITY_WHITESPACE. It would be best
876         * to figure out the actual character and return its dirtype, but this is good enough.
877         * If there is no matching &amp;, does not change charIndex and returns
878         * Character.DIRECTIONALITY_OTHER_NEUTRALS (for the ';' that did not start an entity after
879         * all). Nevertheless, the running time for calling skipEntityBackward() in a loop remains
880         * linear in the size of the text, even for a text like ";;;;;;;", because skipTagBackward()
881         * also stops looking for a matching &amp; when it encounters another ;.
882         */
883        private byte skipEntityBackward() {
884            int initialCharIndex = charIndex;
885            while (charIndex > 0) {
886                lastChar = text.charAt(--charIndex);
887                if (lastChar == '&') {
888                    return Character.DIRECTIONALITY_WHITESPACE;
889                }
890                if (lastChar == ';') {
891                    break;
892                }
893            }
894            charIndex = initialCharIndex;
895            lastChar = ';';
896            return Character.DIRECTIONALITY_OTHER_NEUTRALS;
897        }
898    }
899}