1/*
2 * Copyright (C) 2013 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package android.support.v4.text;
18
19import android.support.v4.view.ViewCompat;
20
21import java.util.Locale;
22
23import static android.support.v4.text.TextDirectionHeuristicsCompat.FIRSTSTRONG_LTR;
24
25/**
26 * Utility class for formatting text for display in a potentially opposite-directionality context
27 * without garbling. The directionality of the context is set at formatter creation and the
28 * directionality of the text can be either estimated or passed in when known. Provides the
29 * following functionality:
30 * <p>
31 * 1. Bidi Wrapping
32 * When text in one language is mixed into a document in another, opposite-directionality language,
33 * e.g. when an English business name is embedded in a Hebrew web page, both the inserted string
34 * and the text surrounding it may be displayed incorrectly unless the inserted string is explicitly
35 * separated from the surrounding text in a "wrapper" that:
36 * <p>
37 * - Declares its directionality so that the string is displayed correctly. This can be done in
38 *   Unicode bidi formatting codes by {@link #unicodeWrap} and similar methods.
39 * <p>
40 * - Isolates the string's directionality, so it does not unduly affect the surrounding content.
41 *   Currently, this can only be done using invisible Unicode characters of the same direction as
42 *   the context (LRM or RLM) in addition to the directionality declaration above, thus "resetting"
43 *   the directionality to that of the context. The "reset" may need to be done at both ends of the
44 *   string. Without "reset" after the string, the string will "stick" to a number or logically
45 *   separate opposite-direction text that happens to follow it in-line (even if separated by
46 *   neutral content like spaces and punctuation). Without "reset" before the string, the same can
47 *   happen there, but only with more opposite-direction text, not a number. One approach is to
48 *   "reset" the direction only after each string, on the theory that if the preceding opposite-
49 *   direction text is itself bidi-wrapped, the "reset" after it will prevent the sticking. (Doing
50 *   the "reset" only before each string definitely does not work because we do not want to require
51 *   bidi-wrapping numbers, and a bidi-wrapped opposite-direction string could be followed by a
52 *   number.) Still, the safest policy is to do the "reset" on both ends of each string, since RTL
53 *   message translations often contain untranslated Latin-script brand names and technical terms,
54 *   and one of these can be followed by a bidi-wrapped inserted value. On the other hand, when one
55 *   has such a message, it is best to do the "reset" manually in the message translation itself,
56 *   since the message's opposite-direction text could be followed by an inserted number, which we
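 *   would not bidi-wrap anyway. Thus, "reset" after the string is what isolation always applies;
 *   by default this class also does the "reset" before the string, which can be turned off via
 *   {@link Builder#stereoReset(boolean)}. In an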
58 *   alternative to "reset", recent additions to the HTML, CSS, and Unicode standards allow the
59 *   isolation to be part of the directionality declaration. This form of isolation is better than
60 *   "reset" because it takes less space, does not require knowing the context directionality, has a
61 *   gentler effect than "reset", and protects both ends of the string. However, we do not yet allow
62 *   using it because required platforms do not yet support it.
63 * <p>
64 * Providing these wrapping services is the basic purpose of the bidi formatter.
65 * <p>
66 * 2. Directionality estimation
67 * How does one know whether a string about to be inserted into surrounding text has the same
68 * directionality? Well, in many cases, one knows that this must be the case when writing the code
69 * doing the insertion, e.g. when a localized message is inserted into a localized page. In such
70 * cases there is no need to involve the bidi formatter at all. In some other cases, it need not be
71 * the same as the context, but is either constant (e.g. urls are always LTR) or otherwise known.
72 * In the remaining cases, e.g. when the string is user-entered or comes from a database, the
73 * language of the string (and thus its directionality) is not known a priori, and must be
74 * estimated at run-time. The bidi formatter can do this automatically using the default
75 * first-strong estimation algorithm. It can also be configured to use a custom directionality
76 * estimation object.
77 */
78public final class BidiFormatter {
79
80    /**
81     * The default text direction heuristic.
82     */
83    private static TextDirectionHeuristicCompat DEFAULT_TEXT_DIRECTION_HEURISTIC = FIRSTSTRONG_LTR;
84
85    /**
86     * Unicode "Left-To-Right Embedding" (LRE) character.
87     */
88    private static final char LRE = '\u202A';
89
90    /**
91     * Unicode "Right-To-Left Embedding" (RLE) character.
92     */
93    private static final char RLE = '\u202B';
94
95    /**
96     * Unicode "Pop Directional Formatting" (PDF) character.
97     */
98    private static final char PDF = '\u202C';
99
100    /**
101     *  Unicode "Left-To-Right Mark" (LRM) character.
102     */
103    private static final char LRM = '\u200E';
104
105    /*
106     * Unicode "Right-To-Left Mark" (RLM) character.
107     */
108    private static final char RLM = '\u200F';
109
110    /*
111     * String representation of LRM
112     */
113    private static final String LRM_STRING = Character.toString(LRM);
114
115    /*
116     * String representation of RLM
117     */
118    private static final String RLM_STRING = Character.toString(RLM);
119
120    /**
121     * Empty string constant.
122     */
123    private static final String EMPTY_STRING = "";
124
125    /**
126     * A class for building a BidiFormatter with non-default options.
127     */
128    public static final class Builder {
129        private boolean mIsRtlContext;
130        private int mFlags;
131        private TextDirectionHeuristicCompat mTextDirectionHeuristicCompat;
132
133        /**
134         * Constructor.
135         *
136         */
137        public Builder() {
138            initialize(isRtlLocale(Locale.getDefault()));
139        }
140
141        /**
142         * Constructor.
143         *
144         * @param rtlContext Whether the context directionality is RTL.
145         */
146        public Builder(boolean rtlContext) {
147            initialize(rtlContext);
148        }
149
150        /**
151         * Constructor.
152         *
153         * @param locale The context locale.
154         */
155        public Builder(Locale locale) {
156            initialize(isRtlLocale(locale));
157        }
158
159        /**
160         * Initializes the builder with the given context directionality and default options.
161         *
162         * @param isRtlContext Whether the context is RTL or not.
163         */
164        private void initialize(boolean isRtlContext) {
165            mIsRtlContext = isRtlContext;
166            mTextDirectionHeuristicCompat = DEFAULT_TEXT_DIRECTION_HEURISTIC;
167            mFlags = DEFAULT_FLAGS;
168        }
169
170        /**
171         * Specifies whether the BidiFormatter to be built should also "reset" directionality before
172         * a string being bidi-wrapped, not just after it. The default is false.
173         */
174        public Builder stereoReset(boolean stereoReset) {
175            if (stereoReset) {
176                mFlags |= FLAG_STEREO_RESET;
177            } else {
178                mFlags &= ~FLAG_STEREO_RESET;
179            }
180            return this;
181        }
182
183        /**
184         * Specifies the default directionality estimation algorithm to be used by the BidiFormatter.
185         * By default, uses the first-strong heuristic.
186         *
187         * @param heuristic the {@code TextDirectionHeuristic} to use.
188         * @return the builder itself.
189         */
190        public Builder setTextDirectionHeuristic(TextDirectionHeuristicCompat heuristic) {
191            mTextDirectionHeuristicCompat = heuristic;
192            return this;
193        }
194
195        private static BidiFormatter getDefaultInstanceFromContext(boolean isRtlContext) {
196            return isRtlContext ? DEFAULT_RTL_INSTANCE : DEFAULT_LTR_INSTANCE;
197        }
198
199        /**
200         * @return A BidiFormatter with the specified options.
201         */
202        public BidiFormatter build() {
203            if (mFlags == DEFAULT_FLAGS &&
204                    mTextDirectionHeuristicCompat == DEFAULT_TEXT_DIRECTION_HEURISTIC) {
205                return getDefaultInstanceFromContext(mIsRtlContext);
206            }
207            return new BidiFormatter(mIsRtlContext, mFlags, mTextDirectionHeuristicCompat);
208        }
209    }
210
    // Option flags. FLAG_STEREO_RESET asks for the "reset" mark before a wrapped string as well as
    // after it (see getStereoReset()); DEFAULT_FLAGS turns that option on for formatters built
    // with default options.
    private static final int FLAG_STEREO_RESET = 2;
    private static final int DEFAULT_FLAGS = FLAG_STEREO_RESET;

    // Shared formatter handed out by Builder.build() for an LTR context with default options.
    private static final BidiFormatter DEFAULT_LTR_INSTANCE = new BidiFormatter(
            false /* LTR context */,
            DEFAULT_FLAGS,
            DEFAULT_TEXT_DIRECTION_HEURISTIC);

    // Shared formatter handed out by Builder.build() for an RTL context with default options.
    private static final BidiFormatter DEFAULT_RTL_INSTANCE = new BidiFormatter(
            true /* RTL context */,
            DEFAULT_FLAGS,
            DEFAULT_TEXT_DIRECTION_HEURISTIC);

    // Immutable per-formatter state, assigned once in the private constructor.
    private final boolean mIsRtlContext;
    private final int mFlags;
    private final TextDirectionHeuristicCompat mDefaultTextDirectionHeuristicCompat;
228
229    /**
230     * Factory for creating an instance of BidiFormatter for the default locale directionality.
231     *
232     */
233    public static BidiFormatter getInstance() {
234        return new Builder().build();
235    }
236
237    /**
238     * Factory for creating an instance of BidiFormatter given the context directionality.
239     *
240     * @param rtlContext Whether the context directionality is RTL.
241     */
242    public static BidiFormatter getInstance(boolean rtlContext) {
243        return new Builder(rtlContext).build();
244    }
245
246    /**
247     * Factory for creating an instance of BidiFormatter given the context locale.
248     *
249     * @param locale The context locale.
250     */
251    public static BidiFormatter getInstance(Locale locale) {
252        return new Builder(locale).build();
253    }
254
255    /**
256     * @param isRtlContext Whether the context directionality is RTL or not.
257     * @param flags The option flags.
258     * @param heuristic The default text direction heuristic.
259     */
260    private BidiFormatter(boolean isRtlContext, int flags, TextDirectionHeuristicCompat heuristic) {
261        mIsRtlContext = isRtlContext;
262        mFlags = flags;
263        mDefaultTextDirectionHeuristicCompat = heuristic;
264    }
265
266    /**
267     * @return Whether the context directionality is RTL
268     */
269    public boolean isRtlContext() {
270        return mIsRtlContext;
271    }
272
273    /**
274     * @return Whether directionality "reset" should also be done before a string being
275     * bidi-wrapped, not just after it.
276     */
277    public boolean getStereoReset() {
278        return (mFlags & FLAG_STEREO_RESET) != 0;
279    }
280
281    /**
282     * Returns a Unicode bidi mark matching the context directionality (LRM or RLM) if either the
283     * overall or the exit directionality of a given string is opposite to the context directionality.
284     * Putting this after the string (including its directionality declaration wrapping) prevents it
285     * from "sticking" to other opposite-directionality text or a number appearing after it inline
286     * with only neutral content in between. Otherwise returns the empty string. While the exit
287     * directionality is determined by scanning the end of the string, the overall directionality is
288     * given explicitly by a heuristic to estimate the {@code str}'s directionality.
289     *
290     * @param str String after which the mark may need to appear.
291     * @param heuristic The text direction heuristic that will be used to estimate the {@code str}'s
292     *                  directionality.
293     * @return LRM for RTL text in LTR context; RLM for LTR text in RTL context;
294     *     else, the empty string.
295     */
296    private String markAfter(String str, TextDirectionHeuristicCompat heuristic) {
297        final boolean isRtl = heuristic.isRtl(str, 0, str.length());
298        // getExitDir() is called only if needed (short-circuit).
299        if (!mIsRtlContext && (isRtl || getExitDir(str) == DIR_RTL)) {
300            return LRM_STRING;
301        }
302        if (mIsRtlContext && (!isRtl || getExitDir(str) == DIR_LTR)) {
303            return RLM_STRING;
304        }
305        return EMPTY_STRING;
306    }
307
308    /**
309     * Returns a Unicode bidi mark matching the context directionality (LRM or RLM) if either the
310     * overall or the entry directionality of a given string is opposite to the context
311     * directionality. Putting this before the string (including its directionality declaration
312     * wrapping) prevents it from "sticking" to other opposite-directionality text appearing before
313     * it inline with only neutral content in between. Otherwise returns the empty string. While the
314     * entry directionality is determined by scanning the beginning of the string, the overall
315     * directionality is given explicitly by a heuristic to estimate the {@code str}'s directionality.
316     *
317     * @param str String before which the mark may need to appear.
318     * @param heuristic The text direction heuristic that will be used to estimate the {@code str}'s
319     *                  directionality.
320     * @return LRM for RTL text in LTR context; RLM for LTR text in RTL context;
321     *     else, the empty string.
322     */
323    private String markBefore(String str, TextDirectionHeuristicCompat heuristic) {
324        final boolean isRtl = heuristic.isRtl(str, 0, str.length());
325        // getEntryDir() is called only if needed (short-circuit).
326        if (!mIsRtlContext && (isRtl || getEntryDir(str) == DIR_RTL)) {
327            return LRM_STRING;
328        }
329        if (mIsRtlContext && (!isRtl || getEntryDir(str) == DIR_LTR)) {
330            return RLM_STRING;
331        }
332        return EMPTY_STRING;
333    }
334
335    /**
336     * Estimates the directionality of a string using the default text direction heuristic.
337     *
338     * @param str String whose directionality is to be estimated.
339     * @return true if {@code str}'s estimated overall directionality is RTL. Otherwise returns
340     *          false.
341     */
342    public boolean isRtl(String str) {
343        return mDefaultTextDirectionHeuristicCompat.isRtl(str, 0, str.length());
344    }
345
346    /**
347     * Formats a string of given directionality for use in plain-text output of the context
348     * directionality, so an opposite-directionality string is neither garbled nor garbles its
349     * surroundings. This makes use of Unicode bidi formatting characters.
350     * <p>
351     * The algorithm: In case the given directionality doesn't match the context directionality, wraps
352     * the string with Unicode bidi formatting characters: RLE+{@code str}+PDF for RTL text, or
353     * LRE+{@code str}+PDF for LTR text.
354     * <p>
355     * If {@code isolate}, directionally isolates the string so that it does not garble its
356     * surroundings. Currently, this is done by "resetting" the directionality after the string by
357     * appending a trailing Unicode bidi mark matching the context directionality (LRM or RLM) when
358     * either the overall directionality or the exit directionality of the string is opposite to that
359     * of the context. If the formatter was built using {@link Builder#stereoReset(boolean)} and
360     * passing "true" as an argument, also prepends a Unicode bidi mark matching the context
361     * directionality when either the overall directionality or the entry directionality of the
362     * string is opposite to that of the context. Note that as opposed to the overall
363     * directionality, the entry and exit directionalities are determined from the string itself.
364     * <p>
365     * Does *not* do HTML-escaping.
366     *
367     * @param str The input string.
368     * @param heuristic The algorithm to be used to estimate the string's overall direction.
369     * @param isolate Whether to directionally isolate the string to prevent it from garbling the
370     *     content around it
371     * @return Input string after applying the above processing.
372     */
373    public String unicodeWrap(String str, TextDirectionHeuristicCompat heuristic, boolean isolate) {
374        final boolean isRtl = heuristic.isRtl(str, 0, str.length());
375        StringBuilder result = new StringBuilder();
376        if (getStereoReset() && isolate) {
377            result.append(markBefore(str,
378                    isRtl ? TextDirectionHeuristicsCompat.RTL : TextDirectionHeuristicsCompat.LTR));
379        }
380        if (isRtl != mIsRtlContext) {
381            result.append(isRtl ? RLE : LRE);
382            result.append(str);
383            result.append(PDF);
384        } else {
385            result.append(str);
386        }
387        if (isolate) {
388            result.append(markAfter(str,
389                    isRtl ? TextDirectionHeuristicsCompat.RTL : TextDirectionHeuristicsCompat.LTR));
390        }
391        return result.toString();
392    }
393
394    /**
395     * Operates like {@link #unicodeWrap(String, android.support.v4.text.TextDirectionHeuristicCompat, boolean)}, but assumes
396     * {@code isolate} is true.
397     *
398     * @param str The input string.
399     * @param heuristic The algorithm to be used to estimate the string's overall direction.
400     * @return Input string after applying the above processing.
401     */
402    public String unicodeWrap(String str, TextDirectionHeuristicCompat heuristic) {
403        return unicodeWrap(str, heuristic, true /* isolate */);
404    }
405
406    /**
407     * Operates like {@link #unicodeWrap(String, android.support.v4.text.TextDirectionHeuristicCompat, boolean)}, but uses the
408     * formatter's default direction estimation algorithm.
409     *
410     * @param str The input string.
411     * @param isolate Whether to directionally isolate the string to prevent it from garbling the
412     *     content around it
413     * @return Input string after applying the above processing.
414     */
415    public String unicodeWrap(String str, boolean isolate) {
416        return unicodeWrap(str, mDefaultTextDirectionHeuristicCompat, isolate);
417    }
418
419    /**
420     * Operates like {@link #unicodeWrap(String, android.support.v4.text.TextDirectionHeuristicCompat, boolean)}, but uses the
421     * formatter's default direction estimation algorithm and assumes {@code isolate} is true.
422     *
423     * @param str The input string.
424     * @return Input string after applying the above processing.
425     */
426    public String unicodeWrap(String str) {
427        return unicodeWrap(str, mDefaultTextDirectionHeuristicCompat, true /* isolate */);
428    }
429
430    /**
431     * Helper method to return true if the Locale directionality is RTL.
432     *
433     * @param locale The Locale whose directionality will be checked to be RTL or LTR
434     * @return true if the {@code locale} directionality is RTL. False otherwise.
435     */
436    private static boolean isRtlLocale(Locale locale) {
437        return (TextUtilsCompat.getLayoutDirectionFromLocale(locale) == ViewCompat.LAYOUT_DIRECTION_RTL);
438    }
439
    /**
     * Directionality values returned by the entry/exit directionality scans. Plain int constants
     * stand in for an enum type.
     */
    // Strong left-to-right directionality.
    private static final int DIR_LTR = -1;
    // No strong directionality was encountered.
    private static final int DIR_UNKNOWN = 0;
    // Strong right-to-left directionality.
    private static final int DIR_RTL = +1;
446
447    /**
448     * Returns the directionality of the last character with strong directionality in the string, or
449     * DIR_UNKNOWN if none was encountered. For efficiency, actually scans backwards from the end of
450     * the string. Treats a non-BN character between an LRE/RLE/LRO/RLO and its matching PDF as a
451     * strong character, LTR after LRE/LRO, and RTL after RLE/RLO. The results are undefined for a
452     * string containing unbalanced LRE/RLE/LRO/RLO/PDF characters. The intended use is to check
453     * whether a logically separate item that starts with a number or a character of the string's
454     * exit directionality and follows this string inline (not counting any neutral characters in
455     * between) would "stick" to it in an opposite-directionality context, thus being displayed in
456     * an incorrect position. An LRM or RLM character (the one of the context's directionality)
457     * between the two will prevent such sticking.
458     *
459     * @param str the string to check.
460     */
461    private static int getExitDir(String str) {
462        return new DirectionalityEstimator(str, false /* isHtml */).getExitDir();
463    }
464
465    /**
466     * Returns the directionality of the first character with strong directionality in the string,
467     * or DIR_UNKNOWN if none was encountered. Treats a non-BN character between an
468     * LRE/RLE/LRO/RLO and its matching PDF as a strong character, LTR after LRE/LRO, and RTL after
469     * RLE/RLO. The results are undefined for a string containing unbalanced LRE/RLE/LRO/RLO/PDF
470     * characters. The intended use is to check whether a logically separate item that ends with a
471     * character of the string's entry directionality and precedes the string inline (not counting
472     * any neutral characters in between) would "stick" to it in an opposite-directionality context,
473     * thus being displayed in an incorrect position. An LRM or RLM character (the one of the
474     * context's directionality) between the two will prevent such sticking.
475     *
476     * @param str the string to check.
477     */
478    private static int getEntryDir(String str) {
479        return new DirectionalityEstimator(str, false /* isHtml */).getEntryDir();
480    }
481
482    /**
483     * An object that estimates the directionality of a given string by various methods.
484     *
485     */
486    private static class DirectionalityEstimator {
487
488        // Internal static variables and constants.
489
490        /**
491         * Size of the bidi character class cache. The results of the Character.getDirectionality()
492         * calls on the lowest DIR_TYPE_CACHE_SIZE codepoints are kept in an array for speed.
493         * The 0x700 value is designed to leave all the European and Near Eastern languages in the
494         * cache. It can be reduced to 0x180, restricting the cache to the Western European
495         * languages.
496         */
497        private static final int DIR_TYPE_CACHE_SIZE = 0x700;
498
499        /**
500         * The bidi character class cache.
501         */
502        private static final byte DIR_TYPE_CACHE[];
503
504        static {
505            DIR_TYPE_CACHE = new byte[DIR_TYPE_CACHE_SIZE];
506            for (int i = 0; i < DIR_TYPE_CACHE_SIZE; i++) {
507                DIR_TYPE_CACHE[i] = Character.getDirectionality(i);
508            }
509        }
510
511        // Internal instance variables.
512
513        /**
514         * The text to be scanned.
515         */
516        private final String text;
517
518        /**
519         * Whether the text to be scanned is to be treated as HTML, i.e. skipping over tags and
520         * entities when looking for the next / preceding dir type.
521         */
522        private final boolean isHtml;
523
524        /**
525         * The length of the text in chars.
526         */
527        private final int length;
528
529        /**
530         * The current position in the text.
531         */
532        private int charIndex;
533
534        /**
535         * The char encountered by the last dirTypeForward or dirTypeBackward call. If it
536         * encountered a supplementary codepoint, this contains a char that is not a valid
537         * codepoint. This is ok, because this member is only used to detect some well-known ASCII
538         * syntax, e.g. "http://" and the beginning of an HTML tag or entity.
539         */
540        private char lastChar;
541
542        /**
543         * Constructor.
544         *
545         * @param text The string to scan.
546         * @param isHtml Whether the text to be scanned is to be treated as HTML, i.e. skipping over
547         *     tags and entities.
548         */
549        DirectionalityEstimator(String text, boolean isHtml) {
550            this.text = text;
551            this.isHtml = isHtml;
552            length = text.length();
553        }
554
        /**
         * Returns the directionality of the first character with strong directionality in the
         * string, or DIR_UNKNOWN if none was encountered. Treats a non-BN character between an
         * LRE/RLE/LRO/RLO and its matching PDF as a strong character, LTR after LRE/LRO, and RTL
         * after RLE/RLO. The results are undefined for a string containing unbalanced
         * LRE/RLE/LRO/RLO/PDF characters.
         * <p>
         * Works in two phases: a forward scan from the start of the text that stops at the first
         * strong character outside any embedding or at the first non-empty embedding, and, only
         * when that embedding's direction was forgotten because of an intervening PDF, a backward
         * scan to the embedding's opening character.
         *
         * @return DIR_LTR, DIR_RTL or DIR_UNKNOWN.
         */
        int getEntryDir() {
            // The reason for this method name, as opposed to getFirstStrongDir(), is that
            // "first strong" is a commonly used description of Unicode's estimation algorithm,
            // but the two must treat formatting characters quite differently. Thus, we are staying
            // away from both "first" and "last" in these method names to avoid confusion.
            // NOTE(review): dirTypeForward()/dirTypeBackward() are defined outside this view;
            // presumably they move charIndex forward/backward by one character - confirm.
            charIndex = 0;
            // Current nesting depth of LRE/RLE/LRO/RLO embeddings.
            int embeddingLevel = 0;
            // Direction of the most recently opened embedding; DIR_UNKNOWN after any PDF.
            int embeddingLevelDir = DIR_UNKNOWN;
            // Depth at which the first embedded content character was seen; 0 while none was seen.
            int firstNonEmptyEmbeddingLevel = 0;
            while (charIndex < length && firstNonEmptyEmbeddingLevel == 0) {
                switch (dirTypeForward()) {
                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING:
                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE:
                        ++embeddingLevel;
                        embeddingLevelDir = DIR_LTR;
                        break;
                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING:
                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE:
                        ++embeddingLevel;
                        embeddingLevelDir = DIR_RTL;
                        break;
                    case Character.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT:
                        --embeddingLevel;
                        // To restore embeddingLevelDir to its previous value, we would need a
                        // stack, which we want to avoid. Thus, at this point we do not know the
                        // current embedding's directionality.
                        embeddingLevelDir = DIR_UNKNOWN;
                        break;
                    case Character.DIRECTIONALITY_BOUNDARY_NEUTRAL:
                        // BN characters never count as content, inside or outside an embedding.
                        break;
                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT:
                        if (embeddingLevel == 0) {
                            return DIR_LTR;
                        }
                        firstNonEmptyEmbeddingLevel = embeddingLevel;
                        break;
                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT:
                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC:
                        if (embeddingLevel == 0) {
                            return DIR_RTL;
                        }
                        firstNonEmptyEmbeddingLevel = embeddingLevel;
                        break;
                    default:
                        // Any other character type makes an enclosing embedding non-empty. Outside
                        // of an embedding embeddingLevel is 0, so this assignment leaves the
                        // marker at 0 and the scan continues.
                        firstNonEmptyEmbeddingLevel = embeddingLevel;
                        break;
                }
            }

            // We have either found a non-empty embedding or scanned the entire string finding
            // neither a non-empty embedding nor a strong character outside of an embedding.
            if (firstNonEmptyEmbeddingLevel == 0) {
                // We have not found a non-empty embedding. Thus, the string contains neither a
                // non-empty embedding nor a strong character outside of an embedding.
                return DIR_UNKNOWN;
            }

            // We have found a non-empty embedding.
            if (embeddingLevelDir != DIR_UNKNOWN) {
                // We know the directionality of the non-empty embedding.
                return embeddingLevelDir;
            }

            // We do not remember the directionality of the non-empty embedding we found. So, we go
            // backwards to find the start of the non-empty embedding and get its directionality.
            while (charIndex > 0) {
                // Character types other than the five handled below are skipped (no default case).
                switch (dirTypeBackward()) {
                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING:
                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE:
                        if (firstNonEmptyEmbeddingLevel == embeddingLevel) {
                            return DIR_LTR;
                        }
                        --embeddingLevel;
                        break;
                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING:
                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE:
                        if (firstNonEmptyEmbeddingLevel == embeddingLevel) {
                            return DIR_RTL;
                        }
                        --embeddingLevel;
                        break;
                    case Character.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT:
                        // Walking backwards, a PDF re-enters an embedding that was closed earlier.
                        ++embeddingLevel;
                        break;
                }
            }
            // We should never get here.
            return DIR_UNKNOWN;
        }
651
652        /**
653         * Returns the directionality of the last character with strong directionality in the
654         * string, or DIR_UNKNOWN if none was encountered. For efficiency, actually scans backwards
655         * from the end of the string. Treats a non-BN character between an LRE/RLE/LRO/RLO and its
656         * matching PDF as a strong character, LTR after LRE/LRO, and RTL after RLE/RLO. The results
657         * are undefined for a string containing unbalanced LRE/RLE/LRO/RLO/PDF characters.
658         */
659        int getExitDir() {
660            // The reason for this method name, as opposed to getLastStrongDir(), is that "last
661            // strong" sounds like the exact opposite of "first strong", which is a commonly used
662            // description of Unicode's estimation algorithm (getUnicodeDir() above), but the two
663            // must treat formatting characters quite differently. Thus, we are staying away from
664            // both "first" and "last" in these method names to avoid confusion.
665            charIndex = length;
666            int embeddingLevel = 0;
667            int lastNonEmptyEmbeddingLevel = 0;
668            while (charIndex > 0) {
669                switch (dirTypeBackward()) {
670                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT:
671                        if (embeddingLevel == 0) {
672                            return DIR_LTR;
673                        }
674                        if (lastNonEmptyEmbeddingLevel == 0) {
675                            lastNonEmptyEmbeddingLevel = embeddingLevel;
676                        }
677                        break;
678                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING:
679                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE:
680                        if (lastNonEmptyEmbeddingLevel == embeddingLevel) {
681                            return DIR_LTR;
682                        }
683                        --embeddingLevel;
684                        break;
685                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT:
686                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC:
687                        if (embeddingLevel == 0) {
688                            return DIR_RTL;
689                        }
690                        if (lastNonEmptyEmbeddingLevel == 0) {
691                            lastNonEmptyEmbeddingLevel = embeddingLevel;
692                        }
693                        break;
694                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING:
695                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE:
696                        if (lastNonEmptyEmbeddingLevel == embeddingLevel) {
697                            return DIR_RTL;
698                        }
699                        --embeddingLevel;
700                        break;
701                    case Character.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT:
702                        ++embeddingLevel;
703                        break;
704                    case Character.DIRECTIONALITY_BOUNDARY_NEUTRAL:
705                        break;
706                    default:
707                        if (lastNonEmptyEmbeddingLevel == 0) {
708                            lastNonEmptyEmbeddingLevel = embeddingLevel;
709                        }
710                        break;
711                }
712            }
713            return DIR_UNKNOWN;
714        }
715
716        // Internal methods
717
718        /**
719         * Gets the bidi character class, i.e. Character.getDirectionality(), of a given char, using
720         * a cache for speed. Not designed for supplementary codepoints, whose results we do not
721         * cache.
722         */
723        private static byte getCachedDirectionality(char c) {
724            return c < DIR_TYPE_CACHE_SIZE ? DIR_TYPE_CACHE[c] : Character.getDirectionality(c);
725        }
726
727        /**
728         * Returns the Character.DIRECTIONALITY_... value of the next codepoint and advances
729         * charIndex. If isHtml, and the codepoint is '<' or '&', advances through the tag/entity,
730         * and returns Character.DIRECTIONALITY_WHITESPACE. For an entity, it would be best to
731         * figure out the actual character, and return its dirtype, but treating it as whitespace is
732         * good enough for our purposes.
733         *
734         * @throws java.lang.IndexOutOfBoundsException if called when charIndex >= length or < 0.
735         */
736        byte dirTypeForward() {
737            lastChar = text.charAt(charIndex);
738            if (Character.isHighSurrogate(lastChar)) {
739                int codePoint = Character.codePointAt(text, charIndex);
740                charIndex += Character.charCount(codePoint);
741                return Character.getDirectionality(codePoint);
742            }
743            charIndex++;
744            byte dirType = getCachedDirectionality(lastChar);
745            if (isHtml) {
746                // Process tags and entities.
747                if (lastChar == '<') {
748                    dirType = skipTagForward();
749                } else if (lastChar == '&') {
750                    dirType = skipEntityForward();
751                }
752            }
753            return dirType;
754        }
755
756        /**
757         * Returns the Character.DIRECTIONALITY_... value of the preceding codepoint and advances
758         * charIndex backwards. If isHtml, and the codepoint is the end of a complete HTML tag or
759         * entity, advances over the whole tag/entity and returns
760         * Character.DIRECTIONALITY_WHITESPACE. For an entity, it would be best to figure out the
761         * actual character, and return its dirtype, but treating it as whitespace is good enough
762         * for our purposes.
763         *
764         * @throws java.lang.IndexOutOfBoundsException if called when charIndex > length or <= 0.
765         */
766        byte dirTypeBackward() {
767            lastChar = text.charAt(charIndex - 1);
768            if (Character.isLowSurrogate(lastChar)) {
769                int codePoint = Character.codePointBefore(text, charIndex);
770                charIndex -= Character.charCount(codePoint);
771                return Character.getDirectionality(codePoint);
772            }
773            charIndex--;
774            byte dirType = getCachedDirectionality(lastChar);
775            if (isHtml) {
776                // Process tags and entities.
777                if (lastChar == '>') {
778                    dirType = skipTagBackward();
779                } else if (lastChar == ';') {
780                    dirType = skipEntityBackward();
781                }
782            }
783            return dirType;
784        }
785
786        /**
787         * Advances charIndex forward through an HTML tag (after the opening &lt; has already been
788         * read) and returns Character.DIRECTIONALITY_WHITESPACE. If there is no matching &gt;,
789         * does not change charIndex and returns Character.DIRECTIONALITY_OTHER_NEUTRALS (for the
790         * &lt; that hadn't been part of a tag after all).
791         */
792        private byte skipTagForward() {
793            int initialCharIndex = charIndex;
794            while (charIndex < length) {
795                lastChar = text.charAt(charIndex++);
796                if (lastChar == '>') {
797                    // The end of the tag.
798                    return Character.DIRECTIONALITY_WHITESPACE;
799                }
800                if (lastChar == '"' || lastChar == '\'') {
801                    // Skip over a quoted attribute value inside the tag.
802                    char quote = lastChar;
803                    while (charIndex < length && (lastChar = text.charAt(charIndex++)) != quote) {}
804                }
805            }
806            // The original '<' wasn't the start of a tag after all.
807            charIndex = initialCharIndex;
808            lastChar = '<';
809            return Character.DIRECTIONALITY_OTHER_NEUTRALS;
810        }
811
812        /**
813         * Advances charIndex backward through an HTML tag (after the closing &gt; has already been
814         * read) and returns Character.DIRECTIONALITY_WHITESPACE. If there is no matching &lt;, does
815         * not change charIndex and returns Character.DIRECTIONALITY_OTHER_NEUTRALS (for the &gt;
816         * that hadn't been part of a tag after all). Nevertheless, the running time for calling
817         * skipTagBackward() in a loop remains linear in the size of the text, even for a text like
818         * "&gt;&gt;&gt;&gt;", because skipTagBackward() also stops looking for a matching &lt;
819         * when it encounters another &gt;.
820         */
821        private byte skipTagBackward() {
822            int initialCharIndex = charIndex;
823            while (charIndex > 0) {
824                lastChar = text.charAt(--charIndex);
825                if (lastChar == '<') {
826                    // The start of the tag.
827                    return Character.DIRECTIONALITY_WHITESPACE;
828                }
829                if (lastChar == '>') {
830                    break;
831                }
832                if (lastChar == '"' || lastChar == '\'') {
833                    // Skip over a quoted attribute value inside the tag.
834                    char quote = lastChar;
835                    while (charIndex > 0 && (lastChar = text.charAt(--charIndex)) != quote) {}
836                }
837            }
838            // The original '>' wasn't the end of a tag after all.
839            charIndex = initialCharIndex;
840            lastChar = '>';
841            return Character.DIRECTIONALITY_OTHER_NEUTRALS;
842        }
843
844        /**
845         * Advances charIndex forward through an HTML character entity tag (after the opening
846         * &amp; has already been read) and returns Character.DIRECTIONALITY_WHITESPACE. It would be
847         * best to figure out the actual character and return its dirtype, but this is good enough.
848         */
849        private byte skipEntityForward() {
850            while (charIndex < length && (lastChar = text.charAt(charIndex++)) != ';') {}
851            return Character.DIRECTIONALITY_WHITESPACE;
852        }
853
854        /**
855         * Advances charIndex backward through an HTML character entity tag (after the closing ;
856         * has already been read) and returns Character.DIRECTIONALITY_WHITESPACE. It would be best
857         * to figure out the actual character and return its dirtype, but this is good enough.
858         * If there is no matching &amp;, does not change charIndex and returns
859         * Character.DIRECTIONALITY_OTHER_NEUTRALS (for the ';' that did not start an entity after
860         * all). Nevertheless, the running time for calling skipEntityBackward() in a loop remains
861         * linear in the size of the text, even for a text like ";;;;;;;", because skipTagBackward()
862         * also stops looking for a matching &amp; when it encounters another ;.
863         */
864        private byte skipEntityBackward() {
865            int initialCharIndex = charIndex;
866            while (charIndex > 0) {
867                lastChar = text.charAt(--charIndex);
868                if (lastChar == '&') {
869                    return Character.DIRECTIONALITY_WHITESPACE;
870                }
871                if (lastChar == ';') {
872                    break;
873                }
874            }
875            charIndex = initialCharIndex;
876            lastChar = ';';
877            return Character.DIRECTIONALITY_OTHER_NEUTRALS;
878        }
879    }
880}