1/*
2 * Copyright (C) 2013 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package android.text;
18
19import static android.text.TextDirectionHeuristics.FIRSTSTRONG_LTR;
20
21import android.annotation.Nullable;
22import android.view.View;
23
24import java.util.Locale;
25
26/**
27 * Utility class for formatting text for display in a potentially opposite-directionality context
28 * without garbling. The directionality of the context is set at formatter creation and the
29 * directionality of the text can be either estimated or passed in when known.
30 *
31 * <p>To support versions lower than {@link android.os.Build.VERSION_CODES#JELLY_BEAN_MR2},
32 * you can use the support library's {@link android.support.v4.text.BidiFormatter} class.
33 *
 * <p>These APIs provide the following functionality:
35 * <p>
36 * 1. Bidi Wrapping
37 * When text in one language is mixed into a document in another, opposite-directionality language,
38 * e.g. when an English business name is embedded in some Hebrew text, both the inserted string
39 * and the text surrounding it may be displayed incorrectly unless the inserted string is explicitly
40 * separated from the surrounding text in a "wrapper" that:
41 * <p>
42 * - Declares its directionality so that the string is displayed correctly. This can be done in
43 *   Unicode bidi formatting codes by {@link #unicodeWrap} and similar methods.
44 * <p>
45 * - Isolates the string's directionality, so it does not unduly affect the surrounding content.
46 *   Currently, this can only be done using invisible Unicode characters of the same direction as
47 *   the context (LRM or RLM) in addition to the directionality declaration above, thus "resetting"
48 *   the directionality to that of the context. The "reset" may need to be done at both ends of the
49 *   string. Without "reset" after the string, the string will "stick" to a number or logically
50 *   separate opposite-direction text that happens to follow it in-line (even if separated by
51 *   neutral content like spaces and punctuation). Without "reset" before the string, the same can
52 *   happen there, but only with more opposite-direction text, not a number. One approach is to
53 *   "reset" the direction only after each string, on the theory that if the preceding opposite-
54 *   direction text is itself bidi-wrapped, the "reset" after it will prevent the sticking. (Doing
55 *   the "reset" only before each string definitely does not work because we do not want to require
56 *   bidi-wrapping numbers, and a bidi-wrapped opposite-direction string could be followed by a
57 *   number.) Still, the safest policy is to do the "reset" on both ends of each string, since RTL
58 *   message translations often contain untranslated Latin-script brand names and technical terms,
59 *   and one of these can be followed by a bidi-wrapped inserted value. On the other hand, when one
60 *   has such a message, it is best to do the "reset" manually in the message translation itself,
61 *   since the message's opposite-direction text could be followed by an inserted number, which we
 *   would not bidi-wrap anyway. Note, however, that this formatter's default is to "reset" on
 *   both ends of the string; see {@link Builder#stereoReset(boolean)}. In an
63 *   alternative to "reset", recent additions to the HTML, CSS, and Unicode standards allow the
64 *   isolation to be part of the directionality declaration. This form of isolation is better than
65 *   "reset" because it takes less space, does not require knowing the context directionality, has a
66 *   gentler effect than "reset", and protects both ends of the string. However, we do not yet allow
67 *   using it because required platforms do not yet support it.
68 * <p>
69 * Providing these wrapping services is the basic purpose of the bidi formatter.
70 * <p>
71 * 2. Directionality estimation
72 * How does one know whether a string about to be inserted into surrounding text has the same
73 * directionality? Well, in many cases, one knows that this must be the case when writing the code
74 * doing the insertion, e.g. when a localized message is inserted into a localized page. In such
75 * cases there is no need to involve the bidi formatter at all. In some other cases, it need not be
76 * the same as the context, but is either constant (e.g. urls are always LTR) or otherwise known.
77 * In the remaining cases, e.g. when the string is user-entered or comes from a database, the
78 * language of the string (and thus its directionality) is not known a priori, and must be
79 * estimated at run-time. The bidi formatter can do this automatically using the default
80 * first-strong estimation algorithm. It can also be configured to use a custom directionality
81 * estimation object.
82 */
83public final class BidiFormatter {
84
85    /**
86     * The default text direction heuristic.
87     */
88    private static TextDirectionHeuristic DEFAULT_TEXT_DIRECTION_HEURISTIC = FIRSTSTRONG_LTR;
89
    /**
     * Unicode "Left-To-Right Embedding" (LRE) character, U+202A. Prepended to LTR text that is
     * wrapped for an RTL context.
     */
    private static final char LRE = '\u202A';

    /**
     * Unicode "Right-To-Left Embedding" (RLE) character, U+202B. Prepended to RTL text that is
     * wrapped for an LTR context.
     */
    private static final char RLE = '\u202B';

    /**
     * Unicode "Pop Directional Formatting" (PDF) character, U+202C. Appended after text that was
     * opened with {@link #LRE} or {@link #RLE}.
     */
    private static final char PDF = '\u202C';

    /**
     * Unicode "Left-To-Right Mark" (LRM) character, U+200E.
     */
    private static final char LRM = '\u200E';

    /**
     * Unicode "Right-To-Left Mark" (RLM) character, U+200F.
     */
    private static final char RLM = '\u200F';

    /**
     * String representation of {@link #LRM}, returned by the mark methods in an LTR context.
     */
    private static final String LRM_STRING = Character.toString(LRM);

    /**
     * String representation of {@link #RLM}, returned by the mark methods in an RTL context.
     */
    private static final String RLM_STRING = Character.toString(RLM);

    /**
     * Empty string constant, returned by the mark methods when no mark is needed.
     */
    private static final String EMPTY_STRING = "";
129
130    /**
131     * A class for building a BidiFormatter with non-default options.
132     */
133    public static final class Builder {
134        private boolean mIsRtlContext;
135        private int mFlags;
136        private TextDirectionHeuristic mTextDirectionHeuristic;
137
138        /**
139         * Constructor.
140         *
141         */
142        public Builder() {
143            initialize(isRtlLocale(Locale.getDefault()));
144        }
145
146        /**
147         * Constructor.
148         *
149         * @param rtlContext Whether the context directionality is RTL.
150         */
151        public Builder(boolean rtlContext) {
152            initialize(rtlContext);
153        }
154
155        /**
156         * Constructor.
157         *
158         * @param locale The context locale.
159         */
160        public Builder(Locale locale) {
161            initialize(isRtlLocale(locale));
162        }
163
164        /**
165         * Initializes the builder with the given context directionality and default options.
166         *
167         * @param isRtlContext Whether the context is RTL or not.
168         */
169        private void initialize(boolean isRtlContext) {
170            mIsRtlContext = isRtlContext;
171            mTextDirectionHeuristic = DEFAULT_TEXT_DIRECTION_HEURISTIC;
172            mFlags = DEFAULT_FLAGS;
173        }
174
175        /**
176         * Specifies whether the BidiFormatter to be built should also "reset" directionality before
177         * a string being bidi-wrapped, not just after it. The default is true.
178         */
179        public Builder stereoReset(boolean stereoReset) {
180            if (stereoReset) {
181                mFlags |= FLAG_STEREO_RESET;
182            } else {
183                mFlags &= ~FLAG_STEREO_RESET;
184            }
185            return this;
186        }
187
188        /**
189         * Specifies the default directionality estimation algorithm to be used by the BidiFormatter.
190         * By default, uses the first-strong heuristic.
191         *
192         * @param heuristic the {@code TextDirectionHeuristic} to use.
193         * @return the builder itself.
194         */
195        public Builder setTextDirectionHeuristic(TextDirectionHeuristic heuristic) {
196            mTextDirectionHeuristic = heuristic;
197            return this;
198        }
199
200        /**
201         * @return A BidiFormatter with the specified options.
202         */
203        public BidiFormatter build() {
204            if (mFlags == DEFAULT_FLAGS &&
205                    mTextDirectionHeuristic == DEFAULT_TEXT_DIRECTION_HEURISTIC) {
206                return BidiFormatter.getDefaultInstanceFromContext(mIsRtlContext);
207            }
208            return new BidiFormatter(mIsRtlContext, mFlags, mTextDirectionHeuristic);
209        }
210    }
211
    // Option flags. When FLAG_STEREO_RESET is set, getStereoReset() reports true, i.e. the
    // directionality "reset" is requested before a wrapped string as well as after it. The flag
    // is part of DEFAULT_FLAGS, so it is on unless a Builder clears it.
    private static final int FLAG_STEREO_RESET = 2;
    private static final int DEFAULT_FLAGS = FLAG_STEREO_RESET;

    /** Shared formatter for an LTR context with the default flags and default heuristic. */
    private static final BidiFormatter DEFAULT_LTR_INSTANCE = new BidiFormatter(
            false /* LTR context */,
            DEFAULT_FLAGS,
            DEFAULT_TEXT_DIRECTION_HEURISTIC);

    /** Shared formatter for an RTL context with the default flags and default heuristic. */
    private static final BidiFormatter DEFAULT_RTL_INSTANCE = new BidiFormatter(
            true /* RTL context */,
            DEFAULT_FLAGS,
            DEFAULT_TEXT_DIRECTION_HEURISTIC);

    /** Whether the context this formatter writes into is RTL. */
    private final boolean mIsRtlContext;
    /** Bit set of option flags (see FLAG_STEREO_RESET). */
    private final int mFlags;
    /** Heuristic used by the methods that do not take an explicit heuristic argument. */
    private final TextDirectionHeuristic mDefaultTextDirectionHeuristic;
229
230    /**
231     * Factory for creating an instance of BidiFormatter for the default locale directionality.
232     *
233     * This does not create any new objects, and returns already existing static instances.
234     *
235     */
236    public static BidiFormatter getInstance() {
237        return getDefaultInstanceFromContext(isRtlLocale(Locale.getDefault()));
238    }
239
240    /**
241     * Factory for creating an instance of BidiFormatter given the context directionality.
242     *
243     * This does not create any new objects, and returns already existing static instances.
244     *
245     * @param rtlContext Whether the context directionality is RTL.
246     */
247    public static BidiFormatter getInstance(boolean rtlContext) {
248        return getDefaultInstanceFromContext(rtlContext);
249    }
250
251    /**
252     * Factory for creating an instance of BidiFormatter given the context locale.
253     *
254     * This does not create any new objects, and returns already existing static instances.
255     *
256     * @param locale The context locale.
257     */
258    public static BidiFormatter getInstance(Locale locale) {
259        return getDefaultInstanceFromContext(isRtlLocale(locale));
260    }
261
262    /**
263     * @param isRtlContext Whether the context directionality is RTL or not.
264     * @param flags The option flags.
265     * @param heuristic The default text direction heuristic.
266     */
267    private BidiFormatter(boolean isRtlContext, int flags, TextDirectionHeuristic heuristic) {
268        mIsRtlContext = isRtlContext;
269        mFlags = flags;
270        mDefaultTextDirectionHeuristic = heuristic;
271    }
272
273    /**
274     * @return Whether the context directionality is RTL
275     */
276    public boolean isRtlContext() {
277        return mIsRtlContext;
278    }
279
280    /**
281     * @return Whether directionality "reset" should also be done before a string being
282     * bidi-wrapped, not just after it.
283     */
284    public boolean getStereoReset() {
285        return (mFlags & FLAG_STEREO_RESET) != 0;
286    }
287
288    /**
289     * Returns a Unicode bidi mark matching the context directionality (LRM or RLM) if either the
290     * overall or the exit directionality of a given string is opposite to the context directionality.
291     * Putting this after the string (including its directionality declaration wrapping) prevents it
292     * from "sticking" to other opposite-directionality text or a number appearing after it inline
293     * with only neutral content in between. Otherwise returns the empty string. While the exit
294     * directionality is determined by scanning the end of the string, the overall directionality is
295     * given explicitly by a heuristic to estimate the {@code str}'s directionality.
296     *
297     * @param str CharSequence after which the mark may need to appear.
298     * @param heuristic The text direction heuristic that will be used to estimate the {@code str}'s
299     *                  directionality.
300     * @return LRM for RTL text in LTR context; RLM for LTR text in RTL context;
301     *     else, the empty string.
302     *
303     * @hide
304     */
305    public String markAfter(CharSequence str, TextDirectionHeuristic heuristic) {
306        final boolean isRtl = heuristic.isRtl(str, 0, str.length());
307        // getExitDir() is called only if needed (short-circuit).
308        if (!mIsRtlContext && (isRtl || getExitDir(str) == DIR_RTL)) {
309            return LRM_STRING;
310        }
311        if (mIsRtlContext && (!isRtl || getExitDir(str) == DIR_LTR)) {
312            return RLM_STRING;
313        }
314        return EMPTY_STRING;
315    }
316
317    /**
318     * Returns a Unicode bidi mark matching the context directionality (LRM or RLM) if either the
319     * overall or the entry directionality of a given string is opposite to the context
320     * directionality. Putting this before the string (including its directionality declaration
321     * wrapping) prevents it from "sticking" to other opposite-directionality text appearing before
322     * it inline with only neutral content in between. Otherwise returns the empty string. While the
323     * entry directionality is determined by scanning the beginning of the string, the overall
324     * directionality is given explicitly by a heuristic to estimate the {@code str}'s directionality.
325     *
326     * @param str CharSequence before which the mark may need to appear.
327     * @param heuristic The text direction heuristic that will be used to estimate the {@code str}'s
328     *                  directionality.
329     * @return LRM for RTL text in LTR context; RLM for LTR text in RTL context;
330     *     else, the empty string.
331     *
332     * @hide
333     */
334    public String markBefore(CharSequence str, TextDirectionHeuristic heuristic) {
335        final boolean isRtl = heuristic.isRtl(str, 0, str.length());
336        // getEntryDir() is called only if needed (short-circuit).
337        if (!mIsRtlContext && (isRtl || getEntryDir(str) == DIR_RTL)) {
338            return LRM_STRING;
339        }
340        if (mIsRtlContext && (!isRtl || getEntryDir(str) == DIR_LTR)) {
341            return RLM_STRING;
342        }
343        return EMPTY_STRING;
344    }
345
346    /**
347     * Estimates the directionality of a string using the default text direction heuristic.
348     *
349     * @param str String whose directionality is to be estimated.
350     * @return true if {@code str}'s estimated overall directionality is RTL. Otherwise returns
351     *          false.
352     */
353    public boolean isRtl(String str) {
354        return isRtl((CharSequence) str);
355    }
356
357    /**
358     * Operates like {@link #isRtl(String)}, but takes a CharSequence instead of a string
359     *
360     * @param str CharSequence whose directionality is to be estimated.
361     * @return true if {@code str}'s estimated overall directionality is RTL. Otherwise returns
362     *          false.
363     */
364    public boolean isRtl(CharSequence str) {
365        return mDefaultTextDirectionHeuristic.isRtl(str, 0, str.length());
366    }
367
368    /**
369     * Formats a string of given directionality for use in plain-text output of the context
370     * directionality, so an opposite-directionality string is neither garbled nor garbles its
371     * surroundings. This makes use of Unicode bidi formatting characters.
372     * <p>
373     * The algorithm: In case the given directionality doesn't match the context directionality, wraps
374     * the string with Unicode bidi formatting characters: RLE+{@code str}+PDF for RTL text, or
375     * LRE+{@code str}+PDF for LTR text.
376     * <p>
377     * If {@code isolate}, directionally isolates the string so that it does not garble its
378     * surroundings. Currently, this is done by "resetting" the directionality after the string by
379     * appending a trailing Unicode bidi mark matching the context directionality (LRM or RLM) when
380     * either the overall directionality or the exit directionality of the string is opposite to
381     * that of the context. Unless the formatter was built using
382     * {@link Builder#stereoReset(boolean)} with a {@code false} argument, also prepends a Unicode
383     * bidi mark matching the context directionality when either the overall directionality or the
384     * entry directionality of the string is opposite to that of the context. Note that as opposed
385     * to the overall directionality, the entry and exit directionalities are determined from the
386     * string itself.
387     * <p>
388     * Does *not* do HTML-escaping.
389     *
390     * @param str The input string.
391     * @param heuristic The algorithm to be used to estimate the string's overall direction.
392     *        See {@link TextDirectionHeuristics} for pre-defined heuristics.
393     * @param isolate Whether to directionally isolate the string to prevent it from garbling the
394     *     content around it
395     * @return Input string after applying the above processing. {@code null} if {@code str} is
396     *     {@code null}.
397     */
398    public @Nullable String unicodeWrap(@Nullable String str, TextDirectionHeuristic heuristic,
399            boolean isolate) {
400        if (str == null) return null;
401        return unicodeWrap((CharSequence) str, heuristic, isolate).toString();
402    }
403
404    /**
405     * Operates like {@link #unicodeWrap(String, TextDirectionHeuristic, boolean)}, but takes a
406     * CharSequence instead of a string
407     *
408     * @param str The input CharSequence.
409     * @param heuristic The algorithm to be used to estimate the CharSequence's overall direction.
410     *        See {@link TextDirectionHeuristics} for pre-defined heuristics.
411     * @param isolate Whether to directionally isolate the CharSequence to prevent it from garbling
412     *     the content around it
413     * @return Input CharSequence after applying the above processing. {@code null} if {@code str}
414     *     is {@code null}.
415     */
416    public @Nullable CharSequence unicodeWrap(@Nullable CharSequence str,
417            TextDirectionHeuristic heuristic, boolean isolate) {
418        if (str == null) return null;
419        final boolean isRtl = heuristic.isRtl(str, 0, str.length());
420        SpannableStringBuilder result = new SpannableStringBuilder();
421        if (getStereoReset() && isolate) {
422            result.append(markBefore(str,
423                    isRtl ? TextDirectionHeuristics.RTL : TextDirectionHeuristics.LTR));
424        }
425        if (isRtl != mIsRtlContext) {
426            result.append(isRtl ? RLE : LRE);
427            result.append(str);
428            result.append(PDF);
429        } else {
430            result.append(str);
431        }
432        if (isolate) {
433            result.append(markAfter(str,
434                    isRtl ? TextDirectionHeuristics.RTL : TextDirectionHeuristics.LTR));
435        }
436        return result;
437    }
438
439    /**
440     * Operates like {@link #unicodeWrap(String, TextDirectionHeuristic, boolean)}, but assumes
441     * {@code isolate} is true.
442     *
443     * @param str The input string.
444     * @param heuristic The algorithm to be used to estimate the string's overall direction.
445     *        See {@link TextDirectionHeuristics} for pre-defined heuristics.
446     * @return Input string after applying the above processing.
447     */
448    public String unicodeWrap(String str, TextDirectionHeuristic heuristic) {
449        return unicodeWrap(str, heuristic, true /* isolate */);
450    }
451
452    /**
453     * Operates like {@link #unicodeWrap(CharSequence, TextDirectionHeuristic, boolean)}, but
454     * assumes {@code isolate} is true.
455     *
456     * @param str The input CharSequence.
457     * @param heuristic The algorithm to be used to estimate the CharSequence's overall direction.
458     *        See {@link TextDirectionHeuristics} for pre-defined heuristics.
459     * @return Input CharSequence after applying the above processing.
460     */
461    public CharSequence unicodeWrap(CharSequence str, TextDirectionHeuristic heuristic) {
462        return unicodeWrap(str, heuristic, true /* isolate */);
463    }
464
465
466    /**
467     * Operates like {@link #unicodeWrap(String, TextDirectionHeuristic, boolean)}, but uses the
468     * formatter's default direction estimation algorithm.
469     *
470     * @param str The input string.
471     * @param isolate Whether to directionally isolate the string to prevent it from garbling the
472     *     content around it
473     * @return Input string after applying the above processing.
474     */
475    public String unicodeWrap(String str, boolean isolate) {
476        return unicodeWrap(str, mDefaultTextDirectionHeuristic, isolate);
477    }
478
479    /**
480     * Operates like {@link #unicodeWrap(CharSequence, TextDirectionHeuristic, boolean)}, but uses
481     * the formatter's default direction estimation algorithm.
482     *
483     * @param str The input CharSequence.
484     * @param isolate Whether to directionally isolate the CharSequence to prevent it from garbling
485     *     the content around it
486     * @return Input CharSequence after applying the above processing.
487     */
488    public CharSequence unicodeWrap(CharSequence str, boolean isolate) {
489        return unicodeWrap(str, mDefaultTextDirectionHeuristic, isolate);
490    }
491
492    /**
493     * Operates like {@link #unicodeWrap(String, TextDirectionHeuristic, boolean)}, but uses the
494     * formatter's default direction estimation algorithm and assumes {@code isolate} is true.
495     *
496     * @param str The input string.
497     * @return Input string after applying the above processing.
498     */
499    public String unicodeWrap(String str) {
500        return unicodeWrap(str, mDefaultTextDirectionHeuristic, true /* isolate */);
501    }
502
503    /**
504     * Operates like {@link #unicodeWrap(CharSequence, TextDirectionHeuristic, boolean)}, but uses
505     * the formatter's default direction estimation algorithm and assumes {@code isolate} is true.
506     *
507     * @param str The input CharSequence.
508     * @return Input CharSequence after applying the above processing.
509     */
510    public CharSequence unicodeWrap(CharSequence str) {
511        return unicodeWrap(str, mDefaultTextDirectionHeuristic, true /* isolate */);
512    }
513
514    private static BidiFormatter getDefaultInstanceFromContext(boolean isRtlContext) {
515        return isRtlContext ? DEFAULT_RTL_INSTANCE : DEFAULT_LTR_INSTANCE;
516    }
517
518    /**
519     * Helper method to return true if the Locale directionality is RTL.
520     *
521     * @param locale The Locale whose directionality will be checked to be RTL or LTR
522     * @return true if the {@code locale} directionality is RTL. False otherwise.
523     */
524    private static boolean isRtlLocale(Locale locale) {
525        return (TextUtils.getLayoutDirectionFromLocale(locale) == View.LAYOUT_DIRECTION_RTL);
526    }
527
    /**
     * Int-based "enum" for directionality type, used for the entry and exit directionality
     * results compared against in {@code markBefore} and {@code markAfter}.
     * {@code DIR_LTR}: left-to-right.
     */
    private static final int DIR_LTR = -1;
    /** No directionality could be determined. */
    private static final int DIR_UNKNOWN = 0;
    /** Right-to-left. */
    private static final int DIR_RTL = +1;
534
535    /**
536     * Returns the directionality of the last character with strong directionality in the string, or
537     * DIR_UNKNOWN if none was encountered. For efficiency, actually scans backwards from the end of
538     * the string. Treats a non-BN character between an LRE/RLE/LRO/RLO and its matching PDF as a
539     * strong character, LTR after LRE/LRO, and RTL after RLE/RLO. The results are undefined for a
540     * string containing unbalanced LRE/RLE/LRO/RLO/PDF characters. The intended use is to check
541     * whether a logically separate item that starts with a number or a character of the string's
542     * exit directionality and follows this string inline (not counting any neutral characters in
543     * between) would "stick" to it in an opposite-directionality context, thus being displayed in
544     * an incorrect position. An LRM or RLM character (the one of the context's directionality)
545     * between the two will prevent such sticking.
546     *
547     * @param str the string to check.
548     */
549    private static int getExitDir(CharSequence str) {
550        return new DirectionalityEstimator(str, false /* isHtml */).getExitDir();
551    }
552
553    /**
554     * Returns the directionality of the first character with strong directionality in the string,
555     * or DIR_UNKNOWN if none was encountered. Treats a non-BN character between an
556     * LRE/RLE/LRO/RLO and its matching PDF as a strong character, LTR after LRE/LRO, and RTL after
557     * RLE/RLO. The results are undefined for a string containing unbalanced LRE/RLE/LRO/RLO/PDF
558     * characters. The intended use is to check whether a logically separate item that ends with a
559     * character of the string's entry directionality and precedes the string inline (not counting
560     * any neutral characters in between) would "stick" to it in an opposite-directionality context,
561     * thus being displayed in an incorrect position. An LRM or RLM character (the one of the
562     * context's directionality) between the two will prevent such sticking.
563     *
564     * @param str the string to check.
565     */
566    private static int getEntryDir(CharSequence str) {
567        return new DirectionalityEstimator(str, false /* isHtml */).getEntryDir();
568    }
569
570    /**
571     * An object that estimates the directionality of a given string by various methods.
572     *
573     */
574    private static class DirectionalityEstimator {
575
576        // Internal static variables and constants.
577
578        /**
579         * Size of the bidi character class cache. The results of the Character.getDirectionality()
580         * calls on the lowest DIR_TYPE_CACHE_SIZE codepoints are kept in an array for speed.
581         * The 0x700 value is designed to leave all the European and Near Eastern languages in the
582         * cache. It can be reduced to 0x180, restricting the cache to the Western European
583         * languages.
584         */
585        private static final int DIR_TYPE_CACHE_SIZE = 0x700;
586
587        /**
588         * The bidi character class cache.
589         */
590        private static final byte DIR_TYPE_CACHE[];
591
592        static {
593            DIR_TYPE_CACHE = new byte[DIR_TYPE_CACHE_SIZE];
594            for (int i = 0; i < DIR_TYPE_CACHE_SIZE; i++) {
595                // Calling Character.getDirectionality() is OK here, since new emojis start after
596                // the end of our cache.
597                DIR_TYPE_CACHE[i] = Character.getDirectionality(i);
598            }
599        }
600
601        private static byte getDirectionality(int codePoint) {
602            if (Emoji.isNewEmoji(codePoint)) {
603                // TODO: Fix or remove once emoji-data.text 5.0 is in ICU or update to 6.0.
604                return Character.DIRECTIONALITY_OTHER_NEUTRALS;
605            } else {
606                return Character.getDirectionality(codePoint);
607            }
608        }
609
610        // Internal instance variables.
611
        /**
         * The text to be scanned. Set once in the constructor.
         */
        private final CharSequence text;

        /**
         * Whether the text to be scanned is to be treated as HTML, i.e. skipping over tags and
         * entities when looking for the next / preceding dir type. BidiFormatter's own
         * getEntryDir(CharSequence) and getExitDir(CharSequence) helpers always pass false.
         */
        private final boolean isHtml;

        /**
         * The length of the text in chars, captured from {@code text.length()} at construction.
         */
        private final int length;

        /**
         * The current position in the text. getEntryDir() resets it to 0 before scanning.
         */
        private int charIndex;

        /**
         * The char encountered by the last dirTypeForward or dirTypeBackward call. If it
         * encountered a supplementary codepoint, this contains a char that is not a valid
         * codepoint. This is ok, because this member is only used to detect some well-known ASCII
         * syntax, e.g. "http://" and the beginning of an HTML tag or entity.
         */
        private char lastChar;
640
641        /**
642         * Constructor.
643         *
644         * @param text The string to scan.
645         * @param isHtml Whether the text to be scanned is to be treated as HTML, i.e. skipping over
646         *     tags and entities.
647         */
648        DirectionalityEstimator(CharSequence text, boolean isHtml) {
649            this.text = text;
650            this.isHtml = isHtml;
651            length = text.length();
652        }
653
654        /**
655         * Returns the directionality of the first character with strong directionality in the
656         * string, or DIR_UNKNOWN if none was encountered. Treats a non-BN character between an
657         * LRE/RLE/LRO/RLO and its matching PDF as a strong character, LTR after LRE/LRO, and RTL
658         * after RLE/RLO. The results are undefined for a string containing unbalanced
659         * LRE/RLE/LRO/RLO/PDF characters.
660         */
661        int getEntryDir() {
662            // The reason for this method name, as opposed to getFirstStrongDir(), is that
663            // "first strong" is a commonly used description of Unicode's estimation algorithm,
664            // but the two must treat formatting characters quite differently. Thus, we are staying
665            // away from both "first" and "last" in these method names to avoid confusion.
666            charIndex = 0;
667            int embeddingLevel = 0;
668            int embeddingLevelDir = DIR_UNKNOWN;
669            int firstNonEmptyEmbeddingLevel = 0;
670            while (charIndex < length && firstNonEmptyEmbeddingLevel == 0) {
671                switch (dirTypeForward()) {
672                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING:
673                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE:
674                        ++embeddingLevel;
675                        embeddingLevelDir = DIR_LTR;
676                        break;
677                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING:
678                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE:
679                        ++embeddingLevel;
680                        embeddingLevelDir = DIR_RTL;
681                        break;
682                    case Character.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT:
683                        --embeddingLevel;
684                        // To restore embeddingLevelDir to its previous value, we would need a
685                        // stack, which we want to avoid. Thus, at this point we do not know the
686                        // current embedding's directionality.
687                        embeddingLevelDir = DIR_UNKNOWN;
688                        break;
689                    case Character.DIRECTIONALITY_BOUNDARY_NEUTRAL:
690                        break;
691                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT:
692                        if (embeddingLevel == 0) {
693                            return DIR_LTR;
694                        }
695                        firstNonEmptyEmbeddingLevel = embeddingLevel;
696                        break;
697                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT:
698                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC:
699                        if (embeddingLevel == 0) {
700                            return DIR_RTL;
701                        }
702                        firstNonEmptyEmbeddingLevel = embeddingLevel;
703                        break;
704                    default:
705                        firstNonEmptyEmbeddingLevel = embeddingLevel;
706                        break;
707                }
708            }
709
710            // We have either found a non-empty embedding or scanned the entire string finding
711            // neither a non-empty embedding nor a strong character outside of an embedding.
712            if (firstNonEmptyEmbeddingLevel == 0) {
713                // We have not found a non-empty embedding. Thus, the string contains neither a
714                // non-empty embedding nor a strong character outside of an embedding.
715                return DIR_UNKNOWN;
716            }
717
718            // We have found a non-empty embedding.
719            if (embeddingLevelDir != DIR_UNKNOWN) {
720                // We know the directionality of the non-empty embedding.
721                return embeddingLevelDir;
722            }
723
724            // We do not remember the directionality of the non-empty embedding we found. So, we go
725            // backwards to find the start of the non-empty embedding and get its directionality.
726            while (charIndex > 0) {
727                switch (dirTypeBackward()) {
728                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING:
729                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE:
730                        if (firstNonEmptyEmbeddingLevel == embeddingLevel) {
731                            return DIR_LTR;
732                        }
733                        --embeddingLevel;
734                        break;
735                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING:
736                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE:
737                        if (firstNonEmptyEmbeddingLevel == embeddingLevel) {
738                            return DIR_RTL;
739                        }
740                        --embeddingLevel;
741                        break;
742                    case Character.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT:
743                        ++embeddingLevel;
744                        break;
745                }
746            }
747            // We should never get here.
748            return DIR_UNKNOWN;
749        }
750
751        /**
752         * Returns the directionality of the last character with strong directionality in the
753         * string, or DIR_UNKNOWN if none was encountered. For efficiency, actually scans backwards
754         * from the end of the string. Treats a non-BN character between an LRE/RLE/LRO/RLO and its
755         * matching PDF as a strong character, LTR after LRE/LRO, and RTL after RLE/RLO. The results
756         * are undefined for a string containing unbalanced LRE/RLE/LRO/RLO/PDF characters.
757         */
758        int getExitDir() {
759            // The reason for this method name, as opposed to getLastStrongDir(), is that "last
760            // strong" sounds like the exact opposite of "first strong", which is a commonly used
761            // description of Unicode's estimation algorithm (getUnicodeDir() above), but the two
762            // must treat formatting characters quite differently. Thus, we are staying away from
763            // both "first" and "last" in these method names to avoid confusion.
764            charIndex = length;
765            int embeddingLevel = 0;
766            int lastNonEmptyEmbeddingLevel = 0;
767            while (charIndex > 0) {
768                switch (dirTypeBackward()) {
769                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT:
770                        if (embeddingLevel == 0) {
771                            return DIR_LTR;
772                        }
773                        if (lastNonEmptyEmbeddingLevel == 0) {
774                            lastNonEmptyEmbeddingLevel = embeddingLevel;
775                        }
776                        break;
777                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING:
778                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE:
779                        if (lastNonEmptyEmbeddingLevel == embeddingLevel) {
780                            return DIR_LTR;
781                        }
782                        --embeddingLevel;
783                        break;
784                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT:
785                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC:
786                        if (embeddingLevel == 0) {
787                            return DIR_RTL;
788                        }
789                        if (lastNonEmptyEmbeddingLevel == 0) {
790                            lastNonEmptyEmbeddingLevel = embeddingLevel;
791                        }
792                        break;
793                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING:
794                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE:
795                        if (lastNonEmptyEmbeddingLevel == embeddingLevel) {
796                            return DIR_RTL;
797                        }
798                        --embeddingLevel;
799                        break;
800                    case Character.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT:
801                        ++embeddingLevel;
802                        break;
803                    case Character.DIRECTIONALITY_BOUNDARY_NEUTRAL:
804                        break;
805                    default:
806                        if (lastNonEmptyEmbeddingLevel == 0) {
807                            lastNonEmptyEmbeddingLevel = embeddingLevel;
808                        }
809                        break;
810                }
811            }
812            return DIR_UNKNOWN;
813        }
814
815        // Internal methods
816
817        /**
818         * Gets the bidi character class, i.e. Character.getDirectionality(), of a given char, using
819         * a cache for speed. Not designed for supplementary codepoints, whose results we do not
820         * cache.
821         */
822        private static byte getCachedDirectionality(char c) {
823            return c < DIR_TYPE_CACHE_SIZE ? DIR_TYPE_CACHE[c] : getDirectionality(c);
824        }
825
826        /**
827         * Returns the Character.DIRECTIONALITY_... value of the next codepoint and advances
828         * charIndex. If isHtml, and the codepoint is '<' or '&', advances through the tag/entity,
829         * and returns Character.DIRECTIONALITY_WHITESPACE. For an entity, it would be best to
830         * figure out the actual character, and return its dirtype, but treating it as whitespace is
831         * good enough for our purposes.
832         *
833         * @throws java.lang.IndexOutOfBoundsException if called when charIndex >= length or < 0.
834         */
835        byte dirTypeForward() {
836            lastChar = text.charAt(charIndex);
837            if (Character.isHighSurrogate(lastChar)) {
838                int codePoint = Character.codePointAt(text, charIndex);
839                charIndex += Character.charCount(codePoint);
840                return getDirectionality(codePoint);
841            }
842            charIndex++;
843            byte dirType = getCachedDirectionality(lastChar);
844            if (isHtml) {
845                // Process tags and entities.
846                if (lastChar == '<') {
847                    dirType = skipTagForward();
848                } else if (lastChar == '&') {
849                    dirType = skipEntityForward();
850                }
851            }
852            return dirType;
853        }
854
855        /**
856         * Returns the Character.DIRECTIONALITY_... value of the preceding codepoint and advances
857         * charIndex backwards. If isHtml, and the codepoint is the end of a complete HTML tag or
858         * entity, advances over the whole tag/entity and returns
859         * Character.DIRECTIONALITY_WHITESPACE. For an entity, it would be best to figure out the
860         * actual character, and return its dirtype, but treating it as whitespace is good enough
861         * for our purposes.
862         *
863         * @throws java.lang.IndexOutOfBoundsException if called when charIndex > length or <= 0.
864         */
865        byte dirTypeBackward() {
866            lastChar = text.charAt(charIndex - 1);
867            if (Character.isLowSurrogate(lastChar)) {
868                int codePoint = Character.codePointBefore(text, charIndex);
869                charIndex -= Character.charCount(codePoint);
870                return getDirectionality(codePoint);
871            }
872            charIndex--;
873            byte dirType = getCachedDirectionality(lastChar);
874            if (isHtml) {
875                // Process tags and entities.
876                if (lastChar == '>') {
877                    dirType = skipTagBackward();
878                } else if (lastChar == ';') {
879                    dirType = skipEntityBackward();
880                }
881            }
882            return dirType;
883        }
884
885        /**
886         * Advances charIndex forward through an HTML tag (after the opening &lt; has already been
887         * read) and returns Character.DIRECTIONALITY_WHITESPACE. If there is no matching &gt;,
888         * does not change charIndex and returns Character.DIRECTIONALITY_OTHER_NEUTRALS (for the
889         * &lt; that hadn't been part of a tag after all).
890         */
891        private byte skipTagForward() {
892            int initialCharIndex = charIndex;
893            while (charIndex < length) {
894                lastChar = text.charAt(charIndex++);
895                if (lastChar == '>') {
896                    // The end of the tag.
897                    return Character.DIRECTIONALITY_WHITESPACE;
898                }
899                if (lastChar == '"' || lastChar == '\'') {
900                    // Skip over a quoted attribute value inside the tag.
901                    char quote = lastChar;
902                    while (charIndex < length && (lastChar = text.charAt(charIndex++)) != quote) {}
903                }
904            }
905            // The original '<' wasn't the start of a tag after all.
906            charIndex = initialCharIndex;
907            lastChar = '<';
908            return Character.DIRECTIONALITY_OTHER_NEUTRALS;
909        }
910
911        /**
912         * Advances charIndex backward through an HTML tag (after the closing &gt; has already been
913         * read) and returns Character.DIRECTIONALITY_WHITESPACE. If there is no matching &lt;, does
914         * not change charIndex and returns Character.DIRECTIONALITY_OTHER_NEUTRALS (for the &gt;
915         * that hadn't been part of a tag after all). Nevertheless, the running time for calling
916         * skipTagBackward() in a loop remains linear in the size of the text, even for a text like
917         * "&gt;&gt;&gt;&gt;", because skipTagBackward() also stops looking for a matching &lt;
918         * when it encounters another &gt;.
919         */
920        private byte skipTagBackward() {
921            int initialCharIndex = charIndex;
922            while (charIndex > 0) {
923                lastChar = text.charAt(--charIndex);
924                if (lastChar == '<') {
925                    // The start of the tag.
926                    return Character.DIRECTIONALITY_WHITESPACE;
927                }
928                if (lastChar == '>') {
929                    break;
930                }
931                if (lastChar == '"' || lastChar == '\'') {
932                    // Skip over a quoted attribute value inside the tag.
933                    char quote = lastChar;
934                    while (charIndex > 0 && (lastChar = text.charAt(--charIndex)) != quote) {}
935                }
936            }
937            // The original '>' wasn't the end of a tag after all.
938            charIndex = initialCharIndex;
939            lastChar = '>';
940            return Character.DIRECTIONALITY_OTHER_NEUTRALS;
941        }
942
943        /**
944         * Advances charIndex forward through an HTML character entity tag (after the opening
945         * &amp; has already been read) and returns Character.DIRECTIONALITY_WHITESPACE. It would be
946         * best to figure out the actual character and return its dirtype, but this is good enough.
947         */
948        private byte skipEntityForward() {
949            while (charIndex < length && (lastChar = text.charAt(charIndex++)) != ';') {}
950            return Character.DIRECTIONALITY_WHITESPACE;
951        }
952
953        /**
954         * Advances charIndex backward through an HTML character entity tag (after the closing ;
955         * has already been read) and returns Character.DIRECTIONALITY_WHITESPACE. It would be best
956         * to figure out the actual character and return its dirtype, but this is good enough.
957         * If there is no matching &amp;, does not change charIndex and returns
958         * Character.DIRECTIONALITY_OTHER_NEUTRALS (for the ';' that did not start an entity after
959         * all). Nevertheless, the running time for calling skipEntityBackward() in a loop remains
960         * linear in the size of the text, even for a text like ";;;;;;;", because skipTagBackward()
961         * also stops looking for a matching &amp; when it encounters another ;.
962         */
963        private byte skipEntityBackward() {
964            int initialCharIndex = charIndex;
965            while (charIndex > 0) {
966                lastChar = text.charAt(--charIndex);
967                if (lastChar == '&') {
968                    return Character.DIRECTIONALITY_WHITESPACE;
969                }
970                if (lastChar == ';') {
971                    break;
972                }
973            }
974            charIndex = initialCharIndex;
975            lastChar = ';';
976            return Character.DIRECTIONALITY_OTHER_NEUTRALS;
977        }
978    }
979}
980