1/*
2 * Copyright (C) 2013 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package android.support.v4.text;
18
19import android.support.v4.view.ViewCompat;
20import android.text.SpannableStringBuilder;
21
22import java.util.Locale;
23
24import static android.support.v4.text.TextDirectionHeuristicsCompat.FIRSTSTRONG_LTR;
25
26/**
27 * Utility class for formatting text for display in a potentially opposite-directionality context
28 * without garbling. The directionality of the context is set at formatter creation and the
29 * directionality of the text can be either estimated or passed in when known. Provides the
30 * following functionality:
31 * <p>
32 * 1. Bidi Wrapping
33 * When text in one language is mixed into a document in another, opposite-directionality language,
34 * e.g. when an English business name is embedded in a Hebrew web page, both the inserted string
35 * and the text surrounding it may be displayed incorrectly unless the inserted string is explicitly
36 * separated from the surrounding text in a "wrapper" that:
37 * <p>
38 * - Declares its directionality so that the string is displayed correctly. This can be done in
39 *   Unicode bidi formatting codes by {@link #unicodeWrap} and similar methods.
40 * <p>
41 * - Isolates the string's directionality, so it does not unduly affect the surrounding content.
42 *   Currently, this can only be done using invisible Unicode characters of the same direction as
43 *   the context (LRM or RLM) in addition to the directionality declaration above, thus "resetting"
44 *   the directionality to that of the context. The "reset" may need to be done at both ends of the
45 *   string. Without "reset" after the string, the string will "stick" to a number or logically
46 *   separate opposite-direction text that happens to follow it in-line (even if separated by
47 *   neutral content like spaces and punctuation). Without "reset" before the string, the same can
48 *   happen there, but only with more opposite-direction text, not a number. One approach is to
49 *   "reset" the direction only after each string, on the theory that if the preceding opposite-
50 *   direction text is itself bidi-wrapped, the "reset" after it will prevent the sticking. (Doing
51 *   the "reset" only before each string definitely does not work because we do not want to require
52 *   bidi-wrapping numbers, and a bidi-wrapped opposite-direction string could be followed by a
53 *   number.) Still, the safest policy is to do the "reset" on both ends of each string, since RTL
54 *   message translations often contain untranslated Latin-script brand names and technical terms,
55 *   and one of these can be followed by a bidi-wrapped inserted value. On the other hand, when one
56 *   has such a message, it is best to do the "reset" manually in the message translation itself,
57 *   since the message's opposite-direction text could be followed by an inserted number, which we
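 *   would not bidi-wrap anyway. That reasoning favors doing the "reset" only after the string;
 *   note, however, that the formatter's actual default is to "reset" on both ends (see
 *   {@link Builder#stereoReset(boolean)}). In an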
59 *   alternative to "reset", recent additions to the HTML, CSS, and Unicode standards allow the
60 *   isolation to be part of the directionality declaration. This form of isolation is better than
61 *   "reset" because it takes less space, does not require knowing the context directionality, has a
62 *   gentler effect than "reset", and protects both ends of the string. However, we do not yet allow
63 *   using it because required platforms do not yet support it.
64 * <p>
65 * Providing these wrapping services is the basic purpose of the bidi formatter.
66 * <p>
67 * 2. Directionality estimation
68 * How does one know whether a string about to be inserted into surrounding text has the same
69 * directionality? Well, in many cases, one knows that this must be the case when writing the code
70 * doing the insertion, e.g. when a localized message is inserted into a localized page. In such
71 * cases there is no need to involve the bidi formatter at all. In some other cases, it need not be
72 * the same as the context, but is either constant (e.g. urls are always LTR) or otherwise known.
73 * In the remaining cases, e.g. when the string is user-entered or comes from a database, the
74 * language of the string (and thus its directionality) is not known a priori, and must be
75 * estimated at run-time. The bidi formatter can do this automatically using the default
76 * first-strong estimation algorithm. It can also be configured to use a custom directionality
77 * estimation object.
78 */
79public final class BidiFormatter {
80
81    /**
82     * The default text direction heuristic.
83     */
84    private static TextDirectionHeuristicCompat DEFAULT_TEXT_DIRECTION_HEURISTIC = FIRSTSTRONG_LTR;
85
86    /**
87     * Unicode "Left-To-Right Embedding" (LRE) character.
88     */
89    private static final char LRE = '\u202A';
90
91    /**
92     * Unicode "Right-To-Left Embedding" (RLE) character.
93     */
94    private static final char RLE = '\u202B';
95
96    /**
97     * Unicode "Pop Directional Formatting" (PDF) character.
98     */
99    private static final char PDF = '\u202C';
100
101    /**
102     *  Unicode "Left-To-Right Mark" (LRM) character.
103     */
104    private static final char LRM = '\u200E';
105
106    /*
107     * Unicode "Right-To-Left Mark" (RLM) character.
108     */
109    private static final char RLM = '\u200F';
110
111    /*
112     * String representation of LRM
113     */
114    private static final String LRM_STRING = Character.toString(LRM);
115
116    /*
117     * String representation of RLM
118     */
119    private static final String RLM_STRING = Character.toString(RLM);
120
121    /**
122     * Empty string constant.
123     */
124    private static final String EMPTY_STRING = "";
125
126    /**
127     * A class for building a BidiFormatter with non-default options.
128     */
129    public static final class Builder {
130        private boolean mIsRtlContext;
131        private int mFlags;
132        private TextDirectionHeuristicCompat mTextDirectionHeuristicCompat;
133
134        /**
135         * Constructor.
136         *
137         */
138        public Builder() {
139            initialize(isRtlLocale(Locale.getDefault()));
140        }
141
142        /**
143         * Constructor.
144         *
145         * @param rtlContext Whether the context directionality is RTL.
146         */
147        public Builder(boolean rtlContext) {
148            initialize(rtlContext);
149        }
150
151        /**
152         * Constructor.
153         *
154         * @param locale The context locale.
155         */
156        public Builder(Locale locale) {
157            initialize(isRtlLocale(locale));
158        }
159
160        /**
161         * Initializes the builder with the given context directionality and default options.
162         *
163         * @param isRtlContext Whether the context is RTL or not.
164         */
165        private void initialize(boolean isRtlContext) {
166            mIsRtlContext = isRtlContext;
167            mTextDirectionHeuristicCompat = DEFAULT_TEXT_DIRECTION_HEURISTIC;
168            mFlags = DEFAULT_FLAGS;
169        }
170
171        /**
172         * Specifies whether the BidiFormatter to be built should also "reset" directionality before
173         * a string being bidi-wrapped, not just after it. The default is true.
174         */
175        public Builder stereoReset(boolean stereoReset) {
176            if (stereoReset) {
177                mFlags |= FLAG_STEREO_RESET;
178            } else {
179                mFlags &= ~FLAG_STEREO_RESET;
180            }
181            return this;
182        }
183
184        /**
185         * Specifies the default directionality estimation algorithm to be used by the BidiFormatter.
186         * By default, uses the first-strong heuristic.
187         *
188         * @param heuristic the {@code TextDirectionHeuristic} to use.
189         * @return the builder itself.
190         */
191        public Builder setTextDirectionHeuristic(TextDirectionHeuristicCompat heuristic) {
192            mTextDirectionHeuristicCompat = heuristic;
193            return this;
194        }
195
196        private static BidiFormatter getDefaultInstanceFromContext(boolean isRtlContext) {
197            return isRtlContext ? DEFAULT_RTL_INSTANCE : DEFAULT_LTR_INSTANCE;
198        }
199
200        /**
201         * @return A BidiFormatter with the specified options.
202         */
203        public BidiFormatter build() {
204            if (mFlags == DEFAULT_FLAGS &&
205                    mTextDirectionHeuristicCompat == DEFAULT_TEXT_DIRECTION_HEURISTIC) {
206                return getDefaultInstanceFromContext(mIsRtlContext);
207            }
208            return new BidiFormatter(mIsRtlContext, mFlags, mTextDirectionHeuristicCompat);
209        }
210    }
211
212    //
213    private static final int FLAG_STEREO_RESET = 2;
214    private static final int DEFAULT_FLAGS = FLAG_STEREO_RESET;
215
216    private static final BidiFormatter DEFAULT_LTR_INSTANCE = new BidiFormatter(
217            false /* LTR context */,
218            DEFAULT_FLAGS,
219            DEFAULT_TEXT_DIRECTION_HEURISTIC);
220
221    private static final BidiFormatter DEFAULT_RTL_INSTANCE = new BidiFormatter(
222            true /* RTL context */,
223            DEFAULT_FLAGS,
224            DEFAULT_TEXT_DIRECTION_HEURISTIC);
225
226    private final boolean mIsRtlContext;
227    private final int mFlags;
228    private final TextDirectionHeuristicCompat mDefaultTextDirectionHeuristicCompat;
229
230    /**
231     * Factory for creating an instance of BidiFormatter for the default locale directionality.
232     *
233     */
234    public static BidiFormatter getInstance() {
235        return new Builder().build();
236    }
237
238    /**
239     * Factory for creating an instance of BidiFormatter given the context directionality.
240     *
241     * @param rtlContext Whether the context directionality is RTL.
242     */
243    public static BidiFormatter getInstance(boolean rtlContext) {
244        return new Builder(rtlContext).build();
245    }
246
247    /**
248     * Factory for creating an instance of BidiFormatter given the context locale.
249     *
250     * @param locale The context locale.
251     */
252    public static BidiFormatter getInstance(Locale locale) {
253        return new Builder(locale).build();
254    }
255
256    /**
257     * @param isRtlContext Whether the context directionality is RTL or not.
258     * @param flags The option flags.
259     * @param heuristic The default text direction heuristic.
260     */
261    private BidiFormatter(boolean isRtlContext, int flags, TextDirectionHeuristicCompat heuristic) {
262        mIsRtlContext = isRtlContext;
263        mFlags = flags;
264        mDefaultTextDirectionHeuristicCompat = heuristic;
265    }
266
267    /**
268     * @return Whether the context directionality is RTL
269     */
270    public boolean isRtlContext() {
271        return mIsRtlContext;
272    }
273
274    /**
275     * @return Whether directionality "reset" should also be done before a string being
276     * bidi-wrapped, not just after it.
277     */
278    public boolean getStereoReset() {
279        return (mFlags & FLAG_STEREO_RESET) != 0;
280    }
281
282    /**
283     * Returns a Unicode bidi mark matching the context directionality (LRM or RLM) if either the
284     * overall or the exit directionality of a given CharSequence is opposite to the context
285     * directionality. Putting this after the CharSequence (including its directionality
286     * declaration wrapping) prevents it from "sticking" to other opposite-directionality text or a
287     * number appearing after it inline with only neutral content in between. Otherwise returns
288     * the empty string. While the exit directionality is determined by scanning the end of the
289     * CharSequence, the overall directionality is given explicitly by a heuristic to estimate the
290     * {@code str}'s directionality.
291     *
292     * @param str CharSequence after which the mark may need to appear.
293     * @param heuristic The text direction heuristic that will be used to estimate the {@code str}'s
294     *                  directionality.
295     * @return LRM for RTL text in LTR context; RLM for LTR text in RTL context;
     *     else, the empty string.
297     */
298    private String markAfter(CharSequence str, TextDirectionHeuristicCompat heuristic) {
299        final boolean isRtl = heuristic.isRtl(str, 0, str.length());
300        // getExitDir() is called only if needed (short-circuit).
301        if (!mIsRtlContext && (isRtl || getExitDir(str) == DIR_RTL)) {
302            return LRM_STRING;
303        }
304        if (mIsRtlContext && (!isRtl || getExitDir(str) == DIR_LTR)) {
305            return RLM_STRING;
306        }
307        return EMPTY_STRING;
308    }
309
310    /**
311     * Returns a Unicode bidi mark matching the context directionality (LRM or RLM) if either the
312     * overall or the entry directionality of a given CharSequence is opposite to the context
313     * directionality. Putting this before the CharSequence (including its directionality
314     * declaration wrapping) prevents it from "sticking" to other opposite-directionality text
315     * appearing before it inline with only neutral content in between. Otherwise returns the
316     * empty string. While the entry directionality is determined by scanning the beginning of the
317     * CharSequence, the overall directionality is given explicitly by a heuristic to estimate the
318     * {@code str}'s directionality.
319     *
320     * @param str CharSequence before which the mark may need to appear.
321     * @param heuristic The text direction heuristic that will be used to estimate the {@code str}'s
322     *                  directionality.
323     * @return LRM for RTL text in LTR context; RLM for LTR text in RTL context;
324     *     else, the empty string.
325     */
326    private String markBefore(CharSequence str, TextDirectionHeuristicCompat heuristic) {
327        final boolean isRtl = heuristic.isRtl(str, 0, str.length());
328        // getEntryDir() is called only if needed (short-circuit).
329        if (!mIsRtlContext && (isRtl || getEntryDir(str) == DIR_RTL)) {
330            return LRM_STRING;
331        }
332        if (mIsRtlContext && (!isRtl || getEntryDir(str) == DIR_LTR)) {
333            return RLM_STRING;
334        }
335        return EMPTY_STRING;
336    }
337
338    /**
339     * Estimates the directionality of a string using the default text direction heuristic.
340     *
341     * @param str String whose directionality is to be estimated.
342     * @return true if {@code str}'s estimated overall directionality is RTL. Otherwise returns
343     *          false.
344     */
345    public boolean isRtl(String str) {
346        return isRtl((CharSequence) str);
347    }
348
349    /**
350     * Operates like {@link #isRtl(String)}, but takes a CharSequence instead of a string.
351     *
352     * @param str CharSequence whose directionality is to be estimated.
353     * @return true if {@code str}'s estimated overall directionality is RTL. Otherwise returns
354     *          false.
355     */
356    public boolean isRtl(CharSequence str) {
357        return mDefaultTextDirectionHeuristicCompat.isRtl(str, 0, str.length());
358    }
359
360    /**
361     * Formats a string of given directionality for use in plain-text output of the context
362     * directionality, so an opposite-directionality string is neither garbled nor garbles its
363     * surroundings. This makes use of Unicode bidi formatting characters.
364     * <p>
365     * The algorithm: In case the given directionality doesn't match the context directionality, wraps
366     * the string with Unicode bidi formatting characters: RLE+{@code str}+PDF for RTL text, or
367     * LRE+{@code str}+PDF for LTR text.
368     * <p>
369     * If {@code isolate}, directionally isolates the string so that it does not garble its
370     * surroundings. Currently, this is done by "resetting" the directionality after the string by
371     * appending a trailing Unicode bidi mark matching the context directionality (LRM or RLM) when
372     * either the overall directionality or the exit directionality of the string is opposite to
373     * that of the context. Unless the formatter was built using
374     * {@link Builder#stereoReset(boolean)} with a {@code false} argument, also prepends a Unicode
375     * bidi mark matching the context directionality when either the overall directionality or the
376     * entry directionality of the string is opposite to that of the context. Note that as opposed
377     * to the overall directionality, the entry and exit directionalities are determined from the
378     * string itself.
379     * <p>
380     * Does *not* do HTML-escaping.
381     *
382     * @param str The input string.
383     * @param heuristic The algorithm to be used to estimate the string's overall direction.
384     * @param isolate Whether to directionally isolate the string to prevent it from garbling the
385     *     content around it
386     * @return Input string after applying the above processing. {@code null} if {@code str} is
387     *     {@code null}.
388     */
389    public String unicodeWrap(String str, TextDirectionHeuristicCompat heuristic, boolean isolate) {
390        if (str == null) return null;
391        return unicodeWrap((CharSequence) str, heuristic, isolate).toString();
392    }
393
394    /**
395     * Operates like {@link #unicodeWrap(String,
396     * android.support.v4.text.TextDirectionHeuristicCompat, boolean)}, but takes a CharSequence
397     * instead of a string
398     *
399     * @param str The input CharSequence.
400     * @param heuristic The algorithm to be used to estimate the CharSequence's overall direction.
401     *        See {@link android.support.v4.text.TextDirectionHeuristicsCompat} for pre-defined
402     *        heuristics.
403     * @param isolate Whether to directionally isolate the CharSequence to prevent it from garbling
404     *     the content around it
405     * @return Input CharSequence after applying the above processing. {@code null} if {@code str}
406     *     is {@code null}.
407     */
408    public CharSequence unicodeWrap(CharSequence str, TextDirectionHeuristicCompat heuristic,
409            boolean isolate) {
410        if (str == null) return null;
411        final boolean isRtl = heuristic.isRtl(str, 0, str.length());
412        SpannableStringBuilder result = new SpannableStringBuilder();
413        if (getStereoReset() && isolate) {
414            result.append(markBefore(str,
415                    isRtl ? TextDirectionHeuristicsCompat.RTL : TextDirectionHeuristicsCompat.LTR));
416        }
417        if (isRtl != mIsRtlContext) {
418            result.append(isRtl ? RLE : LRE);
419            result.append(str);
420            result.append(PDF);
421        } else {
422            result.append(str);
423        }
424        if (isolate) {
425            result.append(markAfter(str,
426                    isRtl ? TextDirectionHeuristicsCompat.RTL : TextDirectionHeuristicsCompat.LTR));
427        }
428        return result;
429    }
430
431    /**
432     * Operates like {@link #unicodeWrap(String, android.support.v4.text.TextDirectionHeuristicCompat, boolean)}, but assumes
433     * {@code isolate} is true.
434     *
435     * @param str The input string.
436     * @param heuristic The algorithm to be used to estimate the string's overall direction.
437     * @return Input string after applying the above processing.
438     */
439    public String unicodeWrap(String str, TextDirectionHeuristicCompat heuristic) {
440        return unicodeWrap(str, heuristic, true /* isolate */);
441    }
442
443    /**
444     * Operates like {@link #unicodeWrap(CharSequence,
445     * android.support.v4.text.TextDirectionHeuristicCompat, boolean)}, but assumes {@code isolate}
446     * is true.
447     *
448     * @param str The input CharSequence.
449     * @param heuristic The algorithm to be used to estimate the CharSequence's overall direction.
450     *        See {@link android.support.v4.text.TextDirectionHeuristicsCompat} for pre-defined
451     *        heuristics.
452     * @return Input CharSequence after applying the above processing.
453     */
454    public CharSequence unicodeWrap(CharSequence str, TextDirectionHeuristicCompat heuristic) {
455        return unicodeWrap(str, heuristic, true /* isolate */);
456    }
457
458    /**
459     * Operates like {@link #unicodeWrap(String, android.support.v4.text.TextDirectionHeuristicCompat, boolean)}, but uses the
460     * formatter's default direction estimation algorithm.
461     *
462     * @param str The input string.
463     * @param isolate Whether to directionally isolate the string to prevent it from garbling the
464     *     content around it
465     * @return Input string after applying the above processing.
466     */
467    public String unicodeWrap(String str, boolean isolate) {
468        return unicodeWrap(str, mDefaultTextDirectionHeuristicCompat, isolate);
469    }
470
471    /**
472     * Operates like {@link #unicodeWrap(CharSequence,
473     * android.support.v4.text.TextDirectionHeuristicCompat, boolean)}, but uses the formatter's
474     * default direction estimation algorithm.
475     *
476     * @param str The input CharSequence.
477     * @param isolate Whether to directionally isolate the CharSequence to prevent it from garbling
478     *     the content around it
479     * @return Input CharSequence after applying the above processing.
480     */
481    public CharSequence unicodeWrap(CharSequence str, boolean isolate) {
482        return unicodeWrap(str, mDefaultTextDirectionHeuristicCompat, isolate);
483    }
484
485    /**
486     * Operates like {@link #unicodeWrap(String, android.support.v4.text.TextDirectionHeuristicCompat, boolean)}, but uses the
487     * formatter's default direction estimation algorithm and assumes {@code isolate} is true.
488     *
489     * @param str The input string.
490     * @return Input string after applying the above processing.
491     */
492    public String unicodeWrap(String str) {
493        return unicodeWrap(str, mDefaultTextDirectionHeuristicCompat, true /* isolate */);
494    }
495
496    /**
497     * Operates like {@link #unicodeWrap(CharSequence,
498     * android.support.v4.text.TextDirectionHeuristicCompat, boolean)}, but uses the formatter's
499     * default direction estimation algorithm and assumes {@code isolate} is true.
500     *
501     * @param str The input CharSequence.
502     * @return Input CharSequence after applying the above processing.
503     */
504    public CharSequence unicodeWrap(CharSequence str) {
505        return unicodeWrap(str, mDefaultTextDirectionHeuristicCompat, true /* isolate */);
506    }
507
508    /**
509     * Helper method to return true if the Locale directionality is RTL.
510     *
511     * @param locale The Locale whose directionality will be checked to be RTL or LTR
512     * @return true if the {@code locale} directionality is RTL. False otherwise.
513     */
514    private static boolean isRtlLocale(Locale locale) {
515        return (TextUtilsCompat.getLayoutDirectionFromLocale(locale) == ViewCompat.LAYOUT_DIRECTION_RTL);
516    }
517
518    /**
519     * Enum for directionality type.
520     */
521    private static final int DIR_LTR = -1;
522    private static final int DIR_UNKNOWN = 0;
523    private static final int DIR_RTL = +1;
524
525    /**
526     * Returns the directionality of the last character with strong directionality in the string, or
527     * DIR_UNKNOWN if none was encountered. For efficiency, actually scans backwards from the end of
528     * the string. Treats a non-BN character between an LRE/RLE/LRO/RLO and its matching PDF as a
529     * strong character, LTR after LRE/LRO, and RTL after RLE/RLO. The results are undefined for a
530     * string containing unbalanced LRE/RLE/LRO/RLO/PDF characters. The intended use is to check
531     * whether a logically separate item that starts with a number or a character of the string's
532     * exit directionality and follows this string inline (not counting any neutral characters in
533     * between) would "stick" to it in an opposite-directionality context, thus being displayed in
534     * an incorrect position. An LRM or RLM character (the one of the context's directionality)
535     * between the two will prevent such sticking.
536     *
537     * @param str the string to check.
538     */
539    private static int getExitDir(CharSequence str) {
540        return new DirectionalityEstimator(str, false /* isHtml */).getExitDir();
541    }
542
543    /**
544     * Returns the directionality of the first character with strong directionality in the string,
545     * or DIR_UNKNOWN if none was encountered. Treats a non-BN character between an
546     * LRE/RLE/LRO/RLO and its matching PDF as a strong character, LTR after LRE/LRO, and RTL after
547     * RLE/RLO. The results are undefined for a string containing unbalanced LRE/RLE/LRO/RLO/PDF
548     * characters. The intended use is to check whether a logically separate item that ends with a
549     * character of the string's entry directionality and precedes the string inline (not counting
550     * any neutral characters in between) would "stick" to it in an opposite-directionality context,
551     * thus being displayed in an incorrect position. An LRM or RLM character (the one of the
552     * context's directionality) between the two will prevent such sticking.
553     *
554     * @param str the string to check.
555     */
556    private static int getEntryDir(CharSequence str) {
557        return new DirectionalityEstimator(str, false /* isHtml */).getEntryDir();
558    }
559
560    /**
561     * An object that estimates the directionality of a given string by various methods.
562     *
563     */
564    private static class DirectionalityEstimator {
565
566        // Internal static variables and constants.
567
568        /**
569         * Size of the bidi character class cache. The results of the Character.getDirectionality()
570         * calls on the lowest DIR_TYPE_CACHE_SIZE codepoints are kept in an array for speed.
571         * The 0x700 value is designed to leave all the European and Near Eastern languages in the
572         * cache. It can be reduced to 0x180, restricting the cache to the Western European
573         * languages.
574         */
575        private static final int DIR_TYPE_CACHE_SIZE = 0x700;
576
577        /**
578         * The bidi character class cache.
579         */
580        private static final byte DIR_TYPE_CACHE[];
581
582        static {
583            DIR_TYPE_CACHE = new byte[DIR_TYPE_CACHE_SIZE];
584            for (int i = 0; i < DIR_TYPE_CACHE_SIZE; i++) {
585                DIR_TYPE_CACHE[i] = Character.getDirectionality(i);
586            }
587        }
588
589        // Internal instance variables.
590
591        /**
592         * The text to be scanned.
593         */
594        private final CharSequence text;
595
596        /**
597         * Whether the text to be scanned is to be treated as HTML, i.e. skipping over tags and
598         * entities when looking for the next / preceding dir type.
599         */
600        private final boolean isHtml;
601
602        /**
603         * The length of the text in chars.
604         */
605        private final int length;
606
607        /**
608         * The current position in the text.
609         */
610        private int charIndex;
611
612        /**
613         * The char encountered by the last dirTypeForward or dirTypeBackward call. If it
614         * encountered a supplementary codepoint, this contains a char that is not a valid
615         * codepoint. This is ok, because this member is only used to detect some well-known ASCII
616         * syntax, e.g. "http://" and the beginning of an HTML tag or entity.
617         */
618        private char lastChar;
619
620        /**
621         * Constructor.
622         *
623         * @param text The string to scan.
624         * @param isHtml Whether the text to be scanned is to be treated as HTML, i.e. skipping over
625         *     tags and entities.
626         */
627        DirectionalityEstimator(CharSequence text, boolean isHtml) {
628            this.text = text;
629            this.isHtml = isHtml;
630            length = text.length();
631        }
632
633        /**
634         * Returns the directionality of the first character with strong directionality in the
635         * string, or DIR_UNKNOWN if none was encountered. Treats a non-BN character between an
636         * LRE/RLE/LRO/RLO and its matching PDF as a strong character, LTR after LRE/LRO, and RTL
637         * after RLE/RLO. The results are undefined for a string containing unbalanced
638         * LRE/RLE/LRO/RLO/PDF characters.
639         */
640        int getEntryDir() {
641            // The reason for this method name, as opposed to getFirstStrongDir(), is that
642            // "first strong" is a commonly used description of Unicode's estimation algorithm,
643            // but the two must treat formatting characters quite differently. Thus, we are staying
644            // away from both "first" and "last" in these method names to avoid confusion.
645            charIndex = 0;
646            int embeddingLevel = 0;
647            int embeddingLevelDir = DIR_UNKNOWN;
648            int firstNonEmptyEmbeddingLevel = 0;
649            while (charIndex < length && firstNonEmptyEmbeddingLevel == 0) {
650                switch (dirTypeForward()) {
651                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING:
652                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE:
653                        ++embeddingLevel;
654                        embeddingLevelDir = DIR_LTR;
655                        break;
656                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING:
657                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE:
658                        ++embeddingLevel;
659                        embeddingLevelDir = DIR_RTL;
660                        break;
661                    case Character.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT:
662                        --embeddingLevel;
663                        // To restore embeddingLevelDir to its previous value, we would need a
664                        // stack, which we want to avoid. Thus, at this point we do not know the
665                        // current embedding's directionality.
666                        embeddingLevelDir = DIR_UNKNOWN;
667                        break;
668                    case Character.DIRECTIONALITY_BOUNDARY_NEUTRAL:
669                        break;
670                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT:
671                        if (embeddingLevel == 0) {
672                            return DIR_LTR;
673                        }
674                        firstNonEmptyEmbeddingLevel = embeddingLevel;
675                        break;
676                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT:
677                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC:
678                        if (embeddingLevel == 0) {
679                            return DIR_RTL;
680                        }
681                        firstNonEmptyEmbeddingLevel = embeddingLevel;
682                        break;
683                    default:
684                        firstNonEmptyEmbeddingLevel = embeddingLevel;
685                        break;
686                }
687            }
688
689            // We have either found a non-empty embedding or scanned the entire string finding
690            // neither a non-empty embedding nor a strong character outside of an embedding.
691            if (firstNonEmptyEmbeddingLevel == 0) {
692                // We have not found a non-empty embedding. Thus, the string contains neither a
693                // non-empty embedding nor a strong character outside of an embedding.
694                return DIR_UNKNOWN;
695            }
696
697            // We have found a non-empty embedding.
698            if (embeddingLevelDir != DIR_UNKNOWN) {
699                // We know the directionality of the non-empty embedding.
700                return embeddingLevelDir;
701            }
702
703            // We do not remember the directionality of the non-empty embedding we found. So, we go
704            // backwards to find the start of the non-empty embedding and get its directionality.
705            while (charIndex > 0) {
706                switch (dirTypeBackward()) {
707                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING:
708                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE:
709                        if (firstNonEmptyEmbeddingLevel == embeddingLevel) {
710                            return DIR_LTR;
711                        }
712                        --embeddingLevel;
713                        break;
714                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING:
715                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE:
716                        if (firstNonEmptyEmbeddingLevel == embeddingLevel) {
717                            return DIR_RTL;
718                        }
719                        --embeddingLevel;
720                        break;
721                    case Character.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT:
722                        ++embeddingLevel;
723                        break;
724                }
725            }
726            // We should never get here.
727            return DIR_UNKNOWN;
728        }
729
730        /**
731         * Returns the directionality of the last character with strong directionality in the
732         * string, or DIR_UNKNOWN if none was encountered. For efficiency, actually scans backwards
733         * from the end of the string. Treats a non-BN character between an LRE/RLE/LRO/RLO and its
734         * matching PDF as a strong character, LTR after LRE/LRO, and RTL after RLE/RLO. The results
735         * are undefined for a string containing unbalanced LRE/RLE/LRO/RLO/PDF characters.
736         */
737        int getExitDir() {
738            // The reason for this method name, as opposed to getLastStrongDir(), is that "last
739            // strong" sounds like the exact opposite of "first strong", which is a commonly used
740            // description of Unicode's estimation algorithm (getUnicodeDir() above), but the two
741            // must treat formatting characters quite differently. Thus, we are staying away from
742            // both "first" and "last" in these method names to avoid confusion.
743            charIndex = length;
744            int embeddingLevel = 0;
745            int lastNonEmptyEmbeddingLevel = 0;
746            while (charIndex > 0) {
747                switch (dirTypeBackward()) {
748                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT:
749                        if (embeddingLevel == 0) {
750                            return DIR_LTR;
751                        }
752                        if (lastNonEmptyEmbeddingLevel == 0) {
753                            lastNonEmptyEmbeddingLevel = embeddingLevel;
754                        }
755                        break;
756                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING:
757                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE:
758                        if (lastNonEmptyEmbeddingLevel == embeddingLevel) {
759                            return DIR_LTR;
760                        }
761                        --embeddingLevel;
762                        break;
763                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT:
764                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC:
765                        if (embeddingLevel == 0) {
766                            return DIR_RTL;
767                        }
768                        if (lastNonEmptyEmbeddingLevel == 0) {
769                            lastNonEmptyEmbeddingLevel = embeddingLevel;
770                        }
771                        break;
772                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING:
773                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE:
774                        if (lastNonEmptyEmbeddingLevel == embeddingLevel) {
775                            return DIR_RTL;
776                        }
777                        --embeddingLevel;
778                        break;
779                    case Character.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT:
780                        ++embeddingLevel;
781                        break;
782                    case Character.DIRECTIONALITY_BOUNDARY_NEUTRAL:
783                        break;
784                    default:
785                        if (lastNonEmptyEmbeddingLevel == 0) {
786                            lastNonEmptyEmbeddingLevel = embeddingLevel;
787                        }
788                        break;
789                }
790            }
791            return DIR_UNKNOWN;
792        }
793
794        // Internal methods
795
796        /**
797         * Gets the bidi character class, i.e. Character.getDirectionality(), of a given char, using
798         * a cache for speed. Not designed for supplementary codepoints, whose results we do not
799         * cache.
800         */
801        private static byte getCachedDirectionality(char c) {
802            return c < DIR_TYPE_CACHE_SIZE ? DIR_TYPE_CACHE[c] : Character.getDirectionality(c);
803        }
804
805        /**
806         * Returns the Character.DIRECTIONALITY_... value of the next codepoint and advances
807         * charIndex. If isHtml, and the codepoint is '<' or '&', advances through the tag/entity,
808         * and returns Character.DIRECTIONALITY_WHITESPACE. For an entity, it would be best to
809         * figure out the actual character, and return its dirtype, but treating it as whitespace is
810         * good enough for our purposes.
811         *
812         * @throws java.lang.IndexOutOfBoundsException if called when charIndex >= length or < 0.
813         */
814        byte dirTypeForward() {
815            lastChar = text.charAt(charIndex);
816            if (Character.isHighSurrogate(lastChar)) {
817                int codePoint = Character.codePointAt(text, charIndex);
818                charIndex += Character.charCount(codePoint);
819                return Character.getDirectionality(codePoint);
820            }
821            charIndex++;
822            byte dirType = getCachedDirectionality(lastChar);
823            if (isHtml) {
824                // Process tags and entities.
825                if (lastChar == '<') {
826                    dirType = skipTagForward();
827                } else if (lastChar == '&') {
828                    dirType = skipEntityForward();
829                }
830            }
831            return dirType;
832        }
833
834        /**
835         * Returns the Character.DIRECTIONALITY_... value of the preceding codepoint and advances
836         * charIndex backwards. If isHtml, and the codepoint is the end of a complete HTML tag or
837         * entity, advances over the whole tag/entity and returns
838         * Character.DIRECTIONALITY_WHITESPACE. For an entity, it would be best to figure out the
839         * actual character, and return its dirtype, but treating it as whitespace is good enough
840         * for our purposes.
841         *
842         * @throws java.lang.IndexOutOfBoundsException if called when charIndex > length or <= 0.
843         */
844        byte dirTypeBackward() {
845            lastChar = text.charAt(charIndex - 1);
846            if (Character.isLowSurrogate(lastChar)) {
847                int codePoint = Character.codePointBefore(text, charIndex);
848                charIndex -= Character.charCount(codePoint);
849                return Character.getDirectionality(codePoint);
850            }
851            charIndex--;
852            byte dirType = getCachedDirectionality(lastChar);
853            if (isHtml) {
854                // Process tags and entities.
855                if (lastChar == '>') {
856                    dirType = skipTagBackward();
857                } else if (lastChar == ';') {
858                    dirType = skipEntityBackward();
859                }
860            }
861            return dirType;
862        }
863
864        /**
865         * Advances charIndex forward through an HTML tag (after the opening &lt; has already been
866         * read) and returns Character.DIRECTIONALITY_WHITESPACE. If there is no matching &gt;,
867         * does not change charIndex and returns Character.DIRECTIONALITY_OTHER_NEUTRALS (for the
868         * &lt; that hadn't been part of a tag after all).
869         */
870        private byte skipTagForward() {
871            int initialCharIndex = charIndex;
872            while (charIndex < length) {
873                lastChar = text.charAt(charIndex++);
874                if (lastChar == '>') {
875                    // The end of the tag.
876                    return Character.DIRECTIONALITY_WHITESPACE;
877                }
878                if (lastChar == '"' || lastChar == '\'') {
879                    // Skip over a quoted attribute value inside the tag.
880                    char quote = lastChar;
881                    while (charIndex < length && (lastChar = text.charAt(charIndex++)) != quote) {}
882                }
883            }
884            // The original '<' wasn't the start of a tag after all.
885            charIndex = initialCharIndex;
886            lastChar = '<';
887            return Character.DIRECTIONALITY_OTHER_NEUTRALS;
888        }
889
890        /**
891         * Advances charIndex backward through an HTML tag (after the closing &gt; has already been
892         * read) and returns Character.DIRECTIONALITY_WHITESPACE. If there is no matching &lt;, does
893         * not change charIndex and returns Character.DIRECTIONALITY_OTHER_NEUTRALS (for the &gt;
894         * that hadn't been part of a tag after all). Nevertheless, the running time for calling
895         * skipTagBackward() in a loop remains linear in the size of the text, even for a text like
896         * "&gt;&gt;&gt;&gt;", because skipTagBackward() also stops looking for a matching &lt;
897         * when it encounters another &gt;.
898         */
899        private byte skipTagBackward() {
900            int initialCharIndex = charIndex;
901            while (charIndex > 0) {
902                lastChar = text.charAt(--charIndex);
903                if (lastChar == '<') {
904                    // The start of the tag.
905                    return Character.DIRECTIONALITY_WHITESPACE;
906                }
907                if (lastChar == '>') {
908                    break;
909                }
910                if (lastChar == '"' || lastChar == '\'') {
911                    // Skip over a quoted attribute value inside the tag.
912                    char quote = lastChar;
913                    while (charIndex > 0 && (lastChar = text.charAt(--charIndex)) != quote) {}
914                }
915            }
916            // The original '>' wasn't the end of a tag after all.
917            charIndex = initialCharIndex;
918            lastChar = '>';
919            return Character.DIRECTIONALITY_OTHER_NEUTRALS;
920        }
921
922        /**
923         * Advances charIndex forward through an HTML character entity tag (after the opening
924         * &amp; has already been read) and returns Character.DIRECTIONALITY_WHITESPACE. It would be
925         * best to figure out the actual character and return its dirtype, but this is good enough.
926         */
927        private byte skipEntityForward() {
928            while (charIndex < length && (lastChar = text.charAt(charIndex++)) != ';') {}
929            return Character.DIRECTIONALITY_WHITESPACE;
930        }
931
932        /**
933         * Advances charIndex backward through an HTML character entity tag (after the closing ;
934         * has already been read) and returns Character.DIRECTIONALITY_WHITESPACE. It would be best
935         * to figure out the actual character and return its dirtype, but this is good enough.
936         * If there is no matching &amp;, does not change charIndex and returns
937         * Character.DIRECTIONALITY_OTHER_NEUTRALS (for the ';' that did not start an entity after
938         * all). Nevertheless, the running time for calling skipEntityBackward() in a loop remains
939         * linear in the size of the text, even for a text like ";;;;;;;", because skipTagBackward()
940         * also stops looking for a matching &amp; when it encounters another ;.
941         */
942        private byte skipEntityBackward() {
943            int initialCharIndex = charIndex;
944            while (charIndex > 0) {
945                lastChar = text.charAt(--charIndex);
946                if (lastChar == '&') {
947                    return Character.DIRECTIONALITY_WHITESPACE;
948                }
949                if (lastChar == ';') {
950                    break;
951                }
952            }
953            charIndex = initialCharIndex;
954            lastChar = ';';
955            return Character.DIRECTIONALITY_OTHER_NEUTRALS;
956        }
957    }
958}