/*
 * Copyright (C) 2013 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package android.text;

import static android.text.TextDirectionHeuristics.FIRSTSTRONG_LTR;

import android.annotation.Nullable;
import android.view.View;

import com.android.internal.annotations.VisibleForTesting;

import java.util.Locale;

/**
 * Utility class for formatting text for display in a potentially opposite-directionality context
 * without garbling. The directionality of the context is set at formatter creation and the
 * directionality of the text can be either estimated or passed in when known.
 *
 * <p>To support versions lower than {@link android.os.Build.VERSION_CODES#JELLY_BEAN_MR2},
 * you can use the support library's {@link android.support.v4.text.BidiFormatter} class.
 *
 * <p>These APIs provide the following functionality:
 * <p>
 * 1. Bidi Wrapping
 * When text in one language is mixed into a document in another, opposite-directionality language,
 * e.g. when an English business name is embedded in some Hebrew text, both the inserted string
 * and the text surrounding it may be displayed incorrectly unless the inserted string is explicitly
 * separated from the surrounding text in a "wrapper" that:
 * <p>
 * - Declares its directionality so that the string is displayed correctly.
* This can be done in Unicode bidi formatting codes by {@link #unicodeWrap} and similar methods.
 * <p>
 * - Isolates the string's directionality, so it does not unduly affect the surrounding content.
 * Currently, this can only be done using invisible Unicode characters of the same direction as
 * the context (LRM or RLM) in addition to the directionality declaration above, thus "resetting"
 * the directionality to that of the context. The "reset" may need to be done at both ends of the
 * string. Without "reset" after the string, the string will "stick" to a number or logically
 * separate opposite-direction text that happens to follow it in-line (even if separated by
 * neutral content like spaces and punctuation). Without "reset" before the string, the same can
 * happen there, but only with more opposite-direction text, not a number. One approach is to
 * "reset" the direction only after each string, on the theory that if the preceding opposite-
 * direction text is itself bidi-wrapped, the "reset" after it will prevent the sticking. (Doing
 * the "reset" only before each string definitely does not work because we do not want to require
 * bidi-wrapping numbers, and a bidi-wrapped opposite-direction string could be followed by a
 * number.) Still, the safest policy is to do the "reset" on both ends of each string, since RTL
 * message translations often contain untranslated Latin-script brand names and technical terms,
 * and one of these can be followed by a bidi-wrapped inserted value. On the other hand, when one
 * has such a message, it is best to do the "reset" manually in the message translation itself,
 * since the message's opposite-direction text could be followed by an inserted number, which we
 * would not bidi-wrap anyway. The current default is the safest policy, i.e. "reset" on both ends
 * of the string; {@link Builder#stereoReset(boolean)} can be used to "reset" only after it.
In an 65 * alternative to "reset", recent additions to the HTML, CSS, and Unicode standards allow the 66 * isolation to be part of the directionality declaration. This form of isolation is better than 67 * "reset" because it takes less space, does not require knowing the context directionality, has a 68 * gentler effect than "reset", and protects both ends of the string. However, we do not yet allow 69 * using it because required platforms do not yet support it. 70 * <p> 71 * Providing these wrapping services is the basic purpose of the bidi formatter. 72 * <p> 73 * 2. Directionality estimation 74 * How does one know whether a string about to be inserted into surrounding text has the same 75 * directionality? Well, in many cases, one knows that this must be the case when writing the code 76 * doing the insertion, e.g. when a localized message is inserted into a localized page. In such 77 * cases there is no need to involve the bidi formatter at all. In some other cases, it need not be 78 * the same as the context, but is either constant (e.g. urls are always LTR) or otherwise known. 79 * In the remaining cases, e.g. when the string is user-entered or comes from a database, the 80 * language of the string (and thus its directionality) is not known a priori, and must be 81 * estimated at run-time. The bidi formatter can do this automatically using the default 82 * first-strong estimation algorithm. It can also be configured to use a custom directionality 83 * estimation object. 84 */ 85public final class BidiFormatter { 86 87 /** 88 * The default text direction heuristic. 89 */ 90 private static TextDirectionHeuristic DEFAULT_TEXT_DIRECTION_HEURISTIC = FIRSTSTRONG_LTR; 91 92 /** 93 * Unicode "Left-To-Right Embedding" (LRE) character. 94 */ 95 private static final char LRE = '\u202A'; 96 97 /** 98 * Unicode "Right-To-Left Embedding" (RLE) character. 
99 */ 100 private static final char RLE = '\u202B'; 101 102 /** 103 * Unicode "Pop Directional Formatting" (PDF) character. 104 */ 105 private static final char PDF = '\u202C'; 106 107 /** 108 * Unicode "Left-To-Right Mark" (LRM) character. 109 */ 110 private static final char LRM = '\u200E'; 111 112 /* 113 * Unicode "Right-To-Left Mark" (RLM) character. 114 */ 115 private static final char RLM = '\u200F'; 116 117 /* 118 * String representation of LRM 119 */ 120 private static final String LRM_STRING = Character.toString(LRM); 121 122 /* 123 * String representation of RLM 124 */ 125 private static final String RLM_STRING = Character.toString(RLM); 126 127 /** 128 * Empty string constant. 129 */ 130 private static final String EMPTY_STRING = ""; 131 132 /** 133 * A class for building a BidiFormatter with non-default options. 134 */ 135 public static final class Builder { 136 private boolean mIsRtlContext; 137 private int mFlags; 138 private TextDirectionHeuristic mTextDirectionHeuristic; 139 140 /** 141 * Constructor. 142 * 143 */ 144 public Builder() { 145 initialize(isRtlLocale(Locale.getDefault())); 146 } 147 148 /** 149 * Constructor. 150 * 151 * @param rtlContext Whether the context directionality is RTL. 152 */ 153 public Builder(boolean rtlContext) { 154 initialize(rtlContext); 155 } 156 157 /** 158 * Constructor. 159 * 160 * @param locale The context locale. 161 */ 162 public Builder(Locale locale) { 163 initialize(isRtlLocale(locale)); 164 } 165 166 /** 167 * Initializes the builder with the given context directionality and default options. 168 * 169 * @param isRtlContext Whether the context is RTL or not. 170 */ 171 private void initialize(boolean isRtlContext) { 172 mIsRtlContext = isRtlContext; 173 mTextDirectionHeuristic = DEFAULT_TEXT_DIRECTION_HEURISTIC; 174 mFlags = DEFAULT_FLAGS; 175 } 176 177 /** 178 * Specifies whether the BidiFormatter to be built should also "reset" directionality before 179 * a string being bidi-wrapped, not just after it. 
The default is true. 180 */ 181 public Builder stereoReset(boolean stereoReset) { 182 if (stereoReset) { 183 mFlags |= FLAG_STEREO_RESET; 184 } else { 185 mFlags &= ~FLAG_STEREO_RESET; 186 } 187 return this; 188 } 189 190 /** 191 * Specifies the default directionality estimation algorithm to be used by the BidiFormatter. 192 * By default, uses the first-strong heuristic. 193 * 194 * @param heuristic the {@code TextDirectionHeuristic} to use. 195 * @return the builder itself. 196 */ 197 public Builder setTextDirectionHeuristic(TextDirectionHeuristic heuristic) { 198 mTextDirectionHeuristic = heuristic; 199 return this; 200 } 201 202 /** 203 * @return A BidiFormatter with the specified options. 204 */ 205 public BidiFormatter build() { 206 if (mFlags == DEFAULT_FLAGS && 207 mTextDirectionHeuristic == DEFAULT_TEXT_DIRECTION_HEURISTIC) { 208 return BidiFormatter.getDefaultInstanceFromContext(mIsRtlContext); 209 } 210 return new BidiFormatter(mIsRtlContext, mFlags, mTextDirectionHeuristic); 211 } 212 } 213 214 // 215 private static final int FLAG_STEREO_RESET = 2; 216 private static final int DEFAULT_FLAGS = FLAG_STEREO_RESET; 217 218 private static final BidiFormatter DEFAULT_LTR_INSTANCE = new BidiFormatter( 219 false /* LTR context */, 220 DEFAULT_FLAGS, 221 DEFAULT_TEXT_DIRECTION_HEURISTIC); 222 223 private static final BidiFormatter DEFAULT_RTL_INSTANCE = new BidiFormatter( 224 true /* RTL context */, 225 DEFAULT_FLAGS, 226 DEFAULT_TEXT_DIRECTION_HEURISTIC); 227 228 private final boolean mIsRtlContext; 229 private final int mFlags; 230 private final TextDirectionHeuristic mDefaultTextDirectionHeuristic; 231 232 /** 233 * Factory for creating an instance of BidiFormatter for the default locale directionality. 234 * 235 * This does not create any new objects, and returns already existing static instances. 
236 * 237 */ 238 public static BidiFormatter getInstance() { 239 return getDefaultInstanceFromContext(isRtlLocale(Locale.getDefault())); 240 } 241 242 /** 243 * Factory for creating an instance of BidiFormatter given the context directionality. 244 * 245 * This does not create any new objects, and returns already existing static instances. 246 * 247 * @param rtlContext Whether the context directionality is RTL. 248 */ 249 public static BidiFormatter getInstance(boolean rtlContext) { 250 return getDefaultInstanceFromContext(rtlContext); 251 } 252 253 /** 254 * Factory for creating an instance of BidiFormatter given the context locale. 255 * 256 * This does not create any new objects, and returns already existing static instances. 257 * 258 * @param locale The context locale. 259 */ 260 public static BidiFormatter getInstance(Locale locale) { 261 return getDefaultInstanceFromContext(isRtlLocale(locale)); 262 } 263 264 /** 265 * @param isRtlContext Whether the context directionality is RTL or not. 266 * @param flags The option flags. 267 * @param heuristic The default text direction heuristic. 268 */ 269 private BidiFormatter(boolean isRtlContext, int flags, TextDirectionHeuristic heuristic) { 270 mIsRtlContext = isRtlContext; 271 mFlags = flags; 272 mDefaultTextDirectionHeuristic = heuristic; 273 } 274 275 /** 276 * @return Whether the context directionality is RTL 277 */ 278 public boolean isRtlContext() { 279 return mIsRtlContext; 280 } 281 282 /** 283 * @return Whether directionality "reset" should also be done before a string being 284 * bidi-wrapped, not just after it. 285 */ 286 public boolean getStereoReset() { 287 return (mFlags & FLAG_STEREO_RESET) != 0; 288 } 289 290 /** 291 * Returns a Unicode bidi mark matching the context directionality (LRM or RLM) if either the 292 * overall or the exit directionality of a given string is opposite to the context directionality. 
293 * Putting this after the string (including its directionality declaration wrapping) prevents it 294 * from "sticking" to other opposite-directionality text or a number appearing after it inline 295 * with only neutral content in between. Otherwise returns the empty string. While the exit 296 * directionality is determined by scanning the end of the string, the overall directionality is 297 * given explicitly by a heuristic to estimate the {@code str}'s directionality. 298 * 299 * @param str CharSequence after which the mark may need to appear. 300 * @param heuristic The text direction heuristic that will be used to estimate the {@code str}'s 301 * directionality. 302 * @return LRM for RTL text in LTR context; RLM for LTR text in RTL context; 303 * else, the empty string. 304 * 305 * @hide 306 */ 307 public String markAfter(CharSequence str, TextDirectionHeuristic heuristic) { 308 final boolean isRtl = heuristic.isRtl(str, 0, str.length()); 309 // getExitDir() is called only if needed (short-circuit). 310 if (!mIsRtlContext && (isRtl || getExitDir(str) == DIR_RTL)) { 311 return LRM_STRING; 312 } 313 if (mIsRtlContext && (!isRtl || getExitDir(str) == DIR_LTR)) { 314 return RLM_STRING; 315 } 316 return EMPTY_STRING; 317 } 318 319 /** 320 * Returns a Unicode bidi mark matching the context directionality (LRM or RLM) if either the 321 * overall or the entry directionality of a given string is opposite to the context 322 * directionality. Putting this before the string (including its directionality declaration 323 * wrapping) prevents it from "sticking" to other opposite-directionality text appearing before 324 * it inline with only neutral content in between. Otherwise returns the empty string. While the 325 * entry directionality is determined by scanning the beginning of the string, the overall 326 * directionality is given explicitly by a heuristic to estimate the {@code str}'s directionality. 
327 * 328 * @param str CharSequence before which the mark may need to appear. 329 * @param heuristic The text direction heuristic that will be used to estimate the {@code str}'s 330 * directionality. 331 * @return LRM for RTL text in LTR context; RLM for LTR text in RTL context; 332 * else, the empty string. 333 * 334 * @hide 335 */ 336 public String markBefore(CharSequence str, TextDirectionHeuristic heuristic) { 337 final boolean isRtl = heuristic.isRtl(str, 0, str.length()); 338 // getEntryDir() is called only if needed (short-circuit). 339 if (!mIsRtlContext && (isRtl || getEntryDir(str) == DIR_RTL)) { 340 return LRM_STRING; 341 } 342 if (mIsRtlContext && (!isRtl || getEntryDir(str) == DIR_LTR)) { 343 return RLM_STRING; 344 } 345 return EMPTY_STRING; 346 } 347 348 /** 349 * Estimates the directionality of a string using the default text direction heuristic. 350 * 351 * @param str String whose directionality is to be estimated. 352 * @return true if {@code str}'s estimated overall directionality is RTL. Otherwise returns 353 * false. 354 */ 355 public boolean isRtl(String str) { 356 return isRtl((CharSequence) str); 357 } 358 359 /** 360 * Operates like {@link #isRtl(String)}, but takes a CharSequence instead of a string 361 * 362 * @param str CharSequence whose directionality is to be estimated. 363 * @return true if {@code str}'s estimated overall directionality is RTL. Otherwise returns 364 * false. 365 */ 366 public boolean isRtl(CharSequence str) { 367 return mDefaultTextDirectionHeuristic.isRtl(str, 0, str.length()); 368 } 369 370 /** 371 * Formats a string of given directionality for use in plain-text output of the context 372 * directionality, so an opposite-directionality string is neither garbled nor garbles its 373 * surroundings. This makes use of Unicode bidi formatting characters. 
374 * <p> 375 * The algorithm: In case the given directionality doesn't match the context directionality, wraps 376 * the string with Unicode bidi formatting characters: RLE+{@code str}+PDF for RTL text, or 377 * LRE+{@code str}+PDF for LTR text. 378 * <p> 379 * If {@code isolate}, directionally isolates the string so that it does not garble its 380 * surroundings. Currently, this is done by "resetting" the directionality after the string by 381 * appending a trailing Unicode bidi mark matching the context directionality (LRM or RLM) when 382 * either the overall directionality or the exit directionality of the string is opposite to 383 * that of the context. Unless the formatter was built using 384 * {@link Builder#stereoReset(boolean)} with a {@code false} argument, also prepends a Unicode 385 * bidi mark matching the context directionality when either the overall directionality or the 386 * entry directionality of the string is opposite to that of the context. Note that as opposed 387 * to the overall directionality, the entry and exit directionalities are determined from the 388 * string itself. 389 * <p> 390 * Does *not* do HTML-escaping. 391 * 392 * @param str The input string. 393 * @param heuristic The algorithm to be used to estimate the string's overall direction. 394 * See {@link TextDirectionHeuristics} for pre-defined heuristics. 395 * @param isolate Whether to directionally isolate the string to prevent it from garbling the 396 * content around it 397 * @return Input string after applying the above processing. {@code null} if {@code str} is 398 * {@code null}. 
399 */ 400 public @Nullable String unicodeWrap(@Nullable String str, TextDirectionHeuristic heuristic, 401 boolean isolate) { 402 if (str == null) return null; 403 return unicodeWrap((CharSequence) str, heuristic, isolate).toString(); 404 } 405 406 /** 407 * Operates like {@link #unicodeWrap(String, TextDirectionHeuristic, boolean)}, but takes a 408 * CharSequence instead of a string 409 * 410 * @param str The input CharSequence. 411 * @param heuristic The algorithm to be used to estimate the CharSequence's overall direction. 412 * See {@link TextDirectionHeuristics} for pre-defined heuristics. 413 * @param isolate Whether to directionally isolate the CharSequence to prevent it from garbling 414 * the content around it 415 * @return Input CharSequence after applying the above processing. {@code null} if {@code str} 416 * is {@code null}. 417 */ 418 public @Nullable CharSequence unicodeWrap(@Nullable CharSequence str, 419 TextDirectionHeuristic heuristic, boolean isolate) { 420 if (str == null) return null; 421 final boolean isRtl = heuristic.isRtl(str, 0, str.length()); 422 SpannableStringBuilder result = new SpannableStringBuilder(); 423 if (getStereoReset() && isolate) { 424 result.append(markBefore(str, 425 isRtl ? TextDirectionHeuristics.RTL : TextDirectionHeuristics.LTR)); 426 } 427 if (isRtl != mIsRtlContext) { 428 result.append(isRtl ? RLE : LRE); 429 result.append(str); 430 result.append(PDF); 431 } else { 432 result.append(str); 433 } 434 if (isolate) { 435 result.append(markAfter(str, 436 isRtl ? TextDirectionHeuristics.RTL : TextDirectionHeuristics.LTR)); 437 } 438 return result; 439 } 440 441 /** 442 * Operates like {@link #unicodeWrap(String, TextDirectionHeuristic, boolean)}, but assumes 443 * {@code isolate} is true. 444 * 445 * @param str The input string. 446 * @param heuristic The algorithm to be used to estimate the string's overall direction. 447 * See {@link TextDirectionHeuristics} for pre-defined heuristics. 
448 * @return Input string after applying the above processing. 449 */ 450 public String unicodeWrap(String str, TextDirectionHeuristic heuristic) { 451 return unicodeWrap(str, heuristic, true /* isolate */); 452 } 453 454 /** 455 * Operates like {@link #unicodeWrap(CharSequence, TextDirectionHeuristic, boolean)}, but 456 * assumes {@code isolate} is true. 457 * 458 * @param str The input CharSequence. 459 * @param heuristic The algorithm to be used to estimate the CharSequence's overall direction. 460 * See {@link TextDirectionHeuristics} for pre-defined heuristics. 461 * @return Input CharSequence after applying the above processing. 462 */ 463 public CharSequence unicodeWrap(CharSequence str, TextDirectionHeuristic heuristic) { 464 return unicodeWrap(str, heuristic, true /* isolate */); 465 } 466 467 468 /** 469 * Operates like {@link #unicodeWrap(String, TextDirectionHeuristic, boolean)}, but uses the 470 * formatter's default direction estimation algorithm. 471 * 472 * @param str The input string. 473 * @param isolate Whether to directionally isolate the string to prevent it from garbling the 474 * content around it 475 * @return Input string after applying the above processing. 476 */ 477 public String unicodeWrap(String str, boolean isolate) { 478 return unicodeWrap(str, mDefaultTextDirectionHeuristic, isolate); 479 } 480 481 /** 482 * Operates like {@link #unicodeWrap(CharSequence, TextDirectionHeuristic, boolean)}, but uses 483 * the formatter's default direction estimation algorithm. 484 * 485 * @param str The input CharSequence. 486 * @param isolate Whether to directionally isolate the CharSequence to prevent it from garbling 487 * the content around it 488 * @return Input CharSequence after applying the above processing. 
489 */ 490 public CharSequence unicodeWrap(CharSequence str, boolean isolate) { 491 return unicodeWrap(str, mDefaultTextDirectionHeuristic, isolate); 492 } 493 494 /** 495 * Operates like {@link #unicodeWrap(String, TextDirectionHeuristic, boolean)}, but uses the 496 * formatter's default direction estimation algorithm and assumes {@code isolate} is true. 497 * 498 * @param str The input string. 499 * @return Input string after applying the above processing. 500 */ 501 public String unicodeWrap(String str) { 502 return unicodeWrap(str, mDefaultTextDirectionHeuristic, true /* isolate */); 503 } 504 505 /** 506 * Operates like {@link #unicodeWrap(CharSequence, TextDirectionHeuristic, boolean)}, but uses 507 * the formatter's default direction estimation algorithm and assumes {@code isolate} is true. 508 * 509 * @param str The input CharSequence. 510 * @return Input CharSequence after applying the above processing. 511 */ 512 public CharSequence unicodeWrap(CharSequence str) { 513 return unicodeWrap(str, mDefaultTextDirectionHeuristic, true /* isolate */); 514 } 515 516 private static BidiFormatter getDefaultInstanceFromContext(boolean isRtlContext) { 517 return isRtlContext ? DEFAULT_RTL_INSTANCE : DEFAULT_LTR_INSTANCE; 518 } 519 520 /** 521 * Helper method to return true if the Locale directionality is RTL. 522 * 523 * @param locale The Locale whose directionality will be checked to be RTL or LTR 524 * @return true if the {@code locale} directionality is RTL. False otherwise. 525 */ 526 private static boolean isRtlLocale(Locale locale) { 527 return (TextUtils.getLayoutDirectionFromLocale(locale) == View.LAYOUT_DIRECTION_RTL); 528 } 529 530 /** 531 * Enum for directionality type. 
532 */ 533 private static final int DIR_LTR = -1; 534 private static final int DIR_UNKNOWN = 0; 535 private static final int DIR_RTL = +1; 536 537 /** 538 * Returns the directionality of the last character with strong directionality in the string, or 539 * DIR_UNKNOWN if none was encountered. For efficiency, actually scans backwards from the end of 540 * the string. Treats a non-BN character between an LRE/RLE/LRO/RLO and its matching PDF as a 541 * strong character, LTR after LRE/LRO, and RTL after RLE/RLO. The results are undefined for a 542 * string containing unbalanced LRE/RLE/LRO/RLO/PDF characters. The intended use is to check 543 * whether a logically separate item that starts with a number or a character of the string's 544 * exit directionality and follows this string inline (not counting any neutral characters in 545 * between) would "stick" to it in an opposite-directionality context, thus being displayed in 546 * an incorrect position. An LRM or RLM character (the one of the context's directionality) 547 * between the two will prevent such sticking. 548 * 549 * @param str the string to check. 550 */ 551 private static int getExitDir(CharSequence str) { 552 return new DirectionalityEstimator(str, false /* isHtml */).getExitDir(); 553 } 554 555 /** 556 * Returns the directionality of the first character with strong directionality in the string, 557 * or DIR_UNKNOWN if none was encountered. Treats a non-BN character between an 558 * LRE/RLE/LRO/RLO and its matching PDF as a strong character, LTR after LRE/LRO, and RTL after 559 * RLE/RLO. The results are undefined for a string containing unbalanced LRE/RLE/LRO/RLO/PDF 560 * characters. 
The intended use is to check whether a logically separate item that ends with a 561 * character of the string's entry directionality and precedes the string inline (not counting 562 * any neutral characters in between) would "stick" to it in an opposite-directionality context, 563 * thus being displayed in an incorrect position. An LRM or RLM character (the one of the 564 * context's directionality) between the two will prevent such sticking. 565 * 566 * @param str the string to check. 567 */ 568 private static int getEntryDir(CharSequence str) { 569 return new DirectionalityEstimator(str, false /* isHtml */).getEntryDir(); 570 } 571 572 /** 573 * An object that estimates the directionality of a given string by various methods. 574 * 575 * @hide 576 */ 577 @VisibleForTesting 578 public static class DirectionalityEstimator { 579 580 // Internal static variables and constants. 581 582 /** 583 * Size of the bidi character class cache. The results of the Character.getDirectionality() 584 * calls on the lowest DIR_TYPE_CACHE_SIZE codepoints are kept in an array for speed. 585 * The 0x700 value is designed to leave all the European and Near Eastern languages in the 586 * cache. It can be reduced to 0x180, restricting the cache to the Western European 587 * languages. 588 */ 589 private static final int DIR_TYPE_CACHE_SIZE = 0x700; 590 591 /** 592 * The bidi character class cache. 593 */ 594 private static final byte DIR_TYPE_CACHE[]; 595 596 static { 597 DIR_TYPE_CACHE = new byte[DIR_TYPE_CACHE_SIZE]; 598 for (int i = 0; i < DIR_TYPE_CACHE_SIZE; i++) { 599 // Calling Character.getDirectionality() is OK here, since new emojis start after 600 // the end of our cache. 601 DIR_TYPE_CACHE[i] = Character.getDirectionality(i); 602 } 603 } 604 605 /** 606 * Return Character directionality. Same as {@link Character#getDirectionality(int)} except 607 * it overrides values for newest emoji that are not covered by ICU. 
608 */ 609 public static byte getDirectionality(int codePoint) { 610 if (Emoji.isNewEmoji(codePoint)) { 611 // TODO: Fix or remove once emoji-data.text 5.0 is in ICU or update to 6.0. 612 return Character.DIRECTIONALITY_OTHER_NEUTRALS; 613 } else { 614 return Character.getDirectionality(codePoint); 615 } 616 } 617 618 // Internal instance variables. 619 620 /** 621 * The text to be scanned. 622 */ 623 private final CharSequence text; 624 625 /** 626 * Whether the text to be scanned is to be treated as HTML, i.e. skipping over tags and 627 * entities when looking for the next / preceding dir type. 628 */ 629 private final boolean isHtml; 630 631 /** 632 * The length of the text in chars. 633 */ 634 private final int length; 635 636 /** 637 * The current position in the text. 638 */ 639 private int charIndex; 640 641 /** 642 * The char encountered by the last dirTypeForward or dirTypeBackward call. If it 643 * encountered a supplementary codepoint, this contains a char that is not a valid 644 * codepoint. This is ok, because this member is only used to detect some well-known ASCII 645 * syntax, e.g. "http://" and the beginning of an HTML tag or entity. 646 */ 647 private char lastChar; 648 649 /** 650 * Constructor. 651 * 652 * @param text The string to scan. 653 * @param isHtml Whether the text to be scanned is to be treated as HTML, i.e. skipping over 654 * tags and entities. 655 */ 656 DirectionalityEstimator(CharSequence text, boolean isHtml) { 657 this.text = text; 658 this.isHtml = isHtml; 659 length = text.length(); 660 } 661 662 /** 663 * Returns the directionality of the first character with strong directionality in the 664 * string, or DIR_UNKNOWN if none was encountered. Treats a non-BN character between an 665 * LRE/RLE/LRO/RLO and its matching PDF as a strong character, LTR after LRE/LRO, and RTL 666 * after RLE/RLO. The results are undefined for a string containing unbalanced 667 * LRE/RLE/LRO/RLO/PDF characters. 
668 */ 669 int getEntryDir() { 670 // The reason for this method name, as opposed to getFirstStrongDir(), is that 671 // "first strong" is a commonly used description of Unicode's estimation algorithm, 672 // but the two must treat formatting characters quite differently. Thus, we are staying 673 // away from both "first" and "last" in these method names to avoid confusion. 674 charIndex = 0; 675 int embeddingLevel = 0; 676 int embeddingLevelDir = DIR_UNKNOWN; 677 int firstNonEmptyEmbeddingLevel = 0; 678 while (charIndex < length && firstNonEmptyEmbeddingLevel == 0) { 679 switch (dirTypeForward()) { 680 case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING: 681 case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE: 682 ++embeddingLevel; 683 embeddingLevelDir = DIR_LTR; 684 break; 685 case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING: 686 case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE: 687 ++embeddingLevel; 688 embeddingLevelDir = DIR_RTL; 689 break; 690 case Character.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT: 691 --embeddingLevel; 692 // To restore embeddingLevelDir to its previous value, we would need a 693 // stack, which we want to avoid. Thus, at this point we do not know the 694 // current embedding's directionality. 
695 embeddingLevelDir = DIR_UNKNOWN; 696 break; 697 case Character.DIRECTIONALITY_BOUNDARY_NEUTRAL: 698 break; 699 case Character.DIRECTIONALITY_LEFT_TO_RIGHT: 700 if (embeddingLevel == 0) { 701 return DIR_LTR; 702 } 703 firstNonEmptyEmbeddingLevel = embeddingLevel; 704 break; 705 case Character.DIRECTIONALITY_RIGHT_TO_LEFT: 706 case Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC: 707 if (embeddingLevel == 0) { 708 return DIR_RTL; 709 } 710 firstNonEmptyEmbeddingLevel = embeddingLevel; 711 break; 712 default: 713 firstNonEmptyEmbeddingLevel = embeddingLevel; 714 break; 715 } 716 } 717 718 // We have either found a non-empty embedding or scanned the entire string finding 719 // neither a non-empty embedding nor a strong character outside of an embedding. 720 if (firstNonEmptyEmbeddingLevel == 0) { 721 // We have not found a non-empty embedding. Thus, the string contains neither a 722 // non-empty embedding nor a strong character outside of an embedding. 723 return DIR_UNKNOWN; 724 } 725 726 // We have found a non-empty embedding. 727 if (embeddingLevelDir != DIR_UNKNOWN) { 728 // We know the directionality of the non-empty embedding. 729 return embeddingLevelDir; 730 } 731 732 // We do not remember the directionality of the non-empty embedding we found. So, we go 733 // backwards to find the start of the non-empty embedding and get its directionality. 
734 while (charIndex > 0) { 735 switch (dirTypeBackward()) { 736 case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING: 737 case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE: 738 if (firstNonEmptyEmbeddingLevel == embeddingLevel) { 739 return DIR_LTR; 740 } 741 --embeddingLevel; 742 break; 743 case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING: 744 case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE: 745 if (firstNonEmptyEmbeddingLevel == embeddingLevel) { 746 return DIR_RTL; 747 } 748 --embeddingLevel; 749 break; 750 case Character.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT: 751 ++embeddingLevel; 752 break; 753 } 754 } 755 // We should never get here. 756 return DIR_UNKNOWN; 757 } 758 759 /** 760 * Returns the directionality of the last character with strong directionality in the 761 * string, or DIR_UNKNOWN if none was encountered. For efficiency, actually scans backwards 762 * from the end of the string. Treats a non-BN character between an LRE/RLE/LRO/RLO and its 763 * matching PDF as a strong character, LTR after LRE/LRO, and RTL after RLE/RLO. The results 764 * are undefined for a string containing unbalanced LRE/RLE/LRO/RLO/PDF characters. 765 */ 766 int getExitDir() { 767 // The reason for this method name, as opposed to getLastStrongDir(), is that "last 768 // strong" sounds like the exact opposite of "first strong", which is a commonly used 769 // description of Unicode's estimation algorithm (getUnicodeDir() above), but the two 770 // must treat formatting characters quite differently. Thus, we are staying away from 771 // both "first" and "last" in these method names to avoid confusion. 
772 charIndex = length; 773 int embeddingLevel = 0; 774 int lastNonEmptyEmbeddingLevel = 0; 775 while (charIndex > 0) { 776 switch (dirTypeBackward()) { 777 case Character.DIRECTIONALITY_LEFT_TO_RIGHT: 778 if (embeddingLevel == 0) { 779 return DIR_LTR; 780 } 781 if (lastNonEmptyEmbeddingLevel == 0) { 782 lastNonEmptyEmbeddingLevel = embeddingLevel; 783 } 784 break; 785 case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING: 786 case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE: 787 if (lastNonEmptyEmbeddingLevel == embeddingLevel) { 788 return DIR_LTR; 789 } 790 --embeddingLevel; 791 break; 792 case Character.DIRECTIONALITY_RIGHT_TO_LEFT: 793 case Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC: 794 if (embeddingLevel == 0) { 795 return DIR_RTL; 796 } 797 if (lastNonEmptyEmbeddingLevel == 0) { 798 lastNonEmptyEmbeddingLevel = embeddingLevel; 799 } 800 break; 801 case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING: 802 case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE: 803 if (lastNonEmptyEmbeddingLevel == embeddingLevel) { 804 return DIR_RTL; 805 } 806 --embeddingLevel; 807 break; 808 case Character.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT: 809 ++embeddingLevel; 810 break; 811 case Character.DIRECTIONALITY_BOUNDARY_NEUTRAL: 812 break; 813 default: 814 if (lastNonEmptyEmbeddingLevel == 0) { 815 lastNonEmptyEmbeddingLevel = embeddingLevel; 816 } 817 break; 818 } 819 } 820 return DIR_UNKNOWN; 821 } 822 823 // Internal methods 824 825 /** 826 * Gets the bidi character class, i.e. Character.getDirectionality(), of a given char, using 827 * a cache for speed. Not designed for supplementary codepoints, whose results we do not 828 * cache. 829 */ 830 private static byte getCachedDirectionality(char c) { 831 return c < DIR_TYPE_CACHE_SIZE ? DIR_TYPE_CACHE[c] : getDirectionality(c); 832 } 833 834 /** 835 * Returns the Character.DIRECTIONALITY_... value of the next codepoint and advances 836 * charIndex. 
If isHtml, and the codepoint is '<' or '&', advances through the tag/entity, 837 * and returns Character.DIRECTIONALITY_WHITESPACE. For an entity, it would be best to 838 * figure out the actual character, and return its dirtype, but treating it as whitespace is 839 * good enough for our purposes. 840 * 841 * @throws java.lang.IndexOutOfBoundsException if called when charIndex >= length or < 0. 842 */ 843 byte dirTypeForward() { 844 lastChar = text.charAt(charIndex); 845 if (Character.isHighSurrogate(lastChar)) { 846 int codePoint = Character.codePointAt(text, charIndex); 847 charIndex += Character.charCount(codePoint); 848 return getDirectionality(codePoint); 849 } 850 charIndex++; 851 byte dirType = getCachedDirectionality(lastChar); 852 if (isHtml) { 853 // Process tags and entities. 854 if (lastChar == '<') { 855 dirType = skipTagForward(); 856 } else if (lastChar == '&') { 857 dirType = skipEntityForward(); 858 } 859 } 860 return dirType; 861 } 862 863 /** 864 * Returns the Character.DIRECTIONALITY_... value of the preceding codepoint and advances 865 * charIndex backwards. If isHtml, and the codepoint is the end of a complete HTML tag or 866 * entity, advances over the whole tag/entity and returns 867 * Character.DIRECTIONALITY_WHITESPACE. For an entity, it would be best to figure out the 868 * actual character, and return its dirtype, but treating it as whitespace is good enough 869 * for our purposes. 870 * 871 * @throws java.lang.IndexOutOfBoundsException if called when charIndex > length or <= 0. 872 */ 873 byte dirTypeBackward() { 874 lastChar = text.charAt(charIndex - 1); 875 if (Character.isLowSurrogate(lastChar)) { 876 int codePoint = Character.codePointBefore(text, charIndex); 877 charIndex -= Character.charCount(codePoint); 878 return getDirectionality(codePoint); 879 } 880 charIndex--; 881 byte dirType = getCachedDirectionality(lastChar); 882 if (isHtml) { 883 // Process tags and entities. 
884 if (lastChar == '>') { 885 dirType = skipTagBackward(); 886 } else if (lastChar == ';') { 887 dirType = skipEntityBackward(); 888 } 889 } 890 return dirType; 891 } 892 893 /** 894 * Advances charIndex forward through an HTML tag (after the opening < has already been 895 * read) and returns Character.DIRECTIONALITY_WHITESPACE. If there is no matching >, 896 * does not change charIndex and returns Character.DIRECTIONALITY_OTHER_NEUTRALS (for the 897 * < that hadn't been part of a tag after all). 898 */ 899 private byte skipTagForward() { 900 int initialCharIndex = charIndex; 901 while (charIndex < length) { 902 lastChar = text.charAt(charIndex++); 903 if (lastChar == '>') { 904 // The end of the tag. 905 return Character.DIRECTIONALITY_WHITESPACE; 906 } 907 if (lastChar == '"' || lastChar == '\'') { 908 // Skip over a quoted attribute value inside the tag. 909 char quote = lastChar; 910 while (charIndex < length && (lastChar = text.charAt(charIndex++)) != quote) {} 911 } 912 } 913 // The original '<' wasn't the start of a tag after all. 914 charIndex = initialCharIndex; 915 lastChar = '<'; 916 return Character.DIRECTIONALITY_OTHER_NEUTRALS; 917 } 918 919 /** 920 * Advances charIndex backward through an HTML tag (after the closing > has already been 921 * read) and returns Character.DIRECTIONALITY_WHITESPACE. If there is no matching <, does 922 * not change charIndex and returns Character.DIRECTIONALITY_OTHER_NEUTRALS (for the > 923 * that hadn't been part of a tag after all). Nevertheless, the running time for calling 924 * skipTagBackward() in a loop remains linear in the size of the text, even for a text like 925 * ">>>>", because skipTagBackward() also stops looking for a matching < 926 * when it encounters another >. 927 */ 928 private byte skipTagBackward() { 929 int initialCharIndex = charIndex; 930 while (charIndex > 0) { 931 lastChar = text.charAt(--charIndex); 932 if (lastChar == '<') { 933 // The start of the tag. 
934 return Character.DIRECTIONALITY_WHITESPACE; 935 } 936 if (lastChar == '>') { 937 break; 938 } 939 if (lastChar == '"' || lastChar == '\'') { 940 // Skip over a quoted attribute value inside the tag. 941 char quote = lastChar; 942 while (charIndex > 0 && (lastChar = text.charAt(--charIndex)) != quote) {} 943 } 944 } 945 // The original '>' wasn't the end of a tag after all. 946 charIndex = initialCharIndex; 947 lastChar = '>'; 948 return Character.DIRECTIONALITY_OTHER_NEUTRALS; 949 } 950 951 /** 952 * Advances charIndex forward through an HTML character entity tag (after the opening 953 * & has already been read) and returns Character.DIRECTIONALITY_WHITESPACE. It would be 954 * best to figure out the actual character and return its dirtype, but this is good enough. 955 */ 956 private byte skipEntityForward() { 957 while (charIndex < length && (lastChar = text.charAt(charIndex++)) != ';') {} 958 return Character.DIRECTIONALITY_WHITESPACE; 959 } 960 961 /** 962 * Advances charIndex backward through an HTML character entity tag (after the closing ; 963 * has already been read) and returns Character.DIRECTIONALITY_WHITESPACE. It would be best 964 * to figure out the actual character and return its dirtype, but this is good enough. 965 * If there is no matching &, does not change charIndex and returns 966 * Character.DIRECTIONALITY_OTHER_NEUTRALS (for the ';' that did not start an entity after 967 * all). Nevertheless, the running time for calling skipEntityBackward() in a loop remains 968 * linear in the size of the text, even for a text like ";;;;;;;", because skipTagBackward() 969 * also stops looking for a matching & when it encounters another ;. 
970 */ 971 private byte skipEntityBackward() { 972 int initialCharIndex = charIndex; 973 while (charIndex > 0) { 974 lastChar = text.charAt(--charIndex); 975 if (lastChar == '&') { 976 return Character.DIRECTIONALITY_WHITESPACE; 977 } 978 if (lastChar == ';') { 979 break; 980 } 981 } 982 charIndex = initialCharIndex; 983 lastChar = ';'; 984 return Character.DIRECTIONALITY_OTHER_NEUTRALS; 985 } 986 } 987} 988