BidiFormatter.java revision c42363ad309d523d65fe8b66d16786a1d372805e
1/* 2 * Copyright (C) 2013 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17package android.text; 18 19import android.view.View; 20 21import static android.text.TextDirectionHeuristics.FIRSTSTRONG_LTR; 22 23import java.util.Locale; 24 25/** 26 * Utility class for formatting text for display in a potentially opposite-directionality context 27 * without garbling. The directionality of the context is set at formatter creation and the 28 * directionality of the text can be either estimated or passed in when known. 29 * 30 * <p>To support versions lower than {@link android.os.Build.VERSION_CODES#JELLY_BEAN_MR2}, 31 * you can use the support library's {@link android.support.v4.text.BidiFormatter} class. 32 * 33 * <p>These APIs provide the following functionality: 34 * <p> 35 * 1. Bidi Wrapping 36 * When text in one language is mixed into a document in another, opposite-directionality language, 37 * e.g. when an English business name is embedded in some Hebrew text, both the inserted string 38 * and the text surrounding it may be displayed incorrectly unless the inserted string is explicitly 39 * separated from the surrounding text in a "wrapper" that: 40 * <p> 41 * - Declares its directionality so that the string is displayed correctly. This can be done in 42 * Unicode bidi formatting codes by {@link #unicodeWrap} and similar methods. 
43 * <p> 44 * - Isolates the string's directionality, so it does not unduly affect the surrounding content. 45 * Currently, this can only be done using invisible Unicode characters of the same direction as 46 * the context (LRM or RLM) in addition to the directionality declaration above, thus "resetting" 47 * the directionality to that of the context. The "reset" may need to be done at both ends of the 48 * string. Without "reset" after the string, the string will "stick" to a number or logically 49 * separate opposite-direction text that happens to follow it in-line (even if separated by 50 * neutral content like spaces and punctuation). Without "reset" before the string, the same can 51 * happen there, but only with more opposite-direction text, not a number. One approach is to 52 * "reset" the direction only after each string, on the theory that if the preceding opposite- 53 * direction text is itself bidi-wrapped, the "reset" after it will prevent the sticking. (Doing 54 * the "reset" only before each string definitely does not work because we do not want to require 55 * bidi-wrapping numbers, and a bidi-wrapped opposite-direction string could be followed by a 56 * number.) Still, the safest policy is to do the "reset" on both ends of each string, since RTL 57 * message translations often contain untranslated Latin-script brand names and technical terms, 58 * and one of these can be followed by a bidi-wrapped inserted value. On the other hand, when one 59 * has such a message, it is best to do the "reset" manually in the message translation itself, 60 * since the message's opposite-direction text could be followed by an inserted number, which we 61 * would not bidi-wrap anyway. Thus, "reset" only after the string is the current default. In an 62 * alternative to "reset", recent additions to the HTML, CSS, and Unicode standards allow the 63 * isolation to be part of the directionality declaration. 
This form of isolation is better than 64 * "reset" because it takes less space, does not require knowing the context directionality, has a 65 * gentler effect than "reset", and protects both ends of the string. However, we do not yet allow 66 * using it because required platforms do not yet support it. 67 * <p> 68 * Providing these wrapping services is the basic purpose of the bidi formatter. 69 * <p> 70 * 2. Directionality estimation 71 * How does one know whether a string about to be inserted into surrounding text has the same 72 * directionality? Well, in many cases, one knows that this must be the case when writing the code 73 * doing the insertion, e.g. when a localized message is inserted into a localized page. In such 74 * cases there is no need to involve the bidi formatter at all. In some other cases, it need not be 75 * the same as the context, but is either constant (e.g. urls are always LTR) or otherwise known. 76 * In the remaining cases, e.g. when the string is user-entered or comes from a database, the 77 * language of the string (and thus its directionality) is not known a priori, and must be 78 * estimated at run-time. The bidi formatter can do this automatically using the default 79 * first-strong estimation algorithm. It can also be configured to use a custom directionality 80 * estimation object. 81 */ 82public final class BidiFormatter { 83 84 /** 85 * The default text direction heuristic. 86 */ 87 private static TextDirectionHeuristic DEFAULT_TEXT_DIRECTION_HEURISTIC = FIRSTSTRONG_LTR; 88 89 /** 90 * Unicode "Left-To-Right Embedding" (LRE) character. 91 */ 92 private static final char LRE = '\u202A'; 93 94 /** 95 * Unicode "Right-To-Left Embedding" (RLE) character. 96 */ 97 private static final char RLE = '\u202B'; 98 99 /** 100 * Unicode "Pop Directional Formatting" (PDF) character. 101 */ 102 private static final char PDF = '\u202C'; 103 104 /** 105 * Unicode "Left-To-Right Mark" (LRM) character. 
106 */ 107 private static final char LRM = '\u200E'; 108 109 /* 110 * Unicode "Right-To-Left Mark" (RLM) character. 111 */ 112 private static final char RLM = '\u200F'; 113 114 /* 115 * String representation of LRM 116 */ 117 private static final String LRM_STRING = Character.toString(LRM); 118 119 /* 120 * String representation of RLM 121 */ 122 private static final String RLM_STRING = Character.toString(RLM); 123 124 /** 125 * Empty string constant. 126 */ 127 private static final String EMPTY_STRING = ""; 128 129 /** 130 * A class for building a BidiFormatter with non-default options. 131 */ 132 public static final class Builder { 133 private boolean mIsRtlContext; 134 private int mFlags; 135 private TextDirectionHeuristic mTextDirectionHeuristic; 136 137 /** 138 * Constructor. 139 * 140 */ 141 public Builder() { 142 initialize(isRtlLocale(Locale.getDefault())); 143 } 144 145 /** 146 * Constructor. 147 * 148 * @param rtlContext Whether the context directionality is RTL. 149 */ 150 public Builder(boolean rtlContext) { 151 initialize(rtlContext); 152 } 153 154 /** 155 * Constructor. 156 * 157 * @param locale The context locale. 158 */ 159 public Builder(Locale locale) { 160 initialize(isRtlLocale(locale)); 161 } 162 163 /** 164 * Initializes the builder with the given context directionality and default options. 165 * 166 * @param isRtlContext Whether the context is RTL or not. 167 */ 168 private void initialize(boolean isRtlContext) { 169 mIsRtlContext = isRtlContext; 170 mTextDirectionHeuristic = DEFAULT_TEXT_DIRECTION_HEURISTIC; 171 mFlags = DEFAULT_FLAGS; 172 } 173 174 /** 175 * Specifies whether the BidiFormatter to be built should also "reset" directionality before 176 * a string being bidi-wrapped, not just after it. The default is true. 
177 */ 178 public Builder stereoReset(boolean stereoReset) { 179 if (stereoReset) { 180 mFlags |= FLAG_STEREO_RESET; 181 } else { 182 mFlags &= ~FLAG_STEREO_RESET; 183 } 184 return this; 185 } 186 187 /** 188 * Specifies the default directionality estimation algorithm to be used by the BidiFormatter. 189 * By default, uses the first-strong heuristic. 190 * 191 * @param heuristic the {@code TextDirectionHeuristic} to use. 192 * @return the builder itself. 193 */ 194 public Builder setTextDirectionHeuristic(TextDirectionHeuristic heuristic) { 195 mTextDirectionHeuristic = heuristic; 196 return this; 197 } 198 199 /** 200 * @return A BidiFormatter with the specified options. 201 */ 202 public BidiFormatter build() { 203 if (mFlags == DEFAULT_FLAGS && 204 mTextDirectionHeuristic == DEFAULT_TEXT_DIRECTION_HEURISTIC) { 205 return BidiFormatter.getDefaultInstanceFromContext(mIsRtlContext); 206 } 207 return new BidiFormatter(mIsRtlContext, mFlags, mTextDirectionHeuristic); 208 } 209 } 210 211 // 212 private static final int FLAG_STEREO_RESET = 2; 213 private static final int DEFAULT_FLAGS = FLAG_STEREO_RESET; 214 215 private static final BidiFormatter DEFAULT_LTR_INSTANCE = new BidiFormatter( 216 false /* LTR context */, 217 DEFAULT_FLAGS, 218 DEFAULT_TEXT_DIRECTION_HEURISTIC); 219 220 private static final BidiFormatter DEFAULT_RTL_INSTANCE = new BidiFormatter( 221 true /* RTL context */, 222 DEFAULT_FLAGS, 223 DEFAULT_TEXT_DIRECTION_HEURISTIC); 224 225 private final boolean mIsRtlContext; 226 private final int mFlags; 227 private final TextDirectionHeuristic mDefaultTextDirectionHeuristic; 228 229 /** 230 * Factory for creating an instance of BidiFormatter for the default locale directionality. 231 * 232 * This does not create any new objects, and returns already existing static instances. 
233 * 234 */ 235 public static BidiFormatter getInstance() { 236 return getDefaultInstanceFromContext(isRtlLocale(Locale.getDefault())); 237 } 238 239 /** 240 * Factory for creating an instance of BidiFormatter given the context directionality. 241 * 242 * This does not create any new objects, and returns already existing static instances. 243 * 244 * @param rtlContext Whether the context directionality is RTL. 245 */ 246 public static BidiFormatter getInstance(boolean rtlContext) { 247 return getDefaultInstanceFromContext(rtlContext); 248 } 249 250 /** 251 * Factory for creating an instance of BidiFormatter given the context locale. 252 * 253 * This does not create any new objects, and returns already existing static instances. 254 * 255 * @param locale The context locale. 256 */ 257 public static BidiFormatter getInstance(Locale locale) { 258 return getDefaultInstanceFromContext(isRtlLocale(locale)); 259 } 260 261 /** 262 * @param isRtlContext Whether the context directionality is RTL or not. 263 * @param flags The option flags. 264 * @param heuristic The default text direction heuristic. 265 */ 266 private BidiFormatter(boolean isRtlContext, int flags, TextDirectionHeuristic heuristic) { 267 mIsRtlContext = isRtlContext; 268 mFlags = flags; 269 mDefaultTextDirectionHeuristic = heuristic; 270 } 271 272 /** 273 * @return Whether the context directionality is RTL 274 */ 275 public boolean isRtlContext() { 276 return mIsRtlContext; 277 } 278 279 /** 280 * @return Whether directionality "reset" should also be done before a string being 281 * bidi-wrapped, not just after it. 282 */ 283 public boolean getStereoReset() { 284 return (mFlags & FLAG_STEREO_RESET) != 0; 285 } 286 287 /** 288 * Returns a Unicode bidi mark matching the context directionality (LRM or RLM) if either the 289 * overall or the exit directionality of a given string is opposite to the context directionality. 
290 * Putting this after the string (including its directionality declaration wrapping) prevents it 291 * from "sticking" to other opposite-directionality text or a number appearing after it inline 292 * with only neutral content in between. Otherwise returns the empty string. While the exit 293 * directionality is determined by scanning the end of the string, the overall directionality is 294 * given explicitly by a heuristic to estimate the {@code str}'s directionality. 295 * 296 * @param str String after which the mark may need to appear. 297 * @param heuristic The text direction heuristic that will be used to estimate the {@code str}'s 298 * directionality. 299 * @return LRM for RTL text in LTR context; RLM for LTR text in RTL context; 300 * else, the empty string. 301 * 302 * @hide 303 */ 304 public String markAfter(String str, TextDirectionHeuristic heuristic) { 305 final boolean isRtl = heuristic.isRtl(str, 0, str.length()); 306 // getExitDir() is called only if needed (short-circuit). 307 if (!mIsRtlContext && (isRtl || getExitDir(str) == DIR_RTL)) { 308 return LRM_STRING; 309 } 310 if (mIsRtlContext && (!isRtl || getExitDir(str) == DIR_LTR)) { 311 return RLM_STRING; 312 } 313 return EMPTY_STRING; 314 } 315 316 /** 317 * Returns a Unicode bidi mark matching the context directionality (LRM or RLM) if either the 318 * overall or the entry directionality of a given string is opposite to the context 319 * directionality. Putting this before the string (including its directionality declaration 320 * wrapping) prevents it from "sticking" to other opposite-directionality text appearing before 321 * it inline with only neutral content in between. Otherwise returns the empty string. While the 322 * entry directionality is determined by scanning the beginning of the string, the overall 323 * directionality is given explicitly by a heuristic to estimate the {@code str}'s directionality. 324 * 325 * @param str String before which the mark may need to appear. 
326 * @param heuristic The text direction heuristic that will be used to estimate the {@code str}'s 327 * directionality. 328 * @return LRM for RTL text in LTR context; RLM for LTR text in RTL context; 329 * else, the empty string. 330 * 331 * @hide 332 */ 333 public String markBefore(String str, TextDirectionHeuristic heuristic) { 334 final boolean isRtl = heuristic.isRtl(str, 0, str.length()); 335 // getEntryDir() is called only if needed (short-circuit). 336 if (!mIsRtlContext && (isRtl || getEntryDir(str) == DIR_RTL)) { 337 return LRM_STRING; 338 } 339 if (mIsRtlContext && (!isRtl || getEntryDir(str) == DIR_LTR)) { 340 return RLM_STRING; 341 } 342 return EMPTY_STRING; 343 } 344 345 /** 346 * Estimates the directionality of a string using the default text direction heuristic. 347 * 348 * @param str String whose directionality is to be estimated. 349 * @return true if {@code str}'s estimated overall directionality is RTL. Otherwise returns 350 * false. 351 */ 352 public boolean isRtl(String str) { 353 return mDefaultTextDirectionHeuristic.isRtl(str, 0, str.length()); 354 } 355 356 /** 357 * Formats a string of given directionality for use in plain-text output of the context 358 * directionality, so an opposite-directionality string is neither garbled nor garbles its 359 * surroundings. This makes use of Unicode bidi formatting characters. 360 * <p> 361 * The algorithm: In case the given directionality doesn't match the context directionality, wraps 362 * the string with Unicode bidi formatting characters: RLE+{@code str}+PDF for RTL text, or 363 * LRE+{@code str}+PDF for LTR text. 364 * <p> 365 * If {@code isolate}, directionally isolates the string so that it does not garble its 366 * surroundings. 
Currently, this is done by "resetting" the directionality after the string by 367 * appending a trailing Unicode bidi mark matching the context directionality (LRM or RLM) when 368 * either the overall directionality or the exit directionality of the string is opposite to 369 * that of the context. Unless the formatter was built using 370 * {@link Builder#stereoReset(boolean)} with a {@code false} argument, also prepends a Unicode 371 * bidi mark matching the context directionality when either the overall directionality or the 372 * entry directionality of the string is opposite to that of the context. Note that as opposed 373 * to the overall directionality, the entry and exit directionalities are determined from the 374 * string itself. 375 * <p> 376 * Does *not* do HTML-escaping. 377 * 378 * @param str The input string. 379 * @param heuristic The algorithm to be used to estimate the string's overall direction. 380 * See {@link TextDirectionHeuristics} for pre-defined heuristics. 381 * @param isolate Whether to directionally isolate the string to prevent it from garbling the 382 * content around it 383 * @return Input string after applying the above processing. {@code null} if {@code str} is 384 * {@code null}. 385 */ 386 public String unicodeWrap(String str, TextDirectionHeuristic heuristic, boolean isolate) { 387 if (str == null) return null; 388 final boolean isRtl = heuristic.isRtl(str, 0, str.length()); 389 StringBuilder result = new StringBuilder(); 390 if (getStereoReset() && isolate) { 391 result.append(markBefore(str, 392 isRtl ? TextDirectionHeuristics.RTL : TextDirectionHeuristics.LTR)); 393 } 394 if (isRtl != mIsRtlContext) { 395 result.append(isRtl ? RLE : LRE); 396 result.append(str); 397 result.append(PDF); 398 } else { 399 result.append(str); 400 } 401 if (isolate) { 402 result.append(markAfter(str, 403 isRtl ? 
TextDirectionHeuristics.RTL : TextDirectionHeuristics.LTR)); 404 } 405 return result.toString(); 406 } 407 408 /** 409 * Operates like {@link #unicodeWrap(String, TextDirectionHeuristic, boolean)}, but assumes 410 * {@code isolate} is true. 411 * 412 * @param str The input string. 413 * @param heuristic The algorithm to be used to estimate the string's overall direction. 414 * See {@link TextDirectionHeuristics} for pre-defined heuristics. 415 * @return Input string after applying the above processing. 416 */ 417 public String unicodeWrap(String str, TextDirectionHeuristic heuristic) { 418 return unicodeWrap(str, heuristic, true /* isolate */); 419 } 420 421 /** 422 * Operates like {@link #unicodeWrap(String, TextDirectionHeuristic, boolean)}, but uses the 423 * formatter's default direction estimation algorithm. 424 * 425 * @param str The input string. 426 * @param isolate Whether to directionally isolate the string to prevent it from garbling the 427 * content around it 428 * @return Input string after applying the above processing. 429 */ 430 public String unicodeWrap(String str, boolean isolate) { 431 return unicodeWrap(str, mDefaultTextDirectionHeuristic, isolate); 432 } 433 434 /** 435 * Operates like {@link #unicodeWrap(String, TextDirectionHeuristic, boolean)}, but uses the 436 * formatter's default direction estimation algorithm and assumes {@code isolate} is true. 437 * 438 * @param str The input string. 439 * @return Input string after applying the above processing. 440 */ 441 public String unicodeWrap(String str) { 442 return unicodeWrap(str, mDefaultTextDirectionHeuristic, true /* isolate */); 443 } 444 445 private static BidiFormatter getDefaultInstanceFromContext(boolean isRtlContext) { 446 return isRtlContext ? DEFAULT_RTL_INSTANCE : DEFAULT_LTR_INSTANCE; 447 } 448 449 /** 450 * Helper method to return true if the Locale directionality is RTL. 
451 * 452 * @param locale The Locale whose directionality will be checked to be RTL or LTR 453 * @return true if the {@code locale} directionality is RTL. False otherwise. 454 */ 455 private static boolean isRtlLocale(Locale locale) { 456 return (TextUtils.getLayoutDirectionFromLocale(locale) == View.LAYOUT_DIRECTION_RTL); 457 } 458 459 /** 460 * Enum for directionality type. 461 */ 462 private static final int DIR_LTR = -1; 463 private static final int DIR_UNKNOWN = 0; 464 private static final int DIR_RTL = +1; 465 466 /** 467 * Returns the directionality of the last character with strong directionality in the string, or 468 * DIR_UNKNOWN if none was encountered. For efficiency, actually scans backwards from the end of 469 * the string. Treats a non-BN character between an LRE/RLE/LRO/RLO and its matching PDF as a 470 * strong character, LTR after LRE/LRO, and RTL after RLE/RLO. The results are undefined for a 471 * string containing unbalanced LRE/RLE/LRO/RLO/PDF characters. The intended use is to check 472 * whether a logically separate item that starts with a number or a character of the string's 473 * exit directionality and follows this string inline (not counting any neutral characters in 474 * between) would "stick" to it in an opposite-directionality context, thus being displayed in 475 * an incorrect position. An LRM or RLM character (the one of the context's directionality) 476 * between the two will prevent such sticking. 477 * 478 * @param str the string to check. 479 */ 480 private static int getExitDir(String str) { 481 return new DirectionalityEstimator(str, false /* isHtml */).getExitDir(); 482 } 483 484 /** 485 * Returns the directionality of the first character with strong directionality in the string, 486 * or DIR_UNKNOWN if none was encountered. Treats a non-BN character between an 487 * LRE/RLE/LRO/RLO and its matching PDF as a strong character, LTR after LRE/LRO, and RTL after 488 * RLE/RLO. 
The results are undefined for a string containing unbalanced LRE/RLE/LRO/RLO/PDF 489 * characters. The intended use is to check whether a logically separate item that ends with a 490 * character of the string's entry directionality and precedes the string inline (not counting 491 * any neutral characters in between) would "stick" to it in an opposite-directionality context, 492 * thus being displayed in an incorrect position. An LRM or RLM character (the one of the 493 * context's directionality) between the two will prevent such sticking. 494 * 495 * @param str the string to check. 496 */ 497 private static int getEntryDir(String str) { 498 return new DirectionalityEstimator(str, false /* isHtml */).getEntryDir(); 499 } 500 501 /** 502 * An object that estimates the directionality of a given string by various methods. 503 * 504 */ 505 private static class DirectionalityEstimator { 506 507 // Internal static variables and constants. 508 509 /** 510 * Size of the bidi character class cache. The results of the Character.getDirectionality() 511 * calls on the lowest DIR_TYPE_CACHE_SIZE codepoints are kept in an array for speed. 512 * The 0x700 value is designed to leave all the European and Near Eastern languages in the 513 * cache. It can be reduced to 0x180, restricting the cache to the Western European 514 * languages. 515 */ 516 private static final int DIR_TYPE_CACHE_SIZE = 0x700; 517 518 /** 519 * The bidi character class cache. 520 */ 521 private static final byte DIR_TYPE_CACHE[]; 522 523 static { 524 DIR_TYPE_CACHE = new byte[DIR_TYPE_CACHE_SIZE]; 525 for (int i = 0; i < DIR_TYPE_CACHE_SIZE; i++) { 526 DIR_TYPE_CACHE[i] = Character.getDirectionality(i); 527 } 528 } 529 530 // Internal instance variables. 531 532 /** 533 * The text to be scanned. 534 */ 535 private final String text; 536 537 /** 538 * Whether the text to be scanned is to be treated as HTML, i.e. skipping over tags and 539 * entities when looking for the next / preceding dir type. 
*/
        private final boolean isHtml;

        /**
         * The length of the text in chars.
         */
        private final int length;

        /**
         * The current position in the text.
         */
        private int charIndex;

        /**
         * The char encountered by the last dirTypeForward or dirTypeBackward call. If it
         * encountered a supplementary codepoint, this contains a char that is not a valid
         * codepoint. This is ok, because this member is only used to detect some well-known ASCII
         * syntax, e.g. "http://" and the beginning of an HTML tag or entity.
         */
        private char lastChar;

        /**
         * Constructor.
         *
         * @param text The string to scan.
         * @param isHtml Whether the text to be scanned is to be treated as HTML, i.e. skipping over
         *     tags and entities.
         */
        DirectionalityEstimator(String text, boolean isHtml) {
            this.text = text;
            this.isHtml = isHtml;
            this.length = text.length();
        }

        /**
         * Returns the directionality of the first character with strong directionality in the
         * string, or DIR_UNKNOWN if none was encountered. Treats a non-BN character between an
         * LRE/RLE/LRO/RLO and its matching PDF as a strong character, LTR after LRE/LRO, and RTL
         * after RLE/RLO. The results are undefined for a string containing unbalanced
         * LRE/RLE/LRO/RLO/PDF characters.
         */
        int getEntryDir() {
            // Named "entry" rather than "first strong": "first strong" is the usual name for
            // Unicode's estimation algorithm, which treats formatting characters quite
            // differently from this scan. Avoiding "first"/"last" keeps the two apart.
            charIndex = 0;
            int depth = 0;                  // current embedding nesting level
            int depthDir = DIR_UNKNOWN;     // direction of the current embedding, when known
            int firstFilledDepth = 0;       // level of the first embedding that has content
            while (charIndex < length && firstFilledDepth == 0) {
                switch (dirTypeForward()) {
                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING:
                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE:
                        ++depth;
                        depthDir = DIR_LTR;
                        break;
                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING:
                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE:
                        ++depth;
                        depthDir = DIR_RTL;
                        break;
                    case Character.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT:
                        --depth;
                        // Restoring the enclosing embedding's direction would take a stack,
                        // which is deliberately avoided; the direction is unknown from here on.
                        depthDir = DIR_UNKNOWN;
                        break;
                    case Character.DIRECTIONALITY_BOUNDARY_NEUTRAL:
                        break;
                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT:
                        if (depth == 0) {
                            return DIR_LTR;
                        }
                        firstFilledDepth = depth;
                        break;
                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT:
                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC:
                        if (depth == 0) {
                            return DIR_RTL;
                        }
                        firstFilledDepth = depth;
                        break;
                    default:
                        firstFilledDepth = depth;
                        break;
                }
            }

            // Either a non-empty embedding was found, or the whole string was scanned without
            // meeting one and without meeting a strong character outside of an embedding.
            if (firstFilledDepth == 0) {
                return DIR_UNKNOWN;
            }

            // A non-empty embedding was found; use its direction if it is still remembered.
            if (depthDir != DIR_UNKNOWN) {
                return depthDir;
            }

            // The direction was forgotten after a PDF, so walk backwards to the character that
            // opened the non-empty embedding and read the direction from it.
            while (charIndex > 0) {
                switch (dirTypeBackward()) {
                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING:
                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE:
                        if (firstFilledDepth == depth) {
                            return DIR_LTR;
                        }
                        --depth;
                        break;
                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING:
                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE:
                        if (firstFilledDepth == depth) {
                            return DIR_RTL;
                        }
                        --depth;
                        break;
                    case Character.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT:
                        ++depth;
                        break;
                }
            }
            // We should never get here.
            return DIR_UNKNOWN;
        }

        /**
         * Returns the directionality of the last character with strong directionality in the
         * string, or DIR_UNKNOWN if none was encountered. For efficiency, actually scans backwards
         * from the end of the string. Treats a non-BN character between an LRE/RLE/LRO/RLO and its
         * matching PDF as a strong character, LTR after LRE/LRO, and RTL after RLE/RLO. The results
         * are undefined for a string containing unbalanced LRE/RLE/LRO/RLO/PDF characters.
         */
        int getExitDir() {
            // Named "exit" rather than "last strong": "last strong" sounds like the exact
            // opposite of "first strong", the usual name for Unicode's estimation algorithm,
            // but the two treat formatting characters quite differently.
            charIndex = length;
            int depth = 0;              // embedding nesting level, counted from the end
            int lastFilledDepth = 0;    // level of the last embedding that has content
            while (charIndex > 0) {
                switch (dirTypeBackward()) {
                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT:
                        if (depth == 0) {
                            return DIR_LTR;
                        }
                        if (lastFilledDepth == 0) {
                            lastFilledDepth = depth;
                        }
                        break;
                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING:
                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE:
                        if (lastFilledDepth == depth) {
                            return DIR_LTR;
                        }
                        --depth;
                        break;
                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT:
                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC:
                        if (depth == 0) {
                            return DIR_RTL;
                        }
                        if (lastFilledDepth == 0) {
                            lastFilledDepth = depth;
                        }
                        break;
                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING:
                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE:
                        if (lastFilledDepth == depth) {
                            return DIR_RTL;
                        }
                        --depth;
                        break;
                    case Character.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT:
                        // Scanning backwards, a PDF opens an embedding.
                        ++depth;
                        break;
                    case Character.DIRECTIONALITY_BOUNDARY_NEUTRAL:
                        break;
                    default:
                        if (lastFilledDepth == 0) {
                            lastFilledDepth = depth;
                        }
                        break;
                }
            }
            return DIR_UNKNOWN;
        }

        // Internal methods

        /**
         * Gets the bidi character class, i.e. Character.getDirectionality(), of a given char, using
         * a cache for speed. Not designed for supplementary codepoints, whose results we do not
         * cache.
         */
        private static byte getCachedDirectionality(char c) {
            if (c < DIR_TYPE_CACHE_SIZE) {
                return DIR_TYPE_CACHE[c];
            }
            return Character.getDirectionality(c);
        }

        /**
         * Returns the Character.DIRECTIONALITY_... value of the next codepoint and advances
         * charIndex.
If isHtml, and the codepoint is '<' or '&', advances through the tag/entity, 749 * and returns Character.DIRECTIONALITY_WHITESPACE. For an entity, it would be best to 750 * figure out the actual character, and return its dirtype, but treating it as whitespace is 751 * good enough for our purposes. 752 * 753 * @throws java.lang.IndexOutOfBoundsException if called when charIndex >= length or < 0. 754 */ 755 byte dirTypeForward() { 756 lastChar = text.charAt(charIndex); 757 if (Character.isHighSurrogate(lastChar)) { 758 int codePoint = Character.codePointAt(text, charIndex); 759 charIndex += Character.charCount(codePoint); 760 return Character.getDirectionality(codePoint); 761 } 762 charIndex++; 763 byte dirType = getCachedDirectionality(lastChar); 764 if (isHtml) { 765 // Process tags and entities. 766 if (lastChar == '<') { 767 dirType = skipTagForward(); 768 } else if (lastChar == '&') { 769 dirType = skipEntityForward(); 770 } 771 } 772 return dirType; 773 } 774 775 /** 776 * Returns the Character.DIRECTIONALITY_... value of the preceding codepoint and advances 777 * charIndex backwards. If isHtml, and the codepoint is the end of a complete HTML tag or 778 * entity, advances over the whole tag/entity and returns 779 * Character.DIRECTIONALITY_WHITESPACE. For an entity, it would be best to figure out the 780 * actual character, and return its dirtype, but treating it as whitespace is good enough 781 * for our purposes. 782 * 783 * @throws java.lang.IndexOutOfBoundsException if called when charIndex > length or <= 0. 784 */ 785 byte dirTypeBackward() { 786 lastChar = text.charAt(charIndex - 1); 787 if (Character.isLowSurrogate(lastChar)) { 788 int codePoint = Character.codePointBefore(text, charIndex); 789 charIndex -= Character.charCount(codePoint); 790 return Character.getDirectionality(codePoint); 791 } 792 charIndex--; 793 byte dirType = getCachedDirectionality(lastChar); 794 if (isHtml) { 795 // Process tags and entities. 
796 if (lastChar == '>') { 797 dirType = skipTagBackward(); 798 } else if (lastChar == ';') { 799 dirType = skipEntityBackward(); 800 } 801 } 802 return dirType; 803 } 804 805 /** 806 * Advances charIndex forward through an HTML tag (after the opening < has already been 807 * read) and returns Character.DIRECTIONALITY_WHITESPACE. If there is no matching >, 808 * does not change charIndex and returns Character.DIRECTIONALITY_OTHER_NEUTRALS (for the 809 * < that hadn't been part of a tag after all). 810 */ 811 private byte skipTagForward() { 812 int initialCharIndex = charIndex; 813 while (charIndex < length) { 814 lastChar = text.charAt(charIndex++); 815 if (lastChar == '>') { 816 // The end of the tag. 817 return Character.DIRECTIONALITY_WHITESPACE; 818 } 819 if (lastChar == '"' || lastChar == '\'') { 820 // Skip over a quoted attribute value inside the tag. 821 char quote = lastChar; 822 while (charIndex < length && (lastChar = text.charAt(charIndex++)) != quote) {} 823 } 824 } 825 // The original '<' wasn't the start of a tag after all. 826 charIndex = initialCharIndex; 827 lastChar = '<'; 828 return Character.DIRECTIONALITY_OTHER_NEUTRALS; 829 } 830 831 /** 832 * Advances charIndex backward through an HTML tag (after the closing > has already been 833 * read) and returns Character.DIRECTIONALITY_WHITESPACE. If there is no matching <, does 834 * not change charIndex and returns Character.DIRECTIONALITY_OTHER_NEUTRALS (for the > 835 * that hadn't been part of a tag after all). Nevertheless, the running time for calling 836 * skipTagBackward() in a loop remains linear in the size of the text, even for a text like 837 * ">>>>", because skipTagBackward() also stops looking for a matching < 838 * when it encounters another >. 839 */ 840 private byte skipTagBackward() { 841 int initialCharIndex = charIndex; 842 while (charIndex > 0) { 843 lastChar = text.charAt(--charIndex); 844 if (lastChar == '<') { 845 // The start of the tag. 
846 return Character.DIRECTIONALITY_WHITESPACE; 847 } 848 if (lastChar == '>') { 849 break; 850 } 851 if (lastChar == '"' || lastChar == '\'') { 852 // Skip over a quoted attribute value inside the tag. 853 char quote = lastChar; 854 while (charIndex > 0 && (lastChar = text.charAt(--charIndex)) != quote) {} 855 } 856 } 857 // The original '>' wasn't the end of a tag after all. 858 charIndex = initialCharIndex; 859 lastChar = '>'; 860 return Character.DIRECTIONALITY_OTHER_NEUTRALS; 861 } 862 863 /** 864 * Advances charIndex forward through an HTML character entity tag (after the opening 865 * & has already been read) and returns Character.DIRECTIONALITY_WHITESPACE. It would be 866 * best to figure out the actual character and return its dirtype, but this is good enough. 867 */ 868 private byte skipEntityForward() { 869 while (charIndex < length && (lastChar = text.charAt(charIndex++)) != ';') {} 870 return Character.DIRECTIONALITY_WHITESPACE; 871 } 872 873 /** 874 * Advances charIndex backward through an HTML character entity tag (after the closing ; 875 * has already been read) and returns Character.DIRECTIONALITY_WHITESPACE. It would be best 876 * to figure out the actual character and return its dirtype, but this is good enough. 877 * If there is no matching &, does not change charIndex and returns 878 * Character.DIRECTIONALITY_OTHER_NEUTRALS (for the ';' that did not start an entity after 879 * all). Nevertheless, the running time for calling skipEntityBackward() in a loop remains 880 * linear in the size of the text, even for a text like ";;;;;;;", because skipTagBackward() 881 * also stops looking for a matching & when it encounters another ;. 
882 */ 883 private byte skipEntityBackward() { 884 int initialCharIndex = charIndex; 885 while (charIndex > 0) { 886 lastChar = text.charAt(--charIndex); 887 if (lastChar == '&') { 888 return Character.DIRECTIONALITY_WHITESPACE; 889 } 890 if (lastChar == ';') { 891 break; 892 } 893 } 894 charIndex = initialCharIndex; 895 lastChar = ';'; 896 return Character.DIRECTIONALITY_OTHER_NEUTRALS; 897 } 898 } 899}