/*
 * Copyright (C) 2013 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package android.support.v4.text;

import android.support.v4.view.ViewCompat;

import java.util.Locale;

import static android.support.v4.text.TextDirectionHeuristicsCompat.FIRSTSTRONG_LTR;

/**
 * Utility class for formatting text for display in a potentially opposite-directionality context
 * without garbling. The directionality of the context is set at formatter creation and the
 * directionality of the text can be either estimated or passed in when known. Provides the
 * following functionality:
 * <p>
 * 1. Bidi Wrapping
 * When text in one language is mixed into a document in another, opposite-directionality language,
 * e.g. when an English business name is embedded in a Hebrew web page, both the inserted string
 * and the text surrounding it may be displayed incorrectly unless the inserted string is explicitly
 * separated from the surrounding text in a "wrapper" that:
 * <p>
 * - Declares its directionality so that the string is displayed correctly. This can be done in
 *   Unicode bidi formatting codes by {@link #unicodeWrap} and similar methods.
 * <p>
 * - Isolates the string's directionality, so it does not unduly affect the surrounding content.
 *   Currently, this can only be done using invisible Unicode characters of the same direction as
 *   the context (LRM or RLM) in addition to the directionality declaration above, thus "resetting"
 *   the directionality to that of the context. The "reset" may need to be done at both ends of the
 *   string. Without "reset" after the string, the string will "stick" to a number or logically
 *   separate opposite-direction text that happens to follow it in-line (even if separated by
 *   neutral content like spaces and punctuation). Without "reset" before the string, the same can
 *   happen there, but only with more opposite-direction text, not a number. One approach is to
 *   "reset" the direction only after each string, on the theory that if the preceding opposite-
 *   direction text is itself bidi-wrapped, the "reset" after it will prevent the sticking. (Doing
 *   the "reset" only before each string definitely does not work because we do not want to require
 *   bidi-wrapping numbers, and a bidi-wrapped opposite-direction string could be followed by a
 *   number.) Still, the safest policy is to do the "reset" on both ends of each string, since RTL
 *   message translations often contain untranslated Latin-script brand names and technical terms,
 *   and one of these can be followed by a bidi-wrapped inserted value. On the other hand, when one
 *   has such a message, it is best to do the "reset" manually in the message translation itself,
 *   since the message's opposite-direction text could be followed by an inserted number, which we
 *   would not bidi-wrap anyway. Thus, a "reset" only after the string is a sound minimal policy.
 *   Note, however, that this class's default flags enable the "reset" on both ends of the string;
 *   use {@link Builder#stereoReset(boolean)} to turn off the "reset" before the string. In an
 *   alternative to "reset", recent additions to the HTML, CSS, and Unicode standards allow the
 *   isolation to be part of the directionality declaration. This form of isolation is better than
 *   "reset" because it takes less space, does not require knowing the context directionality, has a
 *   gentler effect than "reset", and protects both ends of the string. However, we do not yet allow
 *   using it because required platforms do not yet support it.
 * <p>
 * Providing these wrapping services is the basic purpose of the bidi formatter.
 * <p>
 * 2. Directionality estimation
 * How does one know whether a string about to be inserted into surrounding text has the same
 * directionality? Well, in many cases, one knows that this must be the case when writing the code
 * doing the insertion, e.g. when a localized message is inserted into a localized page. In such
 * cases there is no need to involve the bidi formatter at all. In some other cases, it need not be
 * the same as the context, but is either constant (e.g. urls are always LTR) or otherwise known.
 * In the remaining cases, e.g. when the string is user-entered or comes from a database, the
 * language of the string (and thus its directionality) is not known a priori, and must be
 * estimated at run-time. The bidi formatter can do this automatically using the default
 * first-strong estimation algorithm. It can also be configured to use a custom directionality
 * estimation object.
 */
public final class BidiFormatter {

    /**
     * The default text direction heuristic. Declared {@code final} because
     * {@link Builder#build()} compares against it by identity to decide whether a shared default
     * instance can be returned.
     */
    private static final TextDirectionHeuristicCompat DEFAULT_TEXT_DIRECTION_HEURISTIC =
            FIRSTSTRONG_LTR;

    /**
     * Unicode "Left-To-Right Embedding" (LRE) character.
     */
    private static final char LRE = '\u202A';

    /**
     * Unicode "Right-To-Left Embedding" (RLE) character.
     */
    private static final char RLE = '\u202B';

    /**
     * Unicode "Pop Directional Formatting" (PDF) character.
     */
    private static final char PDF = '\u202C';

    /**
     * Unicode "Left-To-Right Mark" (LRM) character.
     */
    private static final char LRM = '\u200E';

    /**
     * Unicode "Right-To-Left Mark" (RLM) character.
     */
    private static final char RLM = '\u200F';

    /**
     * String representation of LRM.
     */
    private static final String LRM_STRING = Character.toString(LRM);

    /**
     * String representation of RLM.
     */
    private static final String RLM_STRING = Character.toString(RLM);

    /**
     * Empty string constant.
     */
    private static final String EMPTY_STRING = "";

    /**
     * A class for building a BidiFormatter with non-default options.
     */
    public static final class Builder {
        private boolean mIsRtlContext;
        private int mFlags;
        private TextDirectionHeuristicCompat mTextDirectionHeuristicCompat;

        /**
         * Constructor. The context directionality is taken from the default locale.
         */
        public Builder() {
            initialize(isRtlLocale(Locale.getDefault()));
        }

        /**
         * Constructor.
         *
         * @param rtlContext Whether the context directionality is RTL.
         */
        public Builder(boolean rtlContext) {
            initialize(rtlContext);
        }

        /**
         * Constructor.
         *
         * @param locale The context locale.
         */
        public Builder(Locale locale) {
            initialize(isRtlLocale(locale));
        }

        /**
         * Initializes the builder with the given context directionality and default options.
         *
         * @param isRtlContext Whether the context is RTL or not.
         */
        private void initialize(boolean isRtlContext) {
            mIsRtlContext = isRtlContext;
            mTextDirectionHeuristicCompat = DEFAULT_TEXT_DIRECTION_HEURISTIC;
            mFlags = DEFAULT_FLAGS;
        }

        /**
         * Specifies whether the BidiFormatter to be built should also "reset" directionality before
         * a string being bidi-wrapped, not just after it. The default is true, because the default
         * flags include the stereo-reset flag.
         *
         * @param stereoReset Whether to also "reset" directionality before the wrapped string.
         * @return the builder itself.
         */
        public Builder stereoReset(boolean stereoReset) {
            if (stereoReset) {
                mFlags |= FLAG_STEREO_RESET;
            } else {
                mFlags &= ~FLAG_STEREO_RESET;
            }
            return this;
        }

        /**
         * Specifies the default directionality estimation algorithm to be used by the BidiFormatter.
         * By default, uses the first-strong heuristic.
         *
         * @param heuristic the {@code TextDirectionHeuristic} to use.
         * @return the builder itself.
         */
        public Builder setTextDirectionHeuristic(TextDirectionHeuristicCompat heuristic) {
            mTextDirectionHeuristicCompat = heuristic;
            return this;
        }

        /**
         * Returns the shared default formatter for the given context directionality.
         *
         * @param isRtlContext Whether the context is RTL or not.
         */
        private static BidiFormatter getDefaultInstanceFromContext(boolean isRtlContext) {
            return isRtlContext ? DEFAULT_RTL_INSTANCE : DEFAULT_LTR_INSTANCE;
        }

        /**
         * @return A BidiFormatter with the specified options. When all options are the defaults,
         *         one of the two shared default instances is returned instead of a new object.
         */
        public BidiFormatter build() {
            if (mFlags == DEFAULT_FLAGS &&
                    mTextDirectionHeuristicCompat == DEFAULT_TEXT_DIRECTION_HEURISTIC) {
                return getDefaultInstanceFromContext(mIsRtlContext);
            }
            return new BidiFormatter(mIsRtlContext, mFlags, mTextDirectionHeuristicCompat);
        }
    }

    // Option flags. FLAG_STEREO_RESET requests a "reset" mark before the wrapped string in
    // addition to the one after it; it is part of the default flag set.
    private static final int FLAG_STEREO_RESET = 2;
    private static final int DEFAULT_FLAGS = FLAG_STEREO_RESET;

    private static final BidiFormatter DEFAULT_LTR_INSTANCE = new BidiFormatter(
            false /* LTR context */,
            DEFAULT_FLAGS,
            DEFAULT_TEXT_DIRECTION_HEURISTIC);

    private static final BidiFormatter DEFAULT_RTL_INSTANCE = new BidiFormatter(
            true /* RTL context */,
            DEFAULT_FLAGS,
            DEFAULT_TEXT_DIRECTION_HEURISTIC);

    private final boolean mIsRtlContext;
    private final int mFlags;
    private final TextDirectionHeuristicCompat mDefaultTextDirectionHeuristicCompat;

    /**
     * Factory for creating an instance of BidiFormatter for the default locale directionality.
     */
    public static BidiFormatter getInstance() {
        return new Builder().build();
    }

    /**
     * Factory for creating an instance of BidiFormatter given the context directionality.
     *
     * @param rtlContext Whether the context directionality is RTL.
     */
    public static BidiFormatter getInstance(boolean rtlContext) {
        return new Builder(rtlContext).build();
    }

    /**
     * Factory for creating an instance of BidiFormatter given the context locale.
     *
     * @param locale The context locale.
     */
    public static BidiFormatter getInstance(Locale locale) {
        return new Builder(locale).build();
    }

    /**
     * @param isRtlContext Whether the context directionality is RTL or not.
     * @param flags The option flags.
     * @param heuristic The default text direction heuristic.
     */
    private BidiFormatter(boolean isRtlContext, int flags, TextDirectionHeuristicCompat heuristic) {
        mIsRtlContext = isRtlContext;
        mFlags = flags;
        mDefaultTextDirectionHeuristicCompat = heuristic;
    }

    /**
     * @return Whether the context directionality is RTL
     */
    public boolean isRtlContext() {
        return mIsRtlContext;
    }

    /**
     * @return Whether directionality "reset" should also be done before a string being
     * bidi-wrapped, not just after it.
     */
    public boolean getStereoReset() {
        return (mFlags & FLAG_STEREO_RESET) != 0;
    }

    /**
     * Returns a Unicode bidi mark matching the context directionality (LRM or RLM) if either the
     * overall or the exit directionality of a given string is opposite to the context directionality.
     * Putting this after the string (including its directionality declaration wrapping) prevents it
     * from "sticking" to other opposite-directionality text or a number appearing after it inline
     * with only neutral content in between. Otherwise returns the empty string. While the exit
     * directionality is determined by scanning the end of the string, the overall directionality is
     * given explicitly by a heuristic to estimate the {@code str}'s directionality.
     *
     * @param str String after which the mark may need to appear.
     * @param heuristic The text direction heuristic that will be used to estimate the {@code str}'s
     *                  directionality.
     * @return LRM for RTL text in LTR context; RLM for LTR text in RTL context;
     *     else, the empty string.
     */
    private String markAfter(String str, TextDirectionHeuristicCompat heuristic) {
        final boolean isRtl = heuristic.isRtl(str, 0, str.length());
        // getExitDir() is called only if needed (short-circuit).
        if (!mIsRtlContext && (isRtl || getExitDir(str) == DIR_RTL)) {
            return LRM_STRING;
        }
        if (mIsRtlContext && (!isRtl || getExitDir(str) == DIR_LTR)) {
            return RLM_STRING;
        }
        return EMPTY_STRING;
    }

    /**
     * Returns a Unicode bidi mark matching the context directionality (LRM or RLM) if either the
     * overall or the entry directionality of a given string is opposite to the context
     * directionality. Putting this before the string (including its directionality declaration
     * wrapping) prevents it from "sticking" to other opposite-directionality text appearing before
     * it inline with only neutral content in between. Otherwise returns the empty string. While the
     * entry directionality is determined by scanning the beginning of the string, the overall
     * directionality is given explicitly by a heuristic to estimate the {@code str}'s directionality.
     *
     * @param str String before which the mark may need to appear.
     * @param heuristic The text direction heuristic that will be used to estimate the {@code str}'s
     *                  directionality.
     * @return LRM for RTL text in LTR context; RLM for LTR text in RTL context;
     *     else, the empty string.
     */
    private String markBefore(String str, TextDirectionHeuristicCompat heuristic) {
        final boolean isRtl = heuristic.isRtl(str, 0, str.length());
        // getEntryDir() is called only if needed (short-circuit).
        if (!mIsRtlContext && (isRtl || getEntryDir(str) == DIR_RTL)) {
            return LRM_STRING;
        }
        if (mIsRtlContext && (!isRtl || getEntryDir(str) == DIR_LTR)) {
            return RLM_STRING;
        }
        return EMPTY_STRING;
    }

    /**
     * Estimates the directionality of a string using the default text direction heuristic.
     *
     * @param str String whose directionality is to be estimated.
     * @return true if {@code str}'s estimated overall directionality is RTL. Otherwise returns
     *          false.
     */
    public boolean isRtl(String str) {
        return mDefaultTextDirectionHeuristicCompat.isRtl(str, 0, str.length());
    }

    /**
     * Formats a string of given directionality for use in plain-text output of the context
     * directionality, so an opposite-directionality string is neither garbled nor garbles its
     * surroundings. This makes use of Unicode bidi formatting characters.
     * <p>
     * The algorithm: In case the given directionality doesn't match the context directionality, wraps
     * the string with Unicode bidi formatting characters: RLE+{@code str}+PDF for RTL text, or
     * LRE+{@code str}+PDF for LTR text.
     * <p>
     * If {@code isolate}, directionally isolates the string so that it does not garble its
     * surroundings. Currently, this is done by "resetting" the directionality after the string by
     * appending a trailing Unicode bidi mark matching the context directionality (LRM or RLM) when
     * either the overall directionality or the exit directionality of the string is opposite to that
     * of the context. If stereo reset is enabled (it is by default; see
     * {@link Builder#stereoReset(boolean)}), also prepends a Unicode bidi mark matching the context
     * directionality when either the overall directionality or the entry directionality of the
     * string is opposite to that of the context. Note that as opposed to the overall
     * directionality, the entry and exit directionalities are determined from the string itself.
     * <p>
     * Does *not* do HTML-escaping.
     *
     * @param str The input string. May be {@code null}.
     * @param heuristic The algorithm to be used to estimate the string's overall direction.
     * @param isolate Whether to directionally isolate the string to prevent it from garbling the
     *     content around it
     * @return Input string after applying the above processing, or {@code null} if {@code str}
     *     is {@code null}.
     */
    public String unicodeWrap(String str, TextDirectionHeuristicCompat heuristic, boolean isolate) {
        if (str == null) {
            // Nothing to wrap; propagate null instead of failing on str.length().
            return null;
        }
        final boolean isRtl = heuristic.isRtl(str, 0, str.length());
        // The overall direction is already decided, so hand the mark helpers a fixed-direction
        // heuristic rather than re-running the caller's estimation.
        final TextDirectionHeuristicCompat knownDirection =
                isRtl ? TextDirectionHeuristicsCompat.RTL : TextDirectionHeuristicsCompat.LTR;
        StringBuilder result = new StringBuilder();
        if (getStereoReset() && isolate) {
            result.append(markBefore(str, knownDirection));
        }
        if (isRtl != mIsRtlContext) {
            result.append(isRtl ? RLE : LRE);
            result.append(str);
            result.append(PDF);
        } else {
            result.append(str);
        }
        if (isolate) {
            result.append(markAfter(str, knownDirection));
        }
        return result.toString();
    }

    /**
     * Operates like {@link #unicodeWrap(String, android.support.v4.text.TextDirectionHeuristicCompat, boolean)}, but assumes
     * {@code isolate} is true.
     *
     * @param str The input string. May be {@code null}.
     * @param heuristic The algorithm to be used to estimate the string's overall direction.
     * @return Input string after applying the above processing, or {@code null} if {@code str}
     *     is {@code null}.
     */
    public String unicodeWrap(String str, TextDirectionHeuristicCompat heuristic) {
        return unicodeWrap(str, heuristic, true /* isolate */);
    }

    /**
     * Operates like {@link #unicodeWrap(String, android.support.v4.text.TextDirectionHeuristicCompat, boolean)}, but uses the
     * formatter's default direction estimation algorithm.
     *
     * @param str The input string. May be {@code null}.
     * @param isolate Whether to directionally isolate the string to prevent it from garbling the
     *     content around it
     * @return Input string after applying the above processing, or {@code null} if {@code str}
     *     is {@code null}.
     */
    public String unicodeWrap(String str, boolean isolate) {
        return unicodeWrap(str, mDefaultTextDirectionHeuristicCompat, isolate);
    }

    /**
     * Operates like {@link #unicodeWrap(String, android.support.v4.text.TextDirectionHeuristicCompat, boolean)}, but uses the
     * formatter's default direction estimation algorithm and assumes {@code isolate} is true.
     *
     * @param str The input string. May be {@code null}.
     * @return Input string after applying the above processing, or {@code null} if {@code str}
     *     is {@code null}.
     */
    public String unicodeWrap(String str) {
        return unicodeWrap(str, mDefaultTextDirectionHeuristicCompat, true /* isolate */);
    }

    /**
     * Helper method to return true if the Locale directionality is RTL.
     *
     * @param locale The Locale whose directionality will be checked to be RTL or LTR
     * @return true if the {@code locale} directionality is RTL. False otherwise.
     */
    private static boolean isRtlLocale(Locale locale) {
        return (TextUtilsCompat.getLayoutDirectionFromLocale(locale) == ViewCompat.LAYOUT_DIRECTION_RTL);
    }

    /**
     * Enum for directionality type.
     */
    private static final int DIR_LTR = -1;
    private static final int DIR_UNKNOWN = 0;
    private static final int DIR_RTL = +1;

    /**
     * Returns the directionality of the last character with strong directionality in the string, or
     * DIR_UNKNOWN if none was encountered. For efficiency, actually scans backwards from the end of
     * the string. Treats a non-BN character between an LRE/RLE/LRO/RLO and its matching PDF as a
     * strong character, LTR after LRE/LRO, and RTL after RLE/RLO. The results are undefined for a
     * string containing unbalanced LRE/RLE/LRO/RLO/PDF characters. The intended use is to check
     * whether a logically separate item that starts with a number or a character of the string's
     * exit directionality and follows this string inline (not counting any neutral characters in
     * between) would "stick" to it in an opposite-directionality context, thus being displayed in
     * an incorrect position. An LRM or RLM character (the one of the context's directionality)
     * between the two will prevent such sticking.
     *
     * @param str the string to check.
     */
    private static int getExitDir(String str) {
        return new DirectionalityEstimator(str, false /* isHtml */).getExitDir();
    }

    /**
     * Returns the directionality of the first character with strong directionality in the string,
     * or DIR_UNKNOWN if none was encountered. Treats a non-BN character between an
     * LRE/RLE/LRO/RLO and its matching PDF as a strong character, LTR after LRE/LRO, and RTL after
     * RLE/RLO. The results are undefined for a string containing unbalanced LRE/RLE/LRO/RLO/PDF
     * characters. The intended use is to check whether a logically separate item that ends with a
     * character of the string's entry directionality and precedes the string inline (not counting
     * any neutral characters in between) would "stick" to it in an opposite-directionality context,
     * thus being displayed in an incorrect position. An LRM or RLM character (the one of the
     * context's directionality) between the two will prevent such sticking.
     *
     * @param str the string to check.
     */
    private static int getEntryDir(String str) {
        return new DirectionalityEstimator(str, false /* isHtml */).getEntryDir();
    }

    /**
     * An object that estimates the directionality of a given string by various methods.
     */
    private static class DirectionalityEstimator {

        // Internal static variables and constants.

        /**
         * Size of the bidi character class cache. The results of the Character.getDirectionality()
         * calls on the lowest DIR_TYPE_CACHE_SIZE codepoints are kept in an array for speed.
         * The 0x700 value is designed to leave all the European and Near Eastern languages in the
         * cache. It can be reduced to 0x180, restricting the cache to the Western European
         * languages.
         */
        private static final int DIR_TYPE_CACHE_SIZE = 0x700;

        /**
         * The bidi character class cache.
         */
        private static final byte[] DIR_TYPE_CACHE;

        static {
            DIR_TYPE_CACHE = new byte[DIR_TYPE_CACHE_SIZE];
            for (int i = 0; i < DIR_TYPE_CACHE_SIZE; i++) {
                DIR_TYPE_CACHE[i] = Character.getDirectionality(i);
            }
        }

        // Internal instance variables.

        /**
         * The text to be scanned.
         */
        private final String text;

        /**
         * Whether the text to be scanned is to be treated as HTML, i.e. skipping over tags and
         * entities when looking for the next / preceding dir type.
         */
        private final boolean isHtml;

        /**
         * The length of the text in chars.
         */
        private final int length;

        /**
         * The current position in the text.
         */
        private int charIndex;

        /**
         * The char encountered by the last dirTypeForward or dirTypeBackward call. If it
         * encountered a supplementary codepoint, this contains a char that is not a valid
         * codepoint. This is ok, because this member is only used to detect some well-known ASCII
         * syntax, e.g. "http://" and the beginning of an HTML tag or entity.
         */
        private char lastChar;

        /**
         * Constructor.
         *
         * @param text The string to scan.
         * @param isHtml Whether the text to be scanned is to be treated as HTML, i.e. skipping over
         *     tags and entities.
         */
        DirectionalityEstimator(String text, boolean isHtml) {
            this.text = text;
            this.isHtml = isHtml;
            length = text.length();
        }

        /**
         * Returns the directionality of the first character with strong directionality in the
         * string, or DIR_UNKNOWN if none was encountered. Treats a non-BN character between an
         * LRE/RLE/LRO/RLO and its matching PDF as a strong character, LTR after LRE/LRO, and RTL
         * after RLE/RLO. The results are undefined for a string containing unbalanced
         * LRE/RLE/LRO/RLO/PDF characters.
         */
        int getEntryDir() {
            // The reason for this method name, as opposed to getFirstStrongDir(), is that
            // "first strong" is a commonly used description of Unicode's estimation algorithm,
            // but the two must treat formatting characters quite differently. Thus, we are staying
            // away from both "first" and "last" in these method names to avoid confusion.
            charIndex = 0;
            int embeddingLevel = 0;
            int embeddingLevelDir = DIR_UNKNOWN;
            int firstNonEmptyEmbeddingLevel = 0;
            while (charIndex < length && firstNonEmptyEmbeddingLevel == 0) {
                switch (dirTypeForward()) {
                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING:
                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE:
                        ++embeddingLevel;
                        embeddingLevelDir = DIR_LTR;
                        break;
                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING:
                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE:
                        ++embeddingLevel;
                        embeddingLevelDir = DIR_RTL;
                        break;
                    case Character.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT:
                        --embeddingLevel;
                        // To restore embeddingLevelDir to its previous value, we would need a
                        // stack, which we want to avoid. Thus, at this point we do not know the
                        // current embedding's directionality.
                        embeddingLevelDir = DIR_UNKNOWN;
                        break;
                    case Character.DIRECTIONALITY_BOUNDARY_NEUTRAL:
                        break;
                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT:
                        if (embeddingLevel == 0) {
                            return DIR_LTR;
                        }
                        firstNonEmptyEmbeddingLevel = embeddingLevel;
                        break;
                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT:
                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC:
                        if (embeddingLevel == 0) {
                            return DIR_RTL;
                        }
                        firstNonEmptyEmbeddingLevel = embeddingLevel;
                        break;
                    default:
                        firstNonEmptyEmbeddingLevel = embeddingLevel;
                        break;
                }
            }

            // We have either found a non-empty embedding or scanned the entire string finding
            // neither a non-empty embedding nor a strong character outside of an embedding.
            if (firstNonEmptyEmbeddingLevel == 0) {
                // We have not found a non-empty embedding. Thus, the string contains neither a
                // non-empty embedding nor a strong character outside of an embedding.
                return DIR_UNKNOWN;
            }

            // We have found a non-empty embedding.
            if (embeddingLevelDir != DIR_UNKNOWN) {
                // We know the directionality of the non-empty embedding.
                return embeddingLevelDir;
            }

            // We do not remember the directionality of the non-empty embedding we found. So, we go
            // backwards to find the start of the non-empty embedding and get its directionality.
            while (charIndex > 0) {
                switch (dirTypeBackward()) {
                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING:
                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE:
                        if (firstNonEmptyEmbeddingLevel == embeddingLevel) {
                            return DIR_LTR;
                        }
                        --embeddingLevel;
                        break;
                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING:
                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE:
                        if (firstNonEmptyEmbeddingLevel == embeddingLevel) {
                            return DIR_RTL;
                        }
                        --embeddingLevel;
                        break;
                    case Character.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT:
                        ++embeddingLevel;
                        break;
                }
            }
            // We should never get here.
            return DIR_UNKNOWN;
        }

        /**
         * Returns the directionality of the last character with strong directionality in the
         * string, or DIR_UNKNOWN if none was encountered. For efficiency, actually scans backwards
         * from the end of the string. Treats a non-BN character between an LRE/RLE/LRO/RLO and its
         * matching PDF as a strong character, LTR after LRE/LRO, and RTL after RLE/RLO. The results
         * are undefined for a string containing unbalanced LRE/RLE/LRO/RLO/PDF characters.
         */
        int getExitDir() {
            // The reason for this method name, as opposed to getLastStrongDir(), is that "last
            // strong" sounds like the exact opposite of "first strong", which is a commonly used
            // description of Unicode's estimation algorithm, but the two must treat formatting
            // characters quite differently. Thus, we are staying away from both "first" and "last"
            // in these method names to avoid confusion.
            charIndex = length;
            int embeddingLevel = 0;
            int lastNonEmptyEmbeddingLevel = 0;
            while (charIndex > 0) {
                switch (dirTypeBackward()) {
                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT:
                        if (embeddingLevel == 0) {
                            return DIR_LTR;
                        }
                        if (lastNonEmptyEmbeddingLevel == 0) {
                            lastNonEmptyEmbeddingLevel = embeddingLevel;
                        }
                        break;
                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING:
                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE:
                        if (lastNonEmptyEmbeddingLevel == embeddingLevel) {
                            return DIR_LTR;
                        }
                        --embeddingLevel;
                        break;
                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT:
                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC:
                        if (embeddingLevel == 0) {
                            return DIR_RTL;
                        }
                        if (lastNonEmptyEmbeddingLevel == 0) {
                            lastNonEmptyEmbeddingLevel = embeddingLevel;
                        }
                        break;
                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING:
                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE:
                        if (lastNonEmptyEmbeddingLevel == embeddingLevel) {
                            return DIR_RTL;
                        }
                        --embeddingLevel;
                        break;
                    case Character.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT:
                        ++embeddingLevel;
                        break;
                    case Character.DIRECTIONALITY_BOUNDARY_NEUTRAL:
                        break;
                    default:
                        if (lastNonEmptyEmbeddingLevel == 0) {
                            lastNonEmptyEmbeddingLevel = embeddingLevel;
                        }
                        break;
                }
            }
            return DIR_UNKNOWN;
        }

        // Internal methods

        /**
         * Gets the bidi character class, i.e. Character.getDirectionality(), of a given char, using
         * a cache for speed. Not designed for supplementary codepoints, whose results we do not
         * cache.
         */
        private static byte getCachedDirectionality(char c) {
            return c < DIR_TYPE_CACHE_SIZE ? DIR_TYPE_CACHE[c] : Character.getDirectionality(c);
        }

        /**
         * Returns the Character.DIRECTIONALITY_... value of the next codepoint and advances
         * charIndex. If isHtml, and the codepoint is '<' or '&', advances through the tag/entity,
         * and returns Character.DIRECTIONALITY_WHITESPACE. For an entity, it would be best to
         * figure out the actual character, and return its dirtype, but treating it as whitespace is
         * good enough for our purposes.
         *
         * @throws java.lang.IndexOutOfBoundsException if called when charIndex >= length or < 0.
         */
        byte dirTypeForward() {
            lastChar = text.charAt(charIndex);
            if (Character.isHighSurrogate(lastChar)) {
                int codePoint = Character.codePointAt(text, charIndex);
                charIndex += Character.charCount(codePoint);
                return Character.getDirectionality(codePoint);
            }
            charIndex++;
            byte dirType = getCachedDirectionality(lastChar);
            if (isHtml) {
                // Process tags and entities.
                if (lastChar == '<') {
                    dirType = skipTagForward();
                } else if (lastChar == '&') {
                    dirType = skipEntityForward();
                }
            }
            return dirType;
        }

        /**
         * Returns the Character.DIRECTIONALITY_... value of the preceding codepoint and advances
         * charIndex backwards. If isHtml, and the codepoint is the end of a complete HTML tag or
         * entity, advances over the whole tag/entity and returns
         * Character.DIRECTIONALITY_WHITESPACE. For an entity, it would be best to figure out the
         * actual character, and return its dirtype, but treating it as whitespace is good enough
         * for our purposes.
         *
         * @throws java.lang.IndexOutOfBoundsException if called when charIndex > length or <= 0.
         */
        byte dirTypeBackward() {
            lastChar = text.charAt(charIndex - 1);
            if (Character.isLowSurrogate(lastChar)) {
                int codePoint = Character.codePointBefore(text, charIndex);
                charIndex -= Character.charCount(codePoint);
                return Character.getDirectionality(codePoint);
            }
            charIndex--;
            byte dirType = getCachedDirectionality(lastChar);
            if (isHtml) {
                // Process tags and entities.
                if (lastChar == '>') {
                    dirType = skipTagBackward();
                } else if (lastChar == ';') {
                    dirType = skipEntityBackward();
                }
            }
            return dirType;
        }

        /**
         * Advances charIndex forward through an HTML tag (after the opening '<' has already been
         * read) and returns Character.DIRECTIONALITY_WHITESPACE. If there is no matching '>',
         * does not change charIndex and returns Character.DIRECTIONALITY_OTHER_NEUTRALS (for the
         * '<' that hadn't been part of a tag after all).
         */
        private byte skipTagForward() {
            int initialCharIndex = charIndex;
            while (charIndex < length) {
                lastChar = text.charAt(charIndex++);
                if (lastChar == '>') {
                    // The end of the tag.
                    return Character.DIRECTIONALITY_WHITESPACE;
                }
                if (lastChar == '"' || lastChar == '\'') {
                    // Skip over a quoted attribute value inside the tag.
                    char quote = lastChar;
                    while (charIndex < length && (lastChar = text.charAt(charIndex++)) != quote) {}
                }
            }
            // The original '<' wasn't the start of a tag after all.
            charIndex = initialCharIndex;
            lastChar = '<';
            return Character.DIRECTIONALITY_OTHER_NEUTRALS;
        }

        /**
         * Advances charIndex backward through an HTML tag (after the closing '>' has already been
         * read) and returns Character.DIRECTIONALITY_WHITESPACE. If there is no matching '<', does
         * not change charIndex and returns Character.DIRECTIONALITY_OTHER_NEUTRALS (for the '>'
         * that hadn't been part of a tag after all). Nevertheless, the running time for calling
         * skipTagBackward() in a loop remains linear in the size of the text, even for a text like
         * ">>>>", because skipTagBackward() also stops looking for a matching '<'
         * when it encounters another '>'.
         */
        private byte skipTagBackward() {
            int initialCharIndex = charIndex;
            while (charIndex > 0) {
                lastChar = text.charAt(--charIndex);
                if (lastChar == '<') {
                    // The start of the tag.
                    return Character.DIRECTIONALITY_WHITESPACE;
                }
                if (lastChar == '>') {
                    break;
                }
                if (lastChar == '"' || lastChar == '\'') {
                    // Skip over a quoted attribute value inside the tag.
                    char quote = lastChar;
                    while (charIndex > 0 && (lastChar = text.charAt(--charIndex)) != quote) {}
                }
            }
            // The original '>' wasn't the end of a tag after all.
            charIndex = initialCharIndex;
            lastChar = '>';
            return Character.DIRECTIONALITY_OTHER_NEUTRALS;
        }

        /**
         * Advances charIndex forward through an HTML character entity tag (after the opening
         * '&' has already been read) and returns Character.DIRECTIONALITY_WHITESPACE. It would be
         * best to figure out the actual character and return its dirtype, but this is good enough.
         * Unlike skipTagForward(), charIndex is not restored when no terminating ';' is found; the
         * scan then simply ends at the end of the text.
         */
        private byte skipEntityForward() {
            while (charIndex < length && (lastChar = text.charAt(charIndex++)) != ';') {}
            return Character.DIRECTIONALITY_WHITESPACE;
        }

        /**
         * Advances charIndex backward through an HTML character entity tag (after the closing ';'
         * has already been read) and returns Character.DIRECTIONALITY_WHITESPACE. It would be best
         * to figure out the actual character and return its dirtype, but this is good enough.
         * If there is no matching '&', does not change charIndex and returns
         * Character.DIRECTIONALITY_OTHER_NEUTRALS (for the ';' that did not end an entity after
         * all). Nevertheless, the running time for calling skipEntityBackward() in a loop remains
         * linear in the size of the text, even for a text like ";;;;;;;", because
         * skipEntityBackward() also stops looking for a matching '&' when it encounters another
         * ';'.
         */
        private byte skipEntityBackward() {
            int initialCharIndex = charIndex;
            while (charIndex > 0) {
                lastChar = text.charAt(--charIndex);
                if (lastChar == '&') {
                    return Character.DIRECTIONALITY_WHITESPACE;
                }
                if (lastChar == ';') {
                    break;
                }
            }
            // The original ';' wasn't the end of an entity after all.
            charIndex = initialCharIndex;
            lastChar = ';';
            return Character.DIRECTIONALITY_OTHER_NEUTRALS;
        }
    }
}