1/* 2 * Copyright (C) 2013 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17package android.support.v4.text; 18 19import android.support.v4.view.ViewCompat; 20 21import java.util.Locale; 22 23import static android.support.v4.text.TextDirectionHeuristicsCompat.FIRSTSTRONG_LTR; 24 25/** 26 * Utility class for formatting text for display in a potentially opposite-directionality context 27 * without garbling. The directionality of the context is set at formatter creation and the 28 * directionality of the text can be either estimated or passed in when known. Provides the 29 * following functionality: 30 * <p> 31 * 1. Bidi Wrapping 32 * When text in one language is mixed into a document in another, opposite-directionality language, 33 * e.g. when an English business name is embedded in a Hebrew web page, both the inserted string 34 * and the text surrounding it may be displayed incorrectly unless the inserted string is explicitly 35 * separated from the surrounding text in a "wrapper" that: 36 * <p> 37 * - Declares its directionality so that the string is displayed correctly. This can be done in 38 * Unicode bidi formatting codes by {@link #unicodeWrap} and similar methods. 39 * <p> 40 * - Isolates the string's directionality, so it does not unduly affect the surrounding content. 41 * Currently, this can only be done using invisible Unicode characters of the same direction as 42 * the context (LRM or RLM) in addition to the directionality declaration above, thus "resetting" 43 * the directionality to that of the context. The "reset" may need to be done at both ends of the 44 * string. Without "reset" after the string, the string will "stick" to a number or logically 45 * separate opposite-direction text that happens to follow it in-line (even if separated by 46 * neutral content like spaces and punctuation). Without "reset" before the string, the same can 47 * happen there, but only with more opposite-direction text, not a number. One approach is to 48 * "reset" the direction only after each string, on the theory that if the preceding opposite- 49 * direction text is itself bidi-wrapped, the "reset" after it will prevent the sticking. (Doing 50 * the "reset" only before each string definitely does not work because we do not want to require 51 * bidi-wrapping numbers, and a bidi-wrapped opposite-direction string could be followed by a 52 * number.) Still, the safest policy is to do the "reset" on both ends of each string, since RTL 53 * message translations often contain untranslated Latin-script brand names and technical terms, 54 * and one of these can be followed by a bidi-wrapped inserted value. On the other hand, when one 55 * has such a message, it is best to do the "reset" manually in the message translation itself, 56 * since the message's opposite-direction text could be followed by an inserted number, which we 57 * would not bidi-wrap anyway. Thus, "reset" only after the string is the current default. In an 58 * alternative to "reset", recent additions to the HTML, CSS, and Unicode standards allow the 59 * isolation to be part of the directionality declaration. This form of isolation is better than 60 * "reset" because it takes less space, does not require knowing the context directionality, has a 61 * gentler effect than "reset", and protects both ends of the string. However, we do not yet allow 62 * using it because required platforms do not yet support it. 63 * <p> 64 * Providing these wrapping services is the basic purpose of the bidi formatter. 65 * <p> 66 * 2. Directionality estimation 67 * How does one know whether a string about to be inserted into surrounding text has the same 68 * directionality? Well, in many cases, one knows that this must be the case when writing the code 69 * doing the insertion, e.g. when a localized message is inserted into a localized page. In such 70 * cases there is no need to involve the bidi formatter at all. In some other cases, it need not be 71 * the same as the context, but is either constant (e.g. urls are always LTR) or otherwise known. 72 * In the remaining cases, e.g. when the string is user-entered or comes from a database, the 73 * language of the string (and thus its directionality) is not known a priori, and must be 74 * estimated at run-time. The bidi formatter can do this automatically using the default 75 * first-strong estimation algorithm. It can also be configured to use a custom directionality 76 * estimation object. 77 */ 78public final class BidiFormatter { 79 80 /** 81 * The default text direction heuristic. 82 */ 83 private static TextDirectionHeuristicCompat DEFAULT_TEXT_DIRECTION_HEURISTIC = FIRSTSTRONG_LTR; 84 85 /** 86 * Unicode "Left-To-Right Embedding" (LRE) character. 87 */ 88 private static final char LRE = '\u202A'; 89 90 /** 91 * Unicode "Right-To-Left Embedding" (RLE) character. 92 */ 93 private static final char RLE = '\u202B'; 94 95 /** 96 * Unicode "Pop Directional Formatting" (PDF) character. 97 */ 98 private static final char PDF = '\u202C'; 99 100 /** 101 * Unicode "Left-To-Right Mark" (LRM) character. 102 */ 103 private static final char LRM = '\u200E'; 104 105 /* 106 * Unicode "Right-To-Left Mark" (RLM) character. 107 */ 108 private static final char RLM = '\u200F'; 109 110 /* 111 * String representation of LRM 112 */ 113 private static final String LRM_STRING = Character.toString(LRM); 114 115 /* 116 * String representation of RLM 117 */ 118 private static final String RLM_STRING = Character.toString(RLM); 119 120 /** 121 * Empty string constant. 122 */ 123 private static final String EMPTY_STRING = ""; 124 125 /** 126 * A class for building a BidiFormatter with non-default options. 127 */ 128 public static final class Builder { 129 private boolean mIsRtlContext; 130 private int mFlags; 131 private TextDirectionHeuristicCompat mTextDirectionHeuristicCompat; 132 133 /** 134 * Constructor. 135 * 136 */ 137 public Builder() { 138 initialize(isRtlLocale(Locale.getDefault())); 139 } 140 141 /** 142 * Constructor. 143 * 144 * @param rtlContext Whether the context directionality is RTL. 145 */ 146 public Builder(boolean rtlContext) { 147 initialize(rtlContext); 148 } 149 150 /** 151 * Constructor. 152 * 153 * @param locale The context locale. 154 */ 155 public Builder(Locale locale) { 156 initialize(isRtlLocale(locale)); 157 } 158 159 /** 160 * Initializes the builder with the given context directionality and default options. 161 * 162 * @param isRtlContext Whether the context is RTL or not. 163 */ 164 private void initialize(boolean isRtlContext) { 165 mIsRtlContext = isRtlContext; 166 mTextDirectionHeuristicCompat = DEFAULT_TEXT_DIRECTION_HEURISTIC; 167 mFlags = DEFAULT_FLAGS; 168 } 169 170 /** 171 * Specifies whether the BidiFormatter to be built should also "reset" directionality before 172 * a string being bidi-wrapped, not just after it. The default is true. 173 */ 174 public Builder stereoReset(boolean stereoReset) { 175 if (stereoReset) { 176 mFlags |= FLAG_STEREO_RESET; 177 } else { 178 mFlags &= ~FLAG_STEREO_RESET; 179 } 180 return this; 181 } 182 183 /** 184 * Specifies the default directionality estimation algorithm to be used by the BidiFormatter. 185 * By default, uses the first-strong heuristic. 186 * 187 * @param heuristic the {@code TextDirectionHeuristic} to use. 188 * @return the builder itself. 189 */ 190 public Builder setTextDirectionHeuristic(TextDirectionHeuristicCompat heuristic) { 191 mTextDirectionHeuristicCompat = heuristic; 192 return this; 193 } 194 195 private static BidiFormatter getDefaultInstanceFromContext(boolean isRtlContext) { 196 return isRtlContext ? DEFAULT_RTL_INSTANCE : DEFAULT_LTR_INSTANCE; 197 } 198 199 /** 200 * @return A BidiFormatter with the specified options. 201 */ 202 public BidiFormatter build() { 203 if (mFlags == DEFAULT_FLAGS && 204 mTextDirectionHeuristicCompat == DEFAULT_TEXT_DIRECTION_HEURISTIC) { 205 return getDefaultInstanceFromContext(mIsRtlContext); 206 } 207 return new BidiFormatter(mIsRtlContext, mFlags, mTextDirectionHeuristicCompat); 208 } 209 } 210 211 // 212 private static final int FLAG_STEREO_RESET = 2; 213 private static final int DEFAULT_FLAGS = FLAG_STEREO_RESET; 214 215 private static final BidiFormatter DEFAULT_LTR_INSTANCE = new BidiFormatter( 216 false /* LTR context */, 217 DEFAULT_FLAGS, 218 DEFAULT_TEXT_DIRECTION_HEURISTIC); 219 220 private static final BidiFormatter DEFAULT_RTL_INSTANCE = new BidiFormatter( 221 true /* RTL context */, 222 DEFAULT_FLAGS, 223 DEFAULT_TEXT_DIRECTION_HEURISTIC); 224 225 private final boolean mIsRtlContext; 226 private final int mFlags; 227 private final TextDirectionHeuristicCompat mDefaultTextDirectionHeuristicCompat; 228 229 /** 230 * Factory for creating an instance of BidiFormatter for the default locale directionality. 231 * 232 */ 233 public static BidiFormatter getInstance() { 234 return new Builder().build(); 235 } 236 237 /** 238 * Factory for creating an instance of BidiFormatter given the context directionality. 239 * 240 * @param rtlContext Whether the context directionality is RTL. 241 */ 242 public static BidiFormatter getInstance(boolean rtlContext) { 243 return new Builder(rtlContext).build(); 244 } 245 246 /** 247 * Factory for creating an instance of BidiFormatter given the context locale. 248 * 249 * @param locale The context locale. 250 */ 251 public static BidiFormatter getInstance(Locale locale) { 252 return new Builder(locale).build(); 253 } 254 255 /** 256 * @param isRtlContext Whether the context directionality is RTL or not. 257 * @param flags The option flags. 258 * @param heuristic The default text direction heuristic. 259 */ 260 private BidiFormatter(boolean isRtlContext, int flags, TextDirectionHeuristicCompat heuristic) { 261 mIsRtlContext = isRtlContext; 262 mFlags = flags; 263 mDefaultTextDirectionHeuristicCompat = heuristic; 264 } 265 266 /** 267 * @return Whether the context directionality is RTL 268 */ 269 public boolean isRtlContext() { 270 return mIsRtlContext; 271 } 272 273 /** 274 * @return Whether directionality "reset" should also be done before a string being 275 * bidi-wrapped, not just after it. 276 */ 277 public boolean getStereoReset() { 278 return (mFlags & FLAG_STEREO_RESET) != 0; 279 } 280 281 /** 282 * Returns a Unicode bidi mark matching the context directionality (LRM or RLM) if either the 283 * overall or the exit directionality of a given string is opposite to the context directionality. 284 * Putting this after the string (including its directionality declaration wrapping) prevents it 285 * from "sticking" to other opposite-directionality text or a number appearing after it inline 286 * with only neutral content in between. Otherwise returns the empty string. While the exit 287 * directionality is determined by scanning the end of the string, the overall directionality is 288 * given explicitly by a heuristic to estimate the {@code str}'s directionality. 289 * 290 * @param str String after which the mark may need to appear. 291 * @param heuristic The text direction heuristic that will be used to estimate the {@code str}'s 292 * directionality. 293 * @return LRM for RTL text in LTR context; RLM for LTR text in RTL context; 294 * else, the empty string. 295 */ 296 private String markAfter(String str, TextDirectionHeuristicCompat heuristic) { 297 final boolean isRtl = heuristic.isRtl(str, 0, str.length()); 298 // getExitDir() is called only if needed (short-circuit). 299 if (!mIsRtlContext && (isRtl || getExitDir(str) == DIR_RTL)) { 300 return LRM_STRING; 301 } 302 if (mIsRtlContext && (!isRtl || getExitDir(str) == DIR_LTR)) { 303 return RLM_STRING; 304 } 305 return EMPTY_STRING; 306 } 307 308 /** 309 * Returns a Unicode bidi mark matching the context directionality (LRM or RLM) if either the 310 * overall or the entry directionality of a given string is opposite to the context 311 * directionality. Putting this before the string (including its directionality declaration 312 * wrapping) prevents it from "sticking" to other opposite-directionality text appearing before 313 * it inline with only neutral content in between. Otherwise returns the empty string. While the 314 * entry directionality is determined by scanning the beginning of the string, the overall 315 * directionality is given explicitly by a heuristic to estimate the {@code str}'s directionality. 316 * 317 * @param str String before which the mark may need to appear. 318 * @param heuristic The text direction heuristic that will be used to estimate the {@code str}'s 319 * directionality. 320 * @return LRM for RTL text in LTR context; RLM for LTR text in RTL context; 321 * else, the empty string. 322 */ 323 private String markBefore(String str, TextDirectionHeuristicCompat heuristic) { 324 final boolean isRtl = heuristic.isRtl(str, 0, str.length()); 325 // getEntryDir() is called only if needed (short-circuit). 326 if (!mIsRtlContext && (isRtl || getEntryDir(str) == DIR_RTL)) { 327 return LRM_STRING; 328 } 329 if (mIsRtlContext && (!isRtl || getEntryDir(str) == DIR_LTR)) { 330 return RLM_STRING; 331 } 332 return EMPTY_STRING; 333 } 334 335 /** 336 * Estimates the directionality of a string using the default text direction heuristic. 337 * 338 * @param str String whose directionality is to be estimated. 339 * @return true if {@code str}'s estimated overall directionality is RTL. Otherwise returns 340 * false. 341 */ 342 public boolean isRtl(String str) { 343 return mDefaultTextDirectionHeuristicCompat.isRtl(str, 0, str.length()); 344 } 345 346 /** 347 * Formats a string of given directionality for use in plain-text output of the context 348 * directionality, so an opposite-directionality string is neither garbled nor garbles its 349 * surroundings. This makes use of Unicode bidi formatting characters. 350 * <p> 351 * The algorithm: In case the given directionality doesn't match the context directionality, wraps 352 * the string with Unicode bidi formatting characters: RLE+{@code str}+PDF for RTL text, or 353 * LRE+{@code str}+PDF for LTR text. 354 * <p> 355 * If {@code isolate}, directionally isolates the string so that it does not garble its 356 * surroundings. Currently, this is done by "resetting" the directionality after the string by 357 * appending a trailing Unicode bidi mark matching the context directionality (LRM or RLM) when 358 * either the overall directionality or the exit directionality of the string is opposite to 359 * that of the context. Unless the formatter was built using 360 * {@link Builder#stereoReset(boolean)} with a {@code false} argument, also prepends a Unicode 361 * bidi mark matching the context directionality when either the overall directionality or the 362 * entry directionality of the string is opposite to that of the context. Note that as opposed 363 * to the overall directionality, the entry and exit directionalities are determined from the 364 * string itself. 365 * <p> 366 * Does *not* do HTML-escaping. 367 * 368 * @param str The input string. 369 * @param heuristic The algorithm to be used to estimate the string's overall direction. 370 * @param isolate Whether to directionally isolate the string to prevent it from garbling the 371 * content around it 372 * @return Input string after applying the above processing. {@code null} if {@code str} is 373 * {@code null}. 374 */ 375 public String unicodeWrap(String str, TextDirectionHeuristicCompat heuristic, boolean isolate) { 376 if (str == null) return null; 377 final boolean isRtl = heuristic.isRtl(str, 0, str.length()); 378 StringBuilder result = new StringBuilder(); 379 if (getStereoReset() && isolate) { 380 result.append(markBefore(str, 381 isRtl ? TextDirectionHeuristicsCompat.RTL : TextDirectionHeuristicsCompat.LTR)); 382 } 383 if (isRtl != mIsRtlContext) { 384 result.append(isRtl ? RLE : LRE); 385 result.append(str); 386 result.append(PDF); 387 } else { 388 result.append(str); 389 } 390 if (isolate) { 391 result.append(markAfter(str, 392 isRtl ? TextDirectionHeuristicsCompat.RTL : TextDirectionHeuristicsCompat.LTR)); 393 } 394 return result.toString(); 395 } 396 397 /** 398 * Operates like {@link #unicodeWrap(String, android.support.v4.text.TextDirectionHeuristicCompat, boolean)}, but assumes 399 * {@code isolate} is true. 400 * 401 * @param str The input string. 402 * @param heuristic The algorithm to be used to estimate the string's overall direction. 403 * @return Input string after applying the above processing. 404 */ 405 public String unicodeWrap(String str, TextDirectionHeuristicCompat heuristic) { 406 return unicodeWrap(str, heuristic, true /* isolate */); 407 } 408 409 /** 410 * Operates like {@link #unicodeWrap(String, android.support.v4.text.TextDirectionHeuristicCompat, boolean)}, but uses the 411 * formatter's default direction estimation algorithm. 412 * 413 * @param str The input string. 414 * @param isolate Whether to directionally isolate the string to prevent it from garbling the 415 * content around it 416 * @return Input string after applying the above processing. 417 */ 418 public String unicodeWrap(String str, boolean isolate) { 419 return unicodeWrap(str, mDefaultTextDirectionHeuristicCompat, isolate); 420 } 421 422 /** 423 * Operates like {@link #unicodeWrap(String, android.support.v4.text.TextDirectionHeuristicCompat, boolean)}, but uses the 424 * formatter's default direction estimation algorithm and assumes {@code isolate} is true. 425 * 426 * @param str The input string. 427 * @return Input string after applying the above processing. 428 */ 429 public String unicodeWrap(String str) { 430 return unicodeWrap(str, mDefaultTextDirectionHeuristicCompat, true /* isolate */); 431 } 432 433 /** 434 * Helper method to return true if the Locale directionality is RTL. 435 * 436 * @param locale The Locale whose directionality will be checked to be RTL or LTR 437 * @return true if the {@code locale} directionality is RTL. False otherwise. 438 */ 439 private static boolean isRtlLocale(Locale locale) { 440 return (TextUtilsCompat.getLayoutDirectionFromLocale(locale) == ViewCompat.LAYOUT_DIRECTION_RTL); 441 } 442 443 /** 444 * Enum for directionality type. 445 */ 446 private static final int DIR_LTR = -1; 447 private static final int DIR_UNKNOWN = 0; 448 private static final int DIR_RTL = +1; 449 450 /** 451 * Returns the directionality of the last character with strong directionality in the string, or 452 * DIR_UNKNOWN if none was encountered. For efficiency, actually scans backwards from the end of 453 * the string. Treats a non-BN character between an LRE/RLE/LRO/RLO and its matching PDF as a 454 * strong character, LTR after LRE/LRO, and RTL after RLE/RLO. The results are undefined for a 455 * string containing unbalanced LRE/RLE/LRO/RLO/PDF characters. The intended use is to check 456 * whether a logically separate item that starts with a number or a character of the string's 457 * exit directionality and follows this string inline (not counting any neutral characters in 458 * between) would "stick" to it in an opposite-directionality context, thus being displayed in 459 * an incorrect position. An LRM or RLM character (the one of the context's directionality) 460 * between the two will prevent such sticking. 461 * 462 * @param str the string to check. 463 */ 464 private static int getExitDir(String str) { 465 return new DirectionalityEstimator(str, false /* isHtml */).getExitDir(); 466 } 467 468 /** 469 * Returns the directionality of the first character with strong directionality in the string, 470 * or DIR_UNKNOWN if none was encountered. Treats a non-BN character between an 471 * LRE/RLE/LRO/RLO and its matching PDF as a strong character, LTR after LRE/LRO, and RTL after 472 * RLE/RLO. The results are undefined for a string containing unbalanced LRE/RLE/LRO/RLO/PDF 473 * characters. The intended use is to check whether a logically separate item that ends with a 474 * character of the string's entry directionality and precedes the string inline (not counting 475 * any neutral characters in between) would "stick" to it in an opposite-directionality context, 476 * thus being displayed in an incorrect position. An LRM or RLM character (the one of the 477 * context's directionality) between the two will prevent such sticking. 478 * 479 * @param str the string to check. 480 */ 481 private static int getEntryDir(String str) { 482 return new DirectionalityEstimator(str, false /* isHtml */).getEntryDir(); 483 } 484 485 /** 486 * An object that estimates the directionality of a given string by various methods. 487 * 488 */ 489 private static class DirectionalityEstimator { 490 491 // Internal static variables and constants. 492 493 /** 494 * Size of the bidi character class cache. The results of the Character.getDirectionality() 495 * calls on the lowest DIR_TYPE_CACHE_SIZE codepoints are kept in an array for speed. 496 * The 0x700 value is designed to leave all the European and Near Eastern languages in the 497 * cache. It can be reduced to 0x180, restricting the cache to the Western European 498 * languages. 499 */ 500 private static final int DIR_TYPE_CACHE_SIZE = 0x700; 501 502 /** 503 * The bidi character class cache. 504 */ 505 private static final byte DIR_TYPE_CACHE[]; 506 507 static { 508 DIR_TYPE_CACHE = new byte[DIR_TYPE_CACHE_SIZE]; 509 for (int i = 0; i < DIR_TYPE_CACHE_SIZE; i++) { 510 DIR_TYPE_CACHE[i] = Character.getDirectionality(i); 511 } 512 } 513 514 // Internal instance variables. 515 516 /** 517 * The text to be scanned. 518 */ 519 private final String text; 520 521 /** 522 * Whether the text to be scanned is to be treated as HTML, i.e. skipping over tags and 523 * entities when looking for the next / preceding dir type. 524 */ 525 private final boolean isHtml; 526 527 /** 528 * The length of the text in chars. 529 */ 530 private final int length; 531 532 /** 533 * The current position in the text. 534 */ 535 private int charIndex; 536 537 /** 538 * The char encountered by the last dirTypeForward or dirTypeBackward call. If it 539 * encountered a supplementary codepoint, this contains a char that is not a valid 540 * codepoint. This is ok, because this member is only used to detect some well-known ASCII 541 * syntax, e.g. "http://" and the beginning of an HTML tag or entity. 542 */ 543 private char lastChar; 544 545 /** 546 * Constructor. 547 * 548 * @param text The string to scan. 549 * @param isHtml Whether the text to be scanned is to be treated as HTML, i.e. skipping over 550 * tags and entities. 551 */ 552 DirectionalityEstimator(String text, boolean isHtml) { 553 this.text = text; 554 this.isHtml = isHtml; 555 length = text.length(); 556 } 557 558 /** 559 * Returns the directionality of the first character with strong directionality in the 560 * string, or DIR_UNKNOWN if none was encountered. Treats a non-BN character between an 561 * LRE/RLE/LRO/RLO and its matching PDF as a strong character, LTR after LRE/LRO, and RTL 562 * after RLE/RLO. The results are undefined for a string containing unbalanced 563 * LRE/RLE/LRO/RLO/PDF characters. 564 */ 565 int getEntryDir() { 566 // The reason for this method name, as opposed to getFirstStrongDir(), is that 567 // "first strong" is a commonly used description of Unicode's estimation algorithm, 568 // but the two must treat formatting characters quite differently. Thus, we are staying 569 // away from both "first" and "last" in these method names to avoid confusion. 570 charIndex = 0; 571 int embeddingLevel = 0; 572 int embeddingLevelDir = DIR_UNKNOWN; 573 int firstNonEmptyEmbeddingLevel = 0; 574 while (charIndex < length && firstNonEmptyEmbeddingLevel == 0) { 575 switch (dirTypeForward()) { 576 case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING: 577 case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE: 578 ++embeddingLevel; 579 embeddingLevelDir = DIR_LTR; 580 break; 581 case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING: 582 case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE: 583 ++embeddingLevel; 584 embeddingLevelDir = DIR_RTL; 585 break; 586 case Character.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT: 587 --embeddingLevel; 588 // To restore embeddingLevelDir to its previous value, we would need a 589 // stack, which we want to avoid. Thus, at this point we do not know the 590 // current embedding's directionality. 591 embeddingLevelDir = DIR_UNKNOWN; 592 break; 593 case Character.DIRECTIONALITY_BOUNDARY_NEUTRAL: 594 break; 595 case Character.DIRECTIONALITY_LEFT_TO_RIGHT: 596 if (embeddingLevel == 0) { 597 return DIR_LTR; 598 } 599 firstNonEmptyEmbeddingLevel = embeddingLevel; 600 break; 601 case Character.DIRECTIONALITY_RIGHT_TO_LEFT: 602 case Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC: 603 if (embeddingLevel == 0) { 604 return DIR_RTL; 605 } 606 firstNonEmptyEmbeddingLevel = embeddingLevel; 607 break; 608 default: 609 firstNonEmptyEmbeddingLevel = embeddingLevel; 610 break; 611 } 612 } 613 614 // We have either found a non-empty embedding or scanned the entire string finding 615 // neither a non-empty embedding nor a strong character outside of an embedding. 616 if (firstNonEmptyEmbeddingLevel == 0) { 617 // We have not found a non-empty embedding. Thus, the string contains neither a 618 // non-empty embedding nor a strong character outside of an embedding. 619 return DIR_UNKNOWN; 620 } 621 622 // We have found a non-empty embedding. 623 if (embeddingLevelDir != DIR_UNKNOWN) { 624 // We know the directionality of the non-empty embedding. 625 return embeddingLevelDir; 626 } 627 628 // We do not remember the directionality of the non-empty embedding we found. So, we go 629 // backwards to find the start of the non-empty embedding and get its directionality. 630 while (charIndex > 0) { 631 switch (dirTypeBackward()) { 632 case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING: 633 case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE: 634 if (firstNonEmptyEmbeddingLevel == embeddingLevel) { 635 return DIR_LTR; 636 } 637 --embeddingLevel; 638 break; 639 case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING: 640 case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE: 641 if (firstNonEmptyEmbeddingLevel == embeddingLevel) { 642 return DIR_RTL; 643 } 644 --embeddingLevel; 645 break; 646 case Character.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT: 647 ++embeddingLevel; 648 break; 649 } 650 } 651 // We should never get here. 652 return DIR_UNKNOWN; 653 } 654 655 /** 656 * Returns the directionality of the last character with strong directionality in the 657 * string, or DIR_UNKNOWN if none was encountered. For efficiency, actually scans backwards 658 * from the end of the string. Treats a non-BN character between an LRE/RLE/LRO/RLO and its 659 * matching PDF as a strong character, LTR after LRE/LRO, and RTL after RLE/RLO. The results 660 * are undefined for a string containing unbalanced LRE/RLE/LRO/RLO/PDF characters. 661 */ 662 int getExitDir() { 663 // The reason for this method name, as opposed to getLastStrongDir(), is that "last 664 // strong" sounds like the exact opposite of "first strong", which is a commonly used 665 // description of Unicode's estimation algorithm (getUnicodeDir() above), but the two 666 // must treat formatting characters quite differently. Thus, we are staying away from 667 // both "first" and "last" in these method names to avoid confusion. 668 charIndex = length; 669 int embeddingLevel = 0; 670 int lastNonEmptyEmbeddingLevel = 0; 671 while (charIndex > 0) { 672 switch (dirTypeBackward()) { 673 case Character.DIRECTIONALITY_LEFT_TO_RIGHT: 674 if (embeddingLevel == 0) { 675 return DIR_LTR; 676 } 677 if (lastNonEmptyEmbeddingLevel == 0) { 678 lastNonEmptyEmbeddingLevel = embeddingLevel; 679 } 680 break; 681 case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING: 682 case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE: 683 if (lastNonEmptyEmbeddingLevel == embeddingLevel) { 684 return DIR_LTR; 685 } 686 --embeddingLevel; 687 break; 688 case Character.DIRECTIONALITY_RIGHT_TO_LEFT: 689 case Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC: 690 if (embeddingLevel == 0) { 691 return DIR_RTL; 692 } 693 if (lastNonEmptyEmbeddingLevel == 0) { 694 lastNonEmptyEmbeddingLevel = embeddingLevel; 695 } 696 break; 697 case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING: 698 case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE: 699 if (lastNonEmptyEmbeddingLevel == embeddingLevel) { 700 return DIR_RTL; 701 } 702 --embeddingLevel; 703 break; 704 case Character.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT: 705 ++embeddingLevel; 706 break; 707 case Character.DIRECTIONALITY_BOUNDARY_NEUTRAL: 708 break; 709 default: 710 if (lastNonEmptyEmbeddingLevel == 0) { 711 lastNonEmptyEmbeddingLevel = embeddingLevel; 712 } 713 break; 714 } 715 } 716 return DIR_UNKNOWN; 717 } 718 719 // Internal methods 720 721 /** 722 * Gets the bidi character class, i.e. Character.getDirectionality(), of a given char, using 723 * a cache for speed. Not designed for supplementary codepoints, whose results we do not 724 * cache. 725 */ 726 private static byte getCachedDirectionality(char c) { 727 return c < DIR_TYPE_CACHE_SIZE ? DIR_TYPE_CACHE[c] : Character.getDirectionality(c); 728 } 729 730 /** 731 * Returns the Character.DIRECTIONALITY_... value of the next codepoint and advances 732 * charIndex. If isHtml, and the codepoint is '<' or '&', advances through the tag/entity, 733 * and returns Character.DIRECTIONALITY_WHITESPACE. For an entity, it would be best to 734 * figure out the actual character, and return its dirtype, but treating it as whitespace is 735 * good enough for our purposes. 736 * 737 * @throws java.lang.IndexOutOfBoundsException if called when charIndex >= length or < 0. 738 */ 739 byte dirTypeForward() { 740 lastChar = text.charAt(charIndex); 741 if (Character.isHighSurrogate(lastChar)) { 742 int codePoint = Character.codePointAt(text, charIndex); 743 charIndex += Character.charCount(codePoint); 744 return Character.getDirectionality(codePoint); 745 } 746 charIndex++; 747 byte dirType = getCachedDirectionality(lastChar); 748 if (isHtml) { 749 // Process tags and entities. 750 if (lastChar == '<') { 751 dirType = skipTagForward(); 752 } else if (lastChar == '&') { 753 dirType = skipEntityForward(); 754 } 755 } 756 return dirType; 757 } 758 759 /** 760 * Returns the Character.DIRECTIONALITY_... value of the preceding codepoint and advances 761 * charIndex backwards. If isHtml, and the codepoint is the end of a complete HTML tag or 762 * entity, advances over the whole tag/entity and returns 763 * Character.DIRECTIONALITY_WHITESPACE. For an entity, it would be best to figure out the 764 * actual character, and return its dirtype, but treating it as whitespace is good enough 765 * for our purposes. 766 * 767 * @throws java.lang.IndexOutOfBoundsException if called when charIndex > length or <= 0. 768 */ 769 byte dirTypeBackward() { 770 lastChar = text.charAt(charIndex - 1); 771 if (Character.isLowSurrogate(lastChar)) { 772 int codePoint = Character.codePointBefore(text, charIndex); 773 charIndex -= Character.charCount(codePoint); 774 return Character.getDirectionality(codePoint); 775 } 776 charIndex--; 777 byte dirType = getCachedDirectionality(lastChar); 778 if (isHtml) { 779 // Process tags and entities. 780 if (lastChar == '>') { 781 dirType = skipTagBackward(); 782 } else if (lastChar == ';') { 783 dirType = skipEntityBackward(); 784 } 785 } 786 return dirType; 787 } 788 789 /** 790 * Advances charIndex forward through an HTML tag (after the opening < has already been 791 * read) and returns Character.DIRECTIONALITY_WHITESPACE. If there is no matching >, 792 * does not change charIndex and returns Character.DIRECTIONALITY_OTHER_NEUTRALS (for the 793 * < that hadn't been part of a tag after all). 794 */ 795 private byte skipTagForward() { 796 int initialCharIndex = charIndex; 797 while (charIndex < length) { 798 lastChar = text.charAt(charIndex++); 799 if (lastChar == '>') { 800 // The end of the tag. 801 return Character.DIRECTIONALITY_WHITESPACE; 802 } 803 if (lastChar == '"' || lastChar == '\'') { 804 // Skip over a quoted attribute value inside the tag. 805 char quote = lastChar; 806 while (charIndex < length && (lastChar = text.charAt(charIndex++)) != quote) {} 807 } 808 } 809 // The original '<' wasn't the start of a tag after all. 810 charIndex = initialCharIndex; 811 lastChar = '<'; 812 return Character.DIRECTIONALITY_OTHER_NEUTRALS; 813 } 814 815 /** 816 * Advances charIndex backward through an HTML tag (after the closing > has already been 817 * read) and returns Character.DIRECTIONALITY_WHITESPACE. If there is no matching <, does 818 * not change charIndex and returns Character.DIRECTIONALITY_OTHER_NEUTRALS (for the > 819 * that hadn't been part of a tag after all). Nevertheless, the running time for calling 820 * skipTagBackward() in a loop remains linear in the size of the text, even for a text like 821 * ">>>>", because skipTagBackward() also stops looking for a matching < 822 * when it encounters another >. 823 */ 824 private byte skipTagBackward() { 825 int initialCharIndex = charIndex; 826 while (charIndex > 0) { 827 lastChar = text.charAt(--charIndex); 828 if (lastChar == '<') { 829 // The start of the tag. 830 return Character.DIRECTIONALITY_WHITESPACE; 831 } 832 if (lastChar == '>') { 833 break; 834 } 835 if (lastChar == '"' || lastChar == '\'') { 836 // Skip over a quoted attribute value inside the tag. 837 char quote = lastChar; 838 while (charIndex > 0 && (lastChar = text.charAt(--charIndex)) != quote) {} 839 } 840 } 841 // The original '>' wasn't the end of a tag after all. 842 charIndex = initialCharIndex; 843 lastChar = '>'; 844 return Character.DIRECTIONALITY_OTHER_NEUTRALS; 845 } 846 847 /** 848 * Advances charIndex forward through an HTML character entity tag (after the opening 849 * & has already been read) and returns Character.DIRECTIONALITY_WHITESPACE. It would be 850 * best to figure out the actual character and return its dirtype, but this is good enough. 851 */ 852 private byte skipEntityForward() { 853 while (charIndex < length && (lastChar = text.charAt(charIndex++)) != ';') {} 854 return Character.DIRECTIONALITY_WHITESPACE; 855 } 856 857 /** 858 * Advances charIndex backward through an HTML character entity tag (after the closing ; 859 * has already been read) and returns Character.DIRECTIONALITY_WHITESPACE. It would be best 860 * to figure out the actual character and return its dirtype, but this is good enough. 861 * If there is no matching &, does not change charIndex and returns 862 * Character.DIRECTIONALITY_OTHER_NEUTRALS (for the ';' that did not start an entity after 863 * all). Nevertheless, the running time for calling skipEntityBackward() in a loop remains 864 * linear in the size of the text, even for a text like ";;;;;;;", because skipTagBackward() 865 * also stops looking for a matching & when it encounters another ;. 866 */ 867 private byte skipEntityBackward() { 868 int initialCharIndex = charIndex; 869 while (charIndex > 0) { 870 lastChar = text.charAt(--charIndex); 871 if (lastChar == '&') { 872 return Character.DIRECTIONALITY_WHITESPACE; 873 } 874 if (lastChar == ';') { 875 break; 876 } 877 } 878 charIndex = initialCharIndex; 879 lastChar = ';'; 880 return Character.DIRECTIONALITY_OTHER_NEUTRALS; 881 } 882 } 883}