1/* 2 * Copyright (C) 2013 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17package android.support.v4.text; 18 19import android.support.v4.view.ViewCompat; 20import android.text.SpannableStringBuilder; 21 22import java.util.Locale; 23 24import static android.support.v4.text.TextDirectionHeuristicsCompat.FIRSTSTRONG_LTR; 25 26/** 27 * Utility class for formatting text for display in a potentially opposite-directionality context 28 * without garbling. The directionality of the context is set at formatter creation and the 29 * directionality of the text can be either estimated or passed in when known. Provides the 30 * following functionality: 31 * <p> 32 * 1. Bidi Wrapping 33 * When text in one language is mixed into a document in another, opposite-directionality language, 34 * e.g. when an English business name is embedded in a Hebrew web page, both the inserted string 35 * and the text surrounding it may be displayed incorrectly unless the inserted string is explicitly 36 * separated from the surrounding text in a "wrapper" that: 37 * <p> 38 * - Declares its directionality so that the string is displayed correctly. This can be done in 39 * Unicode bidi formatting codes by {@link #unicodeWrap} and similar methods. 40 * <p> 41 * - Isolates the string's directionality, so it does not unduly affect the surrounding content. 
42 * Currently, this can only be done using invisible Unicode characters of the same direction as 43 * the context (LRM or RLM) in addition to the directionality declaration above, thus "resetting" 44 * the directionality to that of the context. The "reset" may need to be done at both ends of the 45 * string. Without "reset" after the string, the string will "stick" to a number or logically 46 * separate opposite-direction text that happens to follow it in-line (even if separated by 47 * neutral content like spaces and punctuation). Without "reset" before the string, the same can 48 * happen there, but only with more opposite-direction text, not a number. One approach is to 49 * "reset" the direction only after each string, on the theory that if the preceding opposite- 50 * direction text is itself bidi-wrapped, the "reset" after it will prevent the sticking. (Doing 51 * the "reset" only before each string definitely does not work because we do not want to require 52 * bidi-wrapping numbers, and a bidi-wrapped opposite-direction string could be followed by a 53 * number.) Still, the safest policy is to do the "reset" on both ends of each string, since RTL 54 * message translations often contain untranslated Latin-script brand names and technical terms, 55 * and one of these can be followed by a bidi-wrapped inserted value. On the other hand, when one 56 * has such a message, it is best to do the "reset" manually in the message translation itself, 57 * since the message's opposite-direction text could be followed by an inserted number, which we 58 * would not bidi-wrap anyway. Thus, "reset" only after the string is the current default. In an 59 * alternative to "reset", recent additions to the HTML, CSS, and Unicode standards allow the 60 * isolation to be part of the directionality declaration. 
This form of isolation is better than 61 * "reset" because it takes less space, does not require knowing the context directionality, has a 62 * gentler effect than "reset", and protects both ends of the string. However, we do not yet allow 63 * using it because required platforms do not yet support it. 64 * <p> 65 * Providing these wrapping services is the basic purpose of the bidi formatter. 66 * <p> 67 * 2. Directionality estimation 68 * How does one know whether a string about to be inserted into surrounding text has the same 69 * directionality? Well, in many cases, one knows that this must be the case when writing the code 70 * doing the insertion, e.g. when a localized message is inserted into a localized page. In such 71 * cases there is no need to involve the bidi formatter at all. In some other cases, it need not be 72 * the same as the context, but is either constant (e.g. urls are always LTR) or otherwise known. 73 * In the remaining cases, e.g. when the string is user-entered or comes from a database, the 74 * language of the string (and thus its directionality) is not known a priori, and must be 75 * estimated at run-time. The bidi formatter can do this automatically using the default 76 * first-strong estimation algorithm. It can also be configured to use a custom directionality 77 * estimation object. 78 */ 79public final class BidiFormatter { 80 81 /** 82 * The default text direction heuristic. 83 */ 84 private static TextDirectionHeuristicCompat DEFAULT_TEXT_DIRECTION_HEURISTIC = FIRSTSTRONG_LTR; 85 86 /** 87 * Unicode "Left-To-Right Embedding" (LRE) character. 88 */ 89 private static final char LRE = '\u202A'; 90 91 /** 92 * Unicode "Right-To-Left Embedding" (RLE) character. 93 */ 94 private static final char RLE = '\u202B'; 95 96 /** 97 * Unicode "Pop Directional Formatting" (PDF) character. 98 */ 99 private static final char PDF = '\u202C'; 100 101 /** 102 * Unicode "Left-To-Right Mark" (LRM) character. 
103 */ 104 private static final char LRM = '\u200E'; 105 106 /* 107 * Unicode "Right-To-Left Mark" (RLM) character. 108 */ 109 private static final char RLM = '\u200F'; 110 111 /* 112 * String representation of LRM 113 */ 114 private static final String LRM_STRING = Character.toString(LRM); 115 116 /* 117 * String representation of RLM 118 */ 119 private static final String RLM_STRING = Character.toString(RLM); 120 121 /** 122 * Empty string constant. 123 */ 124 private static final String EMPTY_STRING = ""; 125 126 /** 127 * A class for building a BidiFormatter with non-default options. 128 */ 129 public static final class Builder { 130 private boolean mIsRtlContext; 131 private int mFlags; 132 private TextDirectionHeuristicCompat mTextDirectionHeuristicCompat; 133 134 /** 135 * Constructor. 136 * 137 */ 138 public Builder() { 139 initialize(isRtlLocale(Locale.getDefault())); 140 } 141 142 /** 143 * Constructor. 144 * 145 * @param rtlContext Whether the context directionality is RTL. 146 */ 147 public Builder(boolean rtlContext) { 148 initialize(rtlContext); 149 } 150 151 /** 152 * Constructor. 153 * 154 * @param locale The context locale. 155 */ 156 public Builder(Locale locale) { 157 initialize(isRtlLocale(locale)); 158 } 159 160 /** 161 * Initializes the builder with the given context directionality and default options. 162 * 163 * @param isRtlContext Whether the context is RTL or not. 164 */ 165 private void initialize(boolean isRtlContext) { 166 mIsRtlContext = isRtlContext; 167 mTextDirectionHeuristicCompat = DEFAULT_TEXT_DIRECTION_HEURISTIC; 168 mFlags = DEFAULT_FLAGS; 169 } 170 171 /** 172 * Specifies whether the BidiFormatter to be built should also "reset" directionality before 173 * a string being bidi-wrapped, not just after it. The default is true. 
174 */ 175 public Builder stereoReset(boolean stereoReset) { 176 if (stereoReset) { 177 mFlags |= FLAG_STEREO_RESET; 178 } else { 179 mFlags &= ~FLAG_STEREO_RESET; 180 } 181 return this; 182 } 183 184 /** 185 * Specifies the default directionality estimation algorithm to be used by the BidiFormatter. 186 * By default, uses the first-strong heuristic. 187 * 188 * @param heuristic the {@code TextDirectionHeuristic} to use. 189 * @return the builder itself. 190 */ 191 public Builder setTextDirectionHeuristic(TextDirectionHeuristicCompat heuristic) { 192 mTextDirectionHeuristicCompat = heuristic; 193 return this; 194 } 195 196 private static BidiFormatter getDefaultInstanceFromContext(boolean isRtlContext) { 197 return isRtlContext ? DEFAULT_RTL_INSTANCE : DEFAULT_LTR_INSTANCE; 198 } 199 200 /** 201 * @return A BidiFormatter with the specified options. 202 */ 203 public BidiFormatter build() { 204 if (mFlags == DEFAULT_FLAGS && 205 mTextDirectionHeuristicCompat == DEFAULT_TEXT_DIRECTION_HEURISTIC) { 206 return getDefaultInstanceFromContext(mIsRtlContext); 207 } 208 return new BidiFormatter(mIsRtlContext, mFlags, mTextDirectionHeuristicCompat); 209 } 210 } 211 212 // 213 private static final int FLAG_STEREO_RESET = 2; 214 private static final int DEFAULT_FLAGS = FLAG_STEREO_RESET; 215 216 private static final BidiFormatter DEFAULT_LTR_INSTANCE = new BidiFormatter( 217 false /* LTR context */, 218 DEFAULT_FLAGS, 219 DEFAULT_TEXT_DIRECTION_HEURISTIC); 220 221 private static final BidiFormatter DEFAULT_RTL_INSTANCE = new BidiFormatter( 222 true /* RTL context */, 223 DEFAULT_FLAGS, 224 DEFAULT_TEXT_DIRECTION_HEURISTIC); 225 226 private final boolean mIsRtlContext; 227 private final int mFlags; 228 private final TextDirectionHeuristicCompat mDefaultTextDirectionHeuristicCompat; 229 230 /** 231 * Factory for creating an instance of BidiFormatter for the default locale directionality. 
232 * 233 */ 234 public static BidiFormatter getInstance() { 235 return new Builder().build(); 236 } 237 238 /** 239 * Factory for creating an instance of BidiFormatter given the context directionality. 240 * 241 * @param rtlContext Whether the context directionality is RTL. 242 */ 243 public static BidiFormatter getInstance(boolean rtlContext) { 244 return new Builder(rtlContext).build(); 245 } 246 247 /** 248 * Factory for creating an instance of BidiFormatter given the context locale. 249 * 250 * @param locale The context locale. 251 */ 252 public static BidiFormatter getInstance(Locale locale) { 253 return new Builder(locale).build(); 254 } 255 256 /** 257 * @param isRtlContext Whether the context directionality is RTL or not. 258 * @param flags The option flags. 259 * @param heuristic The default text direction heuristic. 260 */ 261 private BidiFormatter(boolean isRtlContext, int flags, TextDirectionHeuristicCompat heuristic) { 262 mIsRtlContext = isRtlContext; 263 mFlags = flags; 264 mDefaultTextDirectionHeuristicCompat = heuristic; 265 } 266 267 /** 268 * @return Whether the context directionality is RTL 269 */ 270 public boolean isRtlContext() { 271 return mIsRtlContext; 272 } 273 274 /** 275 * @return Whether directionality "reset" should also be done before a string being 276 * bidi-wrapped, not just after it. 277 */ 278 public boolean getStereoReset() { 279 return (mFlags & FLAG_STEREO_RESET) != 0; 280 } 281 282 /** 283 * Returns a Unicode bidi mark matching the context directionality (LRM or RLM) if either the 284 * overall or the exit directionality of a given CharSequence is opposite to the context 285 * directionality. Putting this after the CharSequence (including its directionality 286 * declaration wrapping) prevents it from "sticking" to other opposite-directionality text or a 287 * number appearing after it inline with only neutral content in between. Otherwise returns 288 * the empty string. 
While the exit directionality is determined by scanning the end of the 289 * CharSequence, the overall directionality is given explicitly by a heuristic to estimate the 290 * {@code str}'s directionality. 291 * 292 * @param str CharSequence after which the mark may need to appear. 293 * @param heuristic The text direction heuristic that will be used to estimate the {@code str}'s 294 * directionality. 295 * @return LRM for RTL text in LTR context; RLM for LTR text in RTL context; 296 * else, the empty . 297 */ 298 private String markAfter(CharSequence str, TextDirectionHeuristicCompat heuristic) { 299 final boolean isRtl = heuristic.isRtl(str, 0, str.length()); 300 // getExitDir() is called only if needed (short-circuit). 301 if (!mIsRtlContext && (isRtl || getExitDir(str) == DIR_RTL)) { 302 return LRM_STRING; 303 } 304 if (mIsRtlContext && (!isRtl || getExitDir(str) == DIR_LTR)) { 305 return RLM_STRING; 306 } 307 return EMPTY_STRING; 308 } 309 310 /** 311 * Returns a Unicode bidi mark matching the context directionality (LRM or RLM) if either the 312 * overall or the entry directionality of a given CharSequence is opposite to the context 313 * directionality. Putting this before the CharSequence (including its directionality 314 * declaration wrapping) prevents it from "sticking" to other opposite-directionality text 315 * appearing before it inline with only neutral content in between. Otherwise returns the 316 * empty string. While the entry directionality is determined by scanning the beginning of the 317 * CharSequence, the overall directionality is given explicitly by a heuristic to estimate the 318 * {@code str}'s directionality. 319 * 320 * @param str CharSequence before which the mark may need to appear. 321 * @param heuristic The text direction heuristic that will be used to estimate the {@code str}'s 322 * directionality. 323 * @return LRM for RTL text in LTR context; RLM for LTR text in RTL context; 324 * else, the empty string. 
325 */ 326 private String markBefore(CharSequence str, TextDirectionHeuristicCompat heuristic) { 327 final boolean isRtl = heuristic.isRtl(str, 0, str.length()); 328 // getEntryDir() is called only if needed (short-circuit). 329 if (!mIsRtlContext && (isRtl || getEntryDir(str) == DIR_RTL)) { 330 return LRM_STRING; 331 } 332 if (mIsRtlContext && (!isRtl || getEntryDir(str) == DIR_LTR)) { 333 return RLM_STRING; 334 } 335 return EMPTY_STRING; 336 } 337 338 /** 339 * Estimates the directionality of a string using the default text direction heuristic. 340 * 341 * @param str String whose directionality is to be estimated. 342 * @return true if {@code str}'s estimated overall directionality is RTL. Otherwise returns 343 * false. 344 */ 345 public boolean isRtl(String str) { 346 return isRtl((CharSequence) str); 347 } 348 349 /** 350 * Operates like {@link #isRtl(String)}, but takes a CharSequence instead of a string. 351 * 352 * @param str CharSequence whose directionality is to be estimated. 353 * @return true if {@code str}'s estimated overall directionality is RTL. Otherwise returns 354 * false. 355 */ 356 public boolean isRtl(CharSequence str) { 357 return mDefaultTextDirectionHeuristicCompat.isRtl(str, 0, str.length()); 358 } 359 360 /** 361 * Formats a string of given directionality for use in plain-text output of the context 362 * directionality, so an opposite-directionality string is neither garbled nor garbles its 363 * surroundings. This makes use of Unicode bidi formatting characters. 364 * <p> 365 * The algorithm: In case the given directionality doesn't match the context directionality, wraps 366 * the string with Unicode bidi formatting characters: RLE+{@code str}+PDF for RTL text, or 367 * LRE+{@code str}+PDF for LTR text. 368 * <p> 369 * If {@code isolate}, directionally isolates the string so that it does not garble its 370 * surroundings. 
Currently, this is done by "resetting" the directionality after the string by 371 * appending a trailing Unicode bidi mark matching the context directionality (LRM or RLM) when 372 * either the overall directionality or the exit directionality of the string is opposite to 373 * that of the context. Unless the formatter was built using 374 * {@link Builder#stereoReset(boolean)} with a {@code false} argument, also prepends a Unicode 375 * bidi mark matching the context directionality when either the overall directionality or the 376 * entry directionality of the string is opposite to that of the context. Note that as opposed 377 * to the overall directionality, the entry and exit directionalities are determined from the 378 * string itself. 379 * <p> 380 * Does *not* do HTML-escaping. 381 * 382 * @param str The input string. 383 * @param heuristic The algorithm to be used to estimate the string's overall direction. 384 * @param isolate Whether to directionally isolate the string to prevent it from garbling the 385 * content around it 386 * @return Input string after applying the above processing. {@code null} if {@code str} is 387 * {@code null}. 388 */ 389 public String unicodeWrap(String str, TextDirectionHeuristicCompat heuristic, boolean isolate) { 390 if (str == null) return null; 391 return unicodeWrap((CharSequence) str, heuristic, isolate).toString(); 392 } 393 394 /** 395 * Operates like {@link #unicodeWrap(String, 396 * android.support.v4.text.TextDirectionHeuristicCompat, boolean)}, but takes a CharSequence 397 * instead of a string 398 * 399 * @param str The input CharSequence. 400 * @param heuristic The algorithm to be used to estimate the CharSequence's overall direction. 401 * See {@link android.support.v4.text.TextDirectionHeuristicsCompat} for pre-defined 402 * heuristics. 
403 * @param isolate Whether to directionally isolate the CharSequence to prevent it from garbling 404 * the content around it 405 * @return Input CharSequence after applying the above processing. {@code null} if {@code str} 406 * is {@code null}. 407 */ 408 public CharSequence unicodeWrap(CharSequence str, TextDirectionHeuristicCompat heuristic, 409 boolean isolate) { 410 if (str == null) return null; 411 final boolean isRtl = heuristic.isRtl(str, 0, str.length()); 412 SpannableStringBuilder result = new SpannableStringBuilder(); 413 if (getStereoReset() && isolate) { 414 result.append(markBefore(str, 415 isRtl ? TextDirectionHeuristicsCompat.RTL : TextDirectionHeuristicsCompat.LTR)); 416 } 417 if (isRtl != mIsRtlContext) { 418 result.append(isRtl ? RLE : LRE); 419 result.append(str); 420 result.append(PDF); 421 } else { 422 result.append(str); 423 } 424 if (isolate) { 425 result.append(markAfter(str, 426 isRtl ? TextDirectionHeuristicsCompat.RTL : TextDirectionHeuristicsCompat.LTR)); 427 } 428 return result; 429 } 430 431 /** 432 * Operates like {@link #unicodeWrap(String, android.support.v4.text.TextDirectionHeuristicCompat, boolean)}, but assumes 433 * {@code isolate} is true. 434 * 435 * @param str The input string. 436 * @param heuristic The algorithm to be used to estimate the string's overall direction. 437 * @return Input string after applying the above processing. 438 */ 439 public String unicodeWrap(String str, TextDirectionHeuristicCompat heuristic) { 440 return unicodeWrap(str, heuristic, true /* isolate */); 441 } 442 443 /** 444 * Operates like {@link #unicodeWrap(CharSequence, 445 * android.support.v4.text.TextDirectionHeuristicCompat, boolean)}, but assumes {@code isolate} 446 * is true. 447 * 448 * @param str The input CharSequence. 449 * @param heuristic The algorithm to be used to estimate the CharSequence's overall direction. 450 * See {@link android.support.v4.text.TextDirectionHeuristicsCompat} for pre-defined 451 * heuristics. 
452 * @return Input CharSequence after applying the above processing. 453 */ 454 public CharSequence unicodeWrap(CharSequence str, TextDirectionHeuristicCompat heuristic) { 455 return unicodeWrap(str, heuristic, true /* isolate */); 456 } 457 458 /** 459 * Operates like {@link #unicodeWrap(String, android.support.v4.text.TextDirectionHeuristicCompat, boolean)}, but uses the 460 * formatter's default direction estimation algorithm. 461 * 462 * @param str The input string. 463 * @param isolate Whether to directionally isolate the string to prevent it from garbling the 464 * content around it 465 * @return Input string after applying the above processing. 466 */ 467 public String unicodeWrap(String str, boolean isolate) { 468 return unicodeWrap(str, mDefaultTextDirectionHeuristicCompat, isolate); 469 } 470 471 /** 472 * Operates like {@link #unicodeWrap(CharSequence, 473 * android.support.v4.text.TextDirectionHeuristicCompat, boolean)}, but uses the formatter's 474 * default direction estimation algorithm. 475 * 476 * @param str The input CharSequence. 477 * @param isolate Whether to directionally isolate the CharSequence to prevent it from garbling 478 * the content around it 479 * @return Input CharSequence after applying the above processing. 480 */ 481 public CharSequence unicodeWrap(CharSequence str, boolean isolate) { 482 return unicodeWrap(str, mDefaultTextDirectionHeuristicCompat, isolate); 483 } 484 485 /** 486 * Operates like {@link #unicodeWrap(String, android.support.v4.text.TextDirectionHeuristicCompat, boolean)}, but uses the 487 * formatter's default direction estimation algorithm and assumes {@code isolate} is true. 488 * 489 * @param str The input string. 490 * @return Input string after applying the above processing. 
491 */ 492 public String unicodeWrap(String str) { 493 return unicodeWrap(str, mDefaultTextDirectionHeuristicCompat, true /* isolate */); 494 } 495 496 /** 497 * Operates like {@link #unicodeWrap(CharSequence, 498 * android.support.v4.text.TextDirectionHeuristicCompat, boolean)}, but uses the formatter's 499 * default direction estimation algorithm and assumes {@code isolate} is true. 500 * 501 * @param str The input CharSequence. 502 * @return Input CharSequence after applying the above processing. 503 */ 504 public CharSequence unicodeWrap(CharSequence str) { 505 return unicodeWrap(str, mDefaultTextDirectionHeuristicCompat, true /* isolate */); 506 } 507 508 /** 509 * Helper method to return true if the Locale directionality is RTL. 510 * 511 * @param locale The Locale whose directionality will be checked to be RTL or LTR 512 * @return true if the {@code locale} directionality is RTL. False otherwise. 513 */ 514 private static boolean isRtlLocale(Locale locale) { 515 return (TextUtilsCompat.getLayoutDirectionFromLocale(locale) == ViewCompat.LAYOUT_DIRECTION_RTL); 516 } 517 518 /** 519 * Enum for directionality type. 520 */ 521 private static final int DIR_LTR = -1; 522 private static final int DIR_UNKNOWN = 0; 523 private static final int DIR_RTL = +1; 524 525 /** 526 * Returns the directionality of the last character with strong directionality in the string, or 527 * DIR_UNKNOWN if none was encountered. For efficiency, actually scans backwards from the end of 528 * the string. Treats a non-BN character between an LRE/RLE/LRO/RLO and its matching PDF as a 529 * strong character, LTR after LRE/LRO, and RTL after RLE/RLO. The results are undefined for a 530 * string containing unbalanced LRE/RLE/LRO/RLO/PDF characters. 
The intended use is to check 531 * whether a logically separate item that starts with a number or a character of the string's 532 * exit directionality and follows this string inline (not counting any neutral characters in 533 * between) would "stick" to it in an opposite-directionality context, thus being displayed in 534 * an incorrect position. An LRM or RLM character (the one of the context's directionality) 535 * between the two will prevent such sticking. 536 * 537 * @param str the string to check. 538 */ 539 private static int getExitDir(CharSequence str) { 540 return new DirectionalityEstimator(str, false /* isHtml */).getExitDir(); 541 } 542 543 /** 544 * Returns the directionality of the first character with strong directionality in the string, 545 * or DIR_UNKNOWN if none was encountered. Treats a non-BN character between an 546 * LRE/RLE/LRO/RLO and its matching PDF as a strong character, LTR after LRE/LRO, and RTL after 547 * RLE/RLO. The results are undefined for a string containing unbalanced LRE/RLE/LRO/RLO/PDF 548 * characters. The intended use is to check whether a logically separate item that ends with a 549 * character of the string's entry directionality and precedes the string inline (not counting 550 * any neutral characters in between) would "stick" to it in an opposite-directionality context, 551 * thus being displayed in an incorrect position. An LRM or RLM character (the one of the 552 * context's directionality) between the two will prevent such sticking. 553 * 554 * @param str the string to check. 555 */ 556 private static int getEntryDir(CharSequence str) { 557 return new DirectionalityEstimator(str, false /* isHtml */).getEntryDir(); 558 } 559 560 /** 561 * An object that estimates the directionality of a given string by various methods. 562 * 563 */ 564 private static class DirectionalityEstimator { 565 566 // Internal static variables and constants. 567 568 /** 569 * Size of the bidi character class cache. 
The results of the Character.getDirectionality() 570 * calls on the lowest DIR_TYPE_CACHE_SIZE codepoints are kept in an array for speed. 571 * The 0x700 value is designed to leave all the European and Near Eastern languages in the 572 * cache. It can be reduced to 0x180, restricting the cache to the Western European 573 * languages. 574 */ 575 private static final int DIR_TYPE_CACHE_SIZE = 0x700; 576 577 /** 578 * The bidi character class cache. 579 */ 580 private static final byte DIR_TYPE_CACHE[]; 581 582 static { 583 DIR_TYPE_CACHE = new byte[DIR_TYPE_CACHE_SIZE]; 584 for (int i = 0; i < DIR_TYPE_CACHE_SIZE; i++) { 585 DIR_TYPE_CACHE[i] = Character.getDirectionality(i); 586 } 587 } 588 589 // Internal instance variables. 590 591 /** 592 * The text to be scanned. 593 */ 594 private final CharSequence text; 595 596 /** 597 * Whether the text to be scanned is to be treated as HTML, i.e. skipping over tags and 598 * entities when looking for the next / preceding dir type. 599 */ 600 private final boolean isHtml; 601 602 /** 603 * The length of the text in chars. 604 */ 605 private final int length; 606 607 /** 608 * The current position in the text. 609 */ 610 private int charIndex; 611 612 /** 613 * The char encountered by the last dirTypeForward or dirTypeBackward call. If it 614 * encountered a supplementary codepoint, this contains a char that is not a valid 615 * codepoint. This is ok, because this member is only used to detect some well-known ASCII 616 * syntax, e.g. "http://" and the beginning of an HTML tag or entity. 617 */ 618 private char lastChar; 619 620 /** 621 * Constructor. 622 * 623 * @param text The string to scan. 624 * @param isHtml Whether the text to be scanned is to be treated as HTML, i.e. skipping over 625 * tags and entities. 
626 */ 627 DirectionalityEstimator(CharSequence text, boolean isHtml) { 628 this.text = text; 629 this.isHtml = isHtml; 630 length = text.length(); 631 } 632 633 /** 634 * Returns the directionality of the first character with strong directionality in the 635 * string, or DIR_UNKNOWN if none was encountered. Treats a non-BN character between an 636 * LRE/RLE/LRO/RLO and its matching PDF as a strong character, LTR after LRE/LRO, and RTL 637 * after RLE/RLO. The results are undefined for a string containing unbalanced 638 * LRE/RLE/LRO/RLO/PDF characters. 639 */ 640 int getEntryDir() { 641 // The reason for this method name, as opposed to getFirstStrongDir(), is that 642 // "first strong" is a commonly used description of Unicode's estimation algorithm, 643 // but the two must treat formatting characters quite differently. Thus, we are staying 644 // away from both "first" and "last" in these method names to avoid confusion. 645 charIndex = 0; 646 int embeddingLevel = 0; 647 int embeddingLevelDir = DIR_UNKNOWN; 648 int firstNonEmptyEmbeddingLevel = 0; 649 while (charIndex < length && firstNonEmptyEmbeddingLevel == 0) { 650 switch (dirTypeForward()) { 651 case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING: 652 case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE: 653 ++embeddingLevel; 654 embeddingLevelDir = DIR_LTR; 655 break; 656 case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING: 657 case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE: 658 ++embeddingLevel; 659 embeddingLevelDir = DIR_RTL; 660 break; 661 case Character.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT: 662 --embeddingLevel; 663 // To restore embeddingLevelDir to its previous value, we would need a 664 // stack, which we want to avoid. Thus, at this point we do not know the 665 // current embedding's directionality. 
666 embeddingLevelDir = DIR_UNKNOWN; 667 break; 668 case Character.DIRECTIONALITY_BOUNDARY_NEUTRAL: 669 break; 670 case Character.DIRECTIONALITY_LEFT_TO_RIGHT: 671 if (embeddingLevel == 0) { 672 return DIR_LTR; 673 } 674 firstNonEmptyEmbeddingLevel = embeddingLevel; 675 break; 676 case Character.DIRECTIONALITY_RIGHT_TO_LEFT: 677 case Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC: 678 if (embeddingLevel == 0) { 679 return DIR_RTL; 680 } 681 firstNonEmptyEmbeddingLevel = embeddingLevel; 682 break; 683 default: 684 firstNonEmptyEmbeddingLevel = embeddingLevel; 685 break; 686 } 687 } 688 689 // We have either found a non-empty embedding or scanned the entire string finding 690 // neither a non-empty embedding nor a strong character outside of an embedding. 691 if (firstNonEmptyEmbeddingLevel == 0) { 692 // We have not found a non-empty embedding. Thus, the string contains neither a 693 // non-empty embedding nor a strong character outside of an embedding. 694 return DIR_UNKNOWN; 695 } 696 697 // We have found a non-empty embedding. 698 if (embeddingLevelDir != DIR_UNKNOWN) { 699 // We know the directionality of the non-empty embedding. 700 return embeddingLevelDir; 701 } 702 703 // We do not remember the directionality of the non-empty embedding we found. So, we go 704 // backwards to find the start of the non-empty embedding and get its directionality. 
while (charIndex > 0) {
                switch (dirTypeBackward()) {
                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING:
                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE:
                        if (firstNonEmptyEmbeddingLevel == embeddingLevel) {
                            return DIR_LTR;
                        }
                        --embeddingLevel;
                        break;
                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING:
                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE:
                        if (firstNonEmptyEmbeddingLevel == embeddingLevel) {
                            return DIR_RTL;
                        }
                        --embeddingLevel;
                        break;
                    case Character.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT:
                        ++embeddingLevel;
                        break;
                }
            }
            // We should never get here.
            return DIR_UNKNOWN;
        }

        /**
         * Returns the directionality of the last character with strong directionality in the
         * string, or DIR_UNKNOWN if none was encountered. For efficiency, actually scans backwards
         * from the end of the string. Treats a non-BN character between an LRE/RLE/LRO/RLO and its
         * matching PDF as a strong character, LTR after LRE/LRO, and RTL after RLE/RLO. The results
         * are undefined for a string containing unbalanced LRE/RLE/LRO/RLO/PDF characters.
         *
         * @return DIR_LTR, DIR_RTL, or DIR_UNKNOWN when the scan reaches the start of the text
         *         without finding a deciding character.
         */
        int getExitDir() {
            // The reason for this method name, as opposed to getLastStrongDir(), is that "last
            // strong" sounds like the exact opposite of "first strong", which is a commonly used
            // description of Unicode's estimation algorithm (getUnicodeDir() above), but the two
            // must treat formatting characters quite differently. Thus, we are staying away from
            // both "first" and "last" in these method names to avoid confusion.

            // Start past the last char; dirTypeBackward() reads the char before charIndex.
            charIndex = length;
            // Number of PDFs seen so far whose opening LRE/RLE/LRO/RLO has not been reached yet.
            // Because the scan runs backwards, a PDF enters an embedding and an opener leaves it.
            int embeddingLevel = 0;
            // Embedding level at which the first non-BN character of the scan was seen. It is only
            // ever assigned while it is still 0, so 0 means "nothing recorded yet".
            int lastNonEmptyEmbeddingLevel = 0;
            while (charIndex > 0) {
                switch (dirTypeBackward()) {
                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT:
                        // A strong LTR character outside any embedding decides the result.
                        if (embeddingLevel == 0) {
                            return DIR_LTR;
                        }
                        // Inside an embedding: just remember the level of the first content seen.
                        if (lastNonEmptyEmbeddingLevel == 0) {
                            lastNonEmptyEmbeddingLevel = embeddingLevel;
                        }
                        break;
                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING:
                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE:
                        // This opener is at the level where content was first recorded, so its
                        // direction is the answer.
                        if (lastNonEmptyEmbeddingLevel == embeddingLevel) {
                            return DIR_LTR;
                        }
                        --embeddingLevel;
                        break;
                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT:
                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC:
                        // A strong RTL character outside any embedding decides the result.
                        if (embeddingLevel == 0) {
                            return DIR_RTL;
                        }
                        if (lastNonEmptyEmbeddingLevel == 0) {
                            lastNonEmptyEmbeddingLevel = embeddingLevel;
                        }
                        break;
                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING:
                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE:
                        if (lastNonEmptyEmbeddingLevel == embeddingLevel) {
                            return DIR_RTL;
                        }
                        --embeddingLevel;
                        break;
                    case Character.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT:
                        ++embeddingLevel;
                        break;
                    case Character.DIRECTIONALITY_BOUNDARY_NEUTRAL:
                        // BN characters are ignored entirely; they do not count as content.
                        break;
                    default:
                        // Any other character counts as content for the purpose of recording the
                        // level; at level 0 the assignment stores 0 and therefore changes nothing.
                        if (lastNonEmptyEmbeddingLevel == 0) {
                            lastNonEmptyEmbeddingLevel = embeddingLevel;
                        }
                        break;
                }
            }
            return DIR_UNKNOWN;
        }

        // Internal methods

        /**
         * Gets the bidi character class, i.e. Character.getDirectionality(), of a given char, using
         * a cache for speed. Not designed for supplementary codepoints, whose results we do not
         * cache.
         *
         * @param c the char to classify.
         * @return the DIR_TYPE_CACHE entry for c when c is below DIR_TYPE_CACHE_SIZE, otherwise
         *         the result of Character.getDirectionality(c).
         */
        private static byte getCachedDirectionality(char c) {
            return c < DIR_TYPE_CACHE_SIZE ? DIR_TYPE_CACHE[c] : Character.getDirectionality(c);
        }

        /**
         * Returns the Character.DIRECTIONALITY_... value of the next codepoint and advances
         * charIndex. If isHtml, and the codepoint is '&lt;' or '&amp;', advances through the
         * tag/entity, and returns Character.DIRECTIONALITY_WHITESPACE. For an entity, it would be
         * best to figure out the actual character, and return its dirtype, but treating it as
         * whitespace is good enough for our purposes.
         * <p>
         * Also stores the char read at the original charIndex in lastChar; the tag/entity helpers
         * may overwrite lastChar again while skipping.
         *
         * @throws java.lang.IndexOutOfBoundsException if called when charIndex >= length or < 0.
         */
        byte dirTypeForward() {
            lastChar = text.charAt(charIndex);
            if (Character.isHighSurrogate(lastChar)) {
                // Read the whole code point and step over however many chars it occupies. This
                // path returns before the HTML handling and bypasses the directionality cache.
                int codePoint = Character.codePointAt(text, charIndex);
                charIndex += Character.charCount(codePoint);
                return Character.getDirectionality(codePoint);
            }
            charIndex++;
            byte dirType = getCachedDirectionality(lastChar);
            if (isHtml) {
                // Process tags and entities.
                if (lastChar == '<') {
                    dirType = skipTagForward();
                } else if (lastChar == '&') {
                    dirType = skipEntityForward();
                }
            }
            return dirType;
        }

        /**
         * Returns the Character.DIRECTIONALITY_... value of the preceding codepoint and advances
         * charIndex backwards. If isHtml, and the codepoint is the end of a complete HTML tag or
         * entity, advances over the whole tag/entity and returns
         * Character.DIRECTIONALITY_WHITESPACE. For an entity, it would be best to figure out the
         * actual character, and return its dirtype, but treating it as whitespace is good enough
         * for our purposes.
         * <p>
         * Also stores the char read just before the original charIndex in lastChar; the
         * tag/entity helpers may overwrite lastChar again while skipping.
         *
         * @throws java.lang.IndexOutOfBoundsException if called when charIndex > length or <= 0.
         */
        byte dirTypeBackward() {
            lastChar = text.charAt(charIndex - 1);
            if (Character.isLowSurrogate(lastChar)) {
                // Read the whole code point ending here and step back over however many chars it
                // occupies. This path returns before the HTML handling and bypasses the cache.
                int codePoint = Character.codePointBefore(text, charIndex);
                charIndex -= Character.charCount(codePoint);
                return Character.getDirectionality(codePoint);
            }
            charIndex--;
            byte dirType = getCachedDirectionality(lastChar);
            if (isHtml) {
                // Process tags and entities.
                if (lastChar == '>') {
                    dirType = skipTagBackward();
                } else if (lastChar == ';') {
                    dirType = skipEntityBackward();
                }
            }
            return dirType;
        }

        /**
         * Advances charIndex forward through an HTML tag (after the opening '&lt;' has already
         * been read) and returns Character.DIRECTIONALITY_WHITESPACE. If there is no matching
         * '&gt;', restores charIndex to its value on entry and returns
         * Character.DIRECTIONALITY_OTHER_NEUTRALS (for the '&lt;' that hadn't been part of a tag
         * after all). A '&gt;' inside a single- or double-quoted attribute value does not end the
         * tag.
         */
        private byte skipTagForward() {
            int initialCharIndex = charIndex;
            while (charIndex < length) {
                lastChar = text.charAt(charIndex++);
                if (lastChar == '>') {
                    // The end of the tag.
                    return Character.DIRECTIONALITY_WHITESPACE;
                }
                if (lastChar == '"' || lastChar == '\'') {
                    // Skip over a quoted attribute value inside the tag. If the closing quote is
                    // missing, this runs to the end of the text and the outer loop then exits.
                    char quote = lastChar;
                    while (charIndex < length && (lastChar = text.charAt(charIndex++)) != quote) {}
                }
            }
            // The original '<' wasn't the start of a tag after all.
            charIndex = initialCharIndex;
            lastChar = '<';
            return Character.DIRECTIONALITY_OTHER_NEUTRALS;
        }

        /**
         * Advances charIndex backward through an HTML tag (after the closing '&gt;' has already
         * been read) and returns Character.DIRECTIONALITY_WHITESPACE. If there is no matching
         * '&lt;', restores charIndex to its value on entry and returns
         * Character.DIRECTIONALITY_OTHER_NEUTRALS (for the '&gt;' that hadn't been part of a tag
         * after all). Nevertheless, the running time for calling skipTagBackward() in a loop
         * remains linear in the size of the text, even for a text like "&gt;&gt;&gt;&gt;", because
         * skipTagBackward() also stops looking for a matching '&lt;' when it encounters another
         * '&gt;'.
         */
        private byte skipTagBackward() {
            int initialCharIndex = charIndex;
            while (charIndex > 0) {
                lastChar = text.charAt(--charIndex);
                if (lastChar == '<') {
                    // The start of the tag.
                    return Character.DIRECTIONALITY_WHITESPACE;
                }
                if (lastChar == '>') {
                    // Another '>' before any '<': give up so repeated calls stay linear overall.
                    break;
                }
                if (lastChar == '"' || lastChar == '\'') {
                    // Skip over a quoted attribute value inside the tag. If the opening quote is
                    // missing, this runs to the start of the text and the outer loop then exits.
                    char quote = lastChar;
                    while (charIndex > 0 && (lastChar = text.charAt(--charIndex)) != quote) {}
                }
            }
            // The original '>' wasn't the end of a tag after all.
            charIndex = initialCharIndex;
            lastChar = '>';
            return Character.DIRECTIONALITY_OTHER_NEUTRALS;
        }

        /**
         * Advances charIndex forward through an HTML character entity (after the opening '&amp;'
         * has already been read) and returns Character.DIRECTIONALITY_WHITESPACE. It would be
         * best to figure out the actual character and return its dirtype, but this is good enough.
         * <p>
         * Unlike skipTagForward(), there is no rollback: if no ';' follows, charIndex ends up at
         * length and Character.DIRECTIONALITY_WHITESPACE is still returned.
         */
        private byte skipEntityForward() {
            while (charIndex < length && (lastChar = text.charAt(charIndex++)) != ';') {}
            return Character.DIRECTIONALITY_WHITESPACE;
        }

        /**
         * Advances charIndex backward through an HTML character entity (after the closing ';'
         * has already been read) and returns Character.DIRECTIONALITY_WHITESPACE. It would be best
         * to figure out the actual character and return its dirtype, but this is good enough.
         * If there is no matching '&amp;', restores charIndex to its value on entry and returns
         * Character.DIRECTIONALITY_OTHER_NEUTRALS (for the ';' that did not end an entity after
         * all). Nevertheless, the running time for calling skipEntityBackward() in a loop remains
         * linear in the size of the text, even for a text like ";;;;;;;", because
         * skipEntityBackward() also stops looking for a matching '&amp;' when it encounters
         * another ';'.
         */
        private byte skipEntityBackward() {
            int initialCharIndex = charIndex;
            while (charIndex > 0) {
                lastChar = text.charAt(--charIndex);
                if (lastChar == '&') {
                    // The start of the entity.
                    return Character.DIRECTIONALITY_WHITESPACE;
                }
                if (lastChar == ';') {
                    // Another ';' before any '&': give up so repeated calls stay linear overall.
                    break;
                }
            }
            // The original ';' wasn't the end of an entity after all.
            charIndex = initialCharIndex;
            lastChar = ';';
            return Character.DIRECTIONALITY_OTHER_NEUTRALS;
        }
    }
}