1/* GENERATED SOURCE. DO NOT MODIFY. */ 2// © 2016 and later: Unicode, Inc. and others. 3// License & terms of use: http://www.unicode.org/copyright.html#License 4/* 5 ****************************************************************************** 6 * 7 * Copyright (C) 2009-2015, International Business Machines 8 * Corporation and others. All Rights Reserved. 9 * 10 ****************************************************************************** 11 */ 12 13package android.icu.impl; 14 15import android.icu.text.UnicodeSet.SpanCondition; 16import android.icu.util.OutputInt; 17 18/** 19 * Helper class for frozen UnicodeSets, implements contains() and span() optimized for BMP code points. 20 * 21 * Latin-1: Look up bytes. 22 * 2-byte characters: Bits organized vertically. 23 * 3-byte characters: Use zero/one/mixed data per 64-block in U+0000..U+FFFF, with mixed for illegal ranges. 24 * Supplementary characters: Call contains() on the parent set. 25 * @hide Only a subset of ICU is exposed in Android 26 */ 27public final class BMPSet { 28 public static int U16_SURROGATE_OFFSET = ((0xd800 << 10) + 0xdc00 - 0x10000); 29 30 /** 31 * One boolean ('true' or 'false') per Latin-1 character. 32 */ 33 private boolean[] latin1Contains; 34 35 /** 36 * One bit per code point from U+0000..U+07FF. The bits are organized vertically; consecutive code points 37 * correspond to the same bit positions in consecutive table words. With code point parts lead=c{10..6} 38 * trail=c{5..0} it is set.contains(c)==(table7FF[trail] bit lead) 39 * 40 * Bits for 0..7F (non-shortest forms) are set to the result of contains(FFFD) for faster validity checking at 41 * runtime. 42 */ 43 private int[] table7FF; 44 45 /** 46 * One bit per 64 BMP code points. The bits are organized vertically; consecutive 64-code point blocks 47 * correspond to the same bit position in consecutive table words. With code point parts lead=c{15..12} 48 * t1=c{11..6} test bits (lead+16) and lead in bmpBlockBits[t1]. If the upper bit is 0, then the lower bit 49 * indicates if contains(c) for all code points in the 64-block. If the upper bit is 1, then the block is mixed 50 * and set.contains(c) must be called. 51 * 52 * Bits for 0..7FF (non-shortest forms) and D800..DFFF are set to the result of contains(FFFD) for faster 53 * validity checking at runtime. 54 */ 55 private int[] bmpBlockBits; 56 57 /** 58 * Inversion list indexes for restricted binary searches in findCodePoint(), from findCodePoint(U+0800, U+1000, 59 * U+2000, .., U+F000, U+10000). U+0800 is the first 3-byte-UTF-8 code point. Code points below U+0800 are 60 * always looked up in the bit tables. The last pair of indexes is for finding supplementary code points. 61 */ 62 private int[] list4kStarts; 63 64 /** 65 * The inversion list of the parent set, for the slower contains() implementation for mixed BMP blocks and for 66 * supplementary code points. The list is terminated with list[listLength-1]=0x110000. 67 */ 68 private final int[] list; 69 private final int listLength; // length used; list may be longer to minimize reallocs 70 71 public BMPSet(final int[] parentList, int parentListLength) { 72 list = parentList; 73 listLength = parentListLength; 74 latin1Contains = new boolean[0x100]; 75 table7FF = new int[64]; 76 bmpBlockBits = new int[64]; 77 list4kStarts = new int[18]; 78 79 /* 80 * Set the list indexes for binary searches for U+0800, U+1000, U+2000, .., U+F000, U+10000. U+0800 is the 81 * first 3-byte-UTF-8 code point. Lower code points are looked up in the bit tables. The last pair of 82 * indexes is for finding supplementary code points. 83 */ 84 list4kStarts[0] = findCodePoint(0x800, 0, listLength - 1); 85 int i; 86 for (i = 1; i <= 0x10; ++i) { 87 list4kStarts[i] = findCodePoint(i << 12, list4kStarts[i - 1], listLength - 1); 88 } 89 list4kStarts[0x11] = listLength - 1; 90 91 initBits(); 92 } 93 94 public BMPSet(final BMPSet otherBMPSet, final int[] newParentList, int newParentListLength) { 95 list = newParentList; 96 listLength = newParentListLength; 97 latin1Contains = otherBMPSet.latin1Contains.clone(); 98 table7FF = otherBMPSet.table7FF.clone(); 99 bmpBlockBits = otherBMPSet.bmpBlockBits.clone(); 100 list4kStarts = otherBMPSet.list4kStarts.clone(); 101 } 102 103 public boolean contains(int c) { 104 if (c <= 0xff) { 105 return (latin1Contains[c]); 106 } else if (c <= 0x7ff) { 107 return ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0); 108 } else if (c < 0xd800 || (c >= 0xe000 && c <= 0xffff)) { 109 int lead = c >> 12; 110 int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001; 111 if (twoBits <= 1) { 112 // All 64 code points with the same bits 15..6 113 // are either in the set or not. 114 return (0 != twoBits); 115 } else { 116 // Look up the code point in its 4k block of code points. 117 return containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1]); 118 } 119 } else if (c <= 0x10ffff) { 120 // surrogate or supplementary code point 121 return containsSlow(c, list4kStarts[0xd], list4kStarts[0x11]); 122 } else { 123 // Out-of-range code points get false, consistent with long-standing 124 // behavior of UnicodeSet.contains(c). 125 return false; 126 } 127 } 128 129 /** 130 * Span the initial substring for which each character c has spanCondition==contains(c). It must be 131 * spanCondition==0 or 1. 132 * 133 * @param start The start index 134 * @param outCount If not null: Receives the number of code points in the span. 135 * @return the limit (exclusive end) of the span 136 * 137 * NOTE: to reduce the overhead of function call to contains(c), it is manually inlined here. Check for 138 * sufficient length for trail unit for each surrogate pair. Handle single surrogates as surrogate code points 139 * as usual in ICU. 140 */ 141 public final int span(CharSequence s, int start, SpanCondition spanCondition, 142 OutputInt outCount) { 143 char c, c2; 144 int i = start; 145 int limit = s.length(); 146 int numSupplementary = 0; 147 if (SpanCondition.NOT_CONTAINED != spanCondition) { 148 // span 149 while (i < limit) { 150 c = s.charAt(i); 151 if (c <= 0xff) { 152 if (!latin1Contains[c]) { 153 break; 154 } 155 } else if (c <= 0x7ff) { 156 if ((table7FF[c & 0x3f] & (1 << (c >> 6))) == 0) { 157 break; 158 } 159 } else if (c < 0xd800 || 160 c >= 0xdc00 || (i + 1) == limit || (c2 = s.charAt(i + 1)) < 0xdc00 || c2 >= 0xe000) { 161 int lead = c >> 12; 162 int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001; 163 if (twoBits <= 1) { 164 // All 64 code points with the same bits 15..6 165 // are either in the set or not. 166 if (twoBits == 0) { 167 break; 168 } 169 } else { 170 // Look up the code point in its 4k block of code points. 171 if (!containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) { 172 break; 173 } 174 } 175 } else { 176 // surrogate pair 177 int supplementary = Character.toCodePoint(c, c2); 178 if (!containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) { 179 break; 180 } 181 ++numSupplementary; 182 ++i; 183 } 184 ++i; 185 } 186 } else { 187 // span not 188 while (i < limit) { 189 c = s.charAt(i); 190 if (c <= 0xff) { 191 if (latin1Contains[c]) { 192 break; 193 } 194 } else if (c <= 0x7ff) { 195 if ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0) { 196 break; 197 } 198 } else if (c < 0xd800 || 199 c >= 0xdc00 || (i + 1) == limit || (c2 = s.charAt(i + 1)) < 0xdc00 || c2 >= 0xe000) { 200 int lead = c >> 12; 201 int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001; 202 if (twoBits <= 1) { 203 // All 64 code points with the same bits 15..6 204 // are either in the set or not. 205 if (twoBits != 0) { 206 break; 207 } 208 } else { 209 // Look up the code point in its 4k block of code points. 210 if (containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) { 211 break; 212 } 213 } 214 } else { 215 // surrogate pair 216 int supplementary = Character.toCodePoint(c, c2); 217 if (containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) { 218 break; 219 } 220 ++numSupplementary; 221 ++i; 222 } 223 ++i; 224 } 225 } 226 if (outCount != null) { 227 int spanLength = i - start; 228 outCount.value = spanLength - numSupplementary; // number of code points 229 } 230 return i; 231 } 232 233 /** 234 * Symmetrical with span(). 235 * Span the trailing substring for which each character c has spanCondition==contains(c). It must be s.length >= 236 * limit and spanCondition==0 or 1. 237 * 238 * @return The string index which starts the span (i.e. inclusive). 239 */ 240 public final int spanBack(CharSequence s, int limit, SpanCondition spanCondition) { 241 char c, c2; 242 243 if (SpanCondition.NOT_CONTAINED != spanCondition) { 244 // span 245 for (;;) { 246 c = s.charAt(--limit); 247 if (c <= 0xff) { 248 if (!latin1Contains[c]) { 249 break; 250 } 251 } else if (c <= 0x7ff) { 252 if ((table7FF[c & 0x3f] & (1 << (c >> 6))) == 0) { 253 break; 254 } 255 } else if (c < 0xd800 || 256 c < 0xdc00 || 0 == limit || (c2 = s.charAt(limit - 1)) < 0xd800 || c2 >= 0xdc00) { 257 int lead = c >> 12; 258 int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001; 259 if (twoBits <= 1) { 260 // All 64 code points with the same bits 15..6 261 // are either in the set or not. 262 if (twoBits == 0) { 263 break; 264 } 265 } else { 266 // Look up the code point in its 4k block of code points. 267 if (!containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) { 268 break; 269 } 270 } 271 } else { 272 // surrogate pair 273 int supplementary = Character.toCodePoint(c2, c); 274 if (!containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) { 275 break; 276 } 277 --limit; 278 } 279 if (0 == limit) { 280 return 0; 281 } 282 } 283 } else { 284 // span not 285 for (;;) { 286 c = s.charAt(--limit); 287 if (c <= 0xff) { 288 if (latin1Contains[c]) { 289 break; 290 } 291 } else if (c <= 0x7ff) { 292 if ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0) { 293 break; 294 } 295 } else if (c < 0xd800 || 296 c < 0xdc00 || 0 == limit || (c2 = s.charAt(limit - 1)) < 0xd800 || c2 >= 0xdc00) { 297 int lead = c >> 12; 298 int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001; 299 if (twoBits <= 1) { 300 // All 64 code points with the same bits 15..6 301 // are either in the set or not. 302 if (twoBits != 0) { 303 break; 304 } 305 } else { 306 // Look up the code point in its 4k block of code points. 307 if (containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) { 308 break; 309 } 310 } 311 } else { 312 // surrogate pair 313 int supplementary = Character.toCodePoint(c2, c); 314 if (containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) { 315 break; 316 } 317 --limit; 318 } 319 if (0 == limit) { 320 return 0; 321 } 322 } 323 } 324 return limit + 1; 325 } 326 327 /** 328 * Set bits in a bit rectangle in "vertical" bit organization. start<limit<=0x800 329 */ 330 private static void set32x64Bits(int[] table, int start, int limit) { 331 assert (64 == table.length); 332 int lead = start >> 6; // Named for UTF-8 2-byte lead byte with upper 5 bits. 333 int trail = start & 0x3f; // Named for UTF-8 2-byte trail byte with lower 6 bits. 334 335 // Set one bit indicating an all-one block. 336 int bits = 1 << lead; 337 if ((start + 1) == limit) { // Single-character shortcut. 338 table[trail] |= bits; 339 return; 340 } 341 342 int limitLead = limit >> 6; 343 int limitTrail = limit & 0x3f; 344 345 if (lead == limitLead) { 346 // Partial vertical bit column. 347 while (trail < limitTrail) { 348 table[trail++] |= bits; 349 } 350 } else { 351 // Partial vertical bit column, 352 // followed by a bit rectangle, 353 // followed by another partial vertical bit column. 354 if (trail > 0) { 355 do { 356 table[trail++] |= bits; 357 } while (trail < 64); 358 ++lead; 359 } 360 if (lead < limitLead) { 361 bits = ~((1 << lead) - 1); 362 if (limitLead < 0x20) { 363 bits &= (1 << limitLead) - 1; 364 } 365 for (trail = 0; trail < 64; ++trail) { 366 table[trail] |= bits; 367 } 368 } 369 // limit<=0x800. If limit==0x800 then limitLead=32 and limitTrail=0. 370 // In that case, bits=1<<limitLead == 1<<0 == 1 371 // (because Java << uses only the lower 5 bits of the shift operand) 372 // but the bits value is not used because trail<limitTrail is already false. 373 bits = 1 << limitLead; 374 for (trail = 0; trail < limitTrail; ++trail) { 375 table[trail] |= bits; 376 } 377 } 378 } 379 380 private void initBits() { 381 int start, limit; 382 int listIndex = 0; 383 384 // Set latin1Contains[]. 385 do { 386 start = list[listIndex++]; 387 if (listIndex < listLength) { 388 limit = list[listIndex++]; 389 } else { 390 limit = 0x110000; 391 } 392 if (start >= 0x100) { 393 break; 394 } 395 do { 396 latin1Contains[start++] = true; 397 } while (start < limit && start < 0x100); 398 } while (limit <= 0x100); 399 400 // Set table7FF[]. 401 while (start < 0x800) { 402 set32x64Bits(table7FF, start, limit <= 0x800 ? limit : 0x800); 403 if (limit > 0x800) { 404 start = 0x800; 405 break; 406 } 407 408 start = list[listIndex++]; 409 if (listIndex < listLength) { 410 limit = list[listIndex++]; 411 } else { 412 limit = 0x110000; 413 } 414 } 415 416 // Set bmpBlockBits[]. 417 int minStart = 0x800; 418 while (start < 0x10000) { 419 if (limit > 0x10000) { 420 limit = 0x10000; 421 } 422 423 if (start < minStart) { 424 start = minStart; 425 } 426 if (start < limit) { // Else: Another range entirely in a known mixed-value block. 427 if (0 != (start & 0x3f)) { 428 // Mixed-value block of 64 code points. 429 start >>= 6; 430 bmpBlockBits[start & 0x3f] |= 0x10001 << (start >> 6); 431 start = (start + 1) << 6; // Round up to the next block boundary. 432 minStart = start; // Ignore further ranges in this block. 433 } 434 if (start < limit) { 435 if (start < (limit & ~0x3f)) { 436 // Multiple all-ones blocks of 64 code points each. 437 set32x64Bits(bmpBlockBits, start >> 6, limit >> 6); 438 } 439 440 if (0 != (limit & 0x3f)) { 441 // Mixed-value block of 64 code points. 442 limit >>= 6; 443 bmpBlockBits[limit & 0x3f] |= 0x10001 << (limit >> 6); 444 limit = (limit + 1) << 6; // Round up to the next block boundary. 445 minStart = limit; // Ignore further ranges in this block. 446 } 447 } 448 } 449 450 if (limit == 0x10000) { 451 break; 452 } 453 454 start = list[listIndex++]; 455 if (listIndex < listLength) { 456 limit = list[listIndex++]; 457 } else { 458 limit = 0x110000; 459 } 460 } 461 } 462 463 464 /** 465 * Same as UnicodeSet.findCodePoint(int c) except that the binary search is restricted for finding code 466 * points in a certain range. 467 * 468 * For restricting the search for finding in the range start..end, pass in lo=findCodePoint(start) and 469 * hi=findCodePoint(end) with 0<=lo<=hi<len. findCodePoint(c) defaults to lo=0 and hi=len-1. 470 * 471 * @param c 472 * a character in a subrange of MIN_VALUE..MAX_VALUE 473 * @param lo 474 * The lowest index to be returned. 475 * @param hi 476 * The highest index to be returned. 477 * @return the smallest integer i in the range lo..hi, inclusive, such that c < list[i] 478 */ 479 private int findCodePoint(int c, int lo, int hi) { 480 /* Examples: 481 findCodePoint(c) 482 set list[] c=0 1 3 4 7 8 483 === ============== =========== 484 [] [110000] 0 0 0 0 0 0 485 [\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2 486 [\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2 487 [:Any:] [0, 110000] 1 1 1 1 1 1 488 */ 489 490 // Return the smallest i such that c < list[i]. Assume 491 // list[len - 1] == HIGH and that c is legal (0..HIGH-1). 492 if (c < list[lo]) 493 return lo; 494 // High runner test. c is often after the last range, so an 495 // initial check for this condition pays off. 496 if (lo >= hi || c >= list[hi - 1]) 497 return hi; 498 // invariant: c >= list[lo] 499 // invariant: c < list[hi] 500 for (;;) { 501 int i = (lo + hi) >>> 1; 502 if (i == lo) { 503 break; // Found! 504 } else if (c < list[i]) { 505 hi = i; 506 } else { 507 lo = i; 508 } 509 } 510 return hi; 511 } 512 513 private final boolean containsSlow(int c, int lo, int hi) { 514 return (0 != (findCodePoint(c, lo, hi) & 1)); 515 } 516} 517 518