16f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/* 26f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org****************************************************************************** 36f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* 46f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* Copyright (C) 2007-2012, International Business Machines 56f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* Corporation and others. All Rights Reserved. 66f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* 76f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org****************************************************************************** 86f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* file name: bmpset.cpp 96f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* encoding: US-ASCII 106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* tab size: 8 (not used) 116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* indentation:4 126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* 136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* created on: 2007jan29 146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* created by: Markus W. Scherer 156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org*/ 166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/utypes.h" 186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/uniset.h" 196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/utf8.h" 206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/utf16.h" 216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "cmemory.h" 226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "bmpset.h" 236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "uassert.h" 246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgU_NAMESPACE_BEGIN 266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgBMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) : 286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org list(parentList), listLength(parentListLength) { 296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org uprv_memset(asciiBytes, 0, sizeof(asciiBytes)); 306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org uprv_memset(table7FF, 0, sizeof(table7FF)); 316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org uprv_memset(bmpBlockBits, 0, sizeof(bmpBlockBits)); 326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org /* 346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Set the list indexes for binary searches for 356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * U+0800, U+1000, U+2000, .., U+F000, U+10000. 366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * U+0800 is the first 3-byte-UTF-8 code point. Lower code points are 376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * looked up in the bit tables. 386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * The last pair of indexes is for finding supplementary code points. 396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org list4kStarts[0]=findCodePoint(0x800, 0, listLength-1); 416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t i; 426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for(i=1; i<=0x10; ++i) { 436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org list4kStarts[i]=findCodePoint(i<<12, list4kStarts[i-1], listLength-1); 446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org list4kStarts[0x11]=listLength-1; 466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org initBits(); 486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org overrideIllegal(); 496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgBMPSet::BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength) : 526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org list(newParentList), listLength(newParentListLength) { 536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org uprv_memcpy(asciiBytes, otherBMPSet.asciiBytes, sizeof(asciiBytes)); 546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org uprv_memcpy(table7FF, otherBMPSet.table7FF, sizeof(table7FF)); 556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org uprv_memcpy(bmpBlockBits, otherBMPSet.bmpBlockBits, sizeof(bmpBlockBits)); 566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org uprv_memcpy(list4kStarts, otherBMPSet.list4kStarts, sizeof(list4kStarts)); 576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgBMPSet::~BMPSet() { 606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/* 636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Set bits in a bit rectangle in "vertical" bit organization. 646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * start<limit<=0x800 656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgstatic void set32x64Bits(uint32_t table[64], int32_t start, int32_t limit) { 676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org U_ASSERT(start<limit); 686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org U_ASSERT(limit<=0x800); 696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t lead=start>>6; // Named for UTF-8 2-byte lead byte with upper 5 bits. 716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t trail=start&0x3f; // Named for UTF-8 2-byte trail byte with lower 6 bits. 726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Set one bit indicating an all-one block. 746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org uint32_t bits=(uint32_t)1<<lead; 756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if((start+1)==limit) { // Single-character shortcut. 766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org table[trail]|=bits; 776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return; 786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t limitLead=limit>>6; 816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t limitTrail=limit&0x3f; 826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(lead==limitLead) { 846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Partial vertical bit column. 856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org while(trail<limitTrail) { 866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org table[trail++]|=bits; 876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { 896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Partial vertical bit column, 906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // followed by a bit rectangle, 916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // followed by another partial vertical bit column. 926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(trail>0) { 936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org do { 946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org table[trail++]|=bits; 956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } while(trail<64); 966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ++lead; 976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(lead<limitLead) { 996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org bits=~((1<<lead)-1); 1006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(limitLead<0x20) { 1016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org bits&=(1<<limitLead)-1; 1026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for(trail=0; trail<64; ++trail) { 1046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org table[trail]|=bits; 1056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // limit<=0x800. If limit==0x800 then limitLead=32 and limitTrail=0. 1086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // In that case, bits=1<<limitLead is undefined but the bits value 1096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // is not used because trail<limitTrail is already false. 1106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org bits=(uint32_t)1<<((limitLead == 0x20) ? (limitLead - 1) : limitLead); 1116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for(trail=0; trail<limitTrail; ++trail) { 1126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org table[trail]|=bits; 1136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 1166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgvoid BMPSet::initBits() { 1186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UChar32 start, limit; 1196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t listIndex=0; 1206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Set asciiBytes[]. 1226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org do { 1236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org start=list[listIndex++]; 1246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(listIndex<listLength) { 1256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org limit=list[listIndex++]; 1266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { 1276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org limit=0x110000; 1286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(start>=0x80) { 1306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 1316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org do { 1336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org asciiBytes[start++]=1; 1346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } while(start<limit && start<0x80); 1356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } while(limit<=0x80); 1366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Set table7FF[]. 1386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org while(start<0x800) { 1396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org set32x64Bits(table7FF, start, limit<=0x800 ? limit : 0x800); 1406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(limit>0x800) { 1416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org start=0x800; 1426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 1436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org start=list[listIndex++]; 1466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(listIndex<listLength) { 1476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org limit=list[listIndex++]; 1486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { 1496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org limit=0x110000; 1506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Set bmpBlockBits[]. 1546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t minStart=0x800; 1556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org while(start<0x10000) { 1566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(limit>0x10000) { 1576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org limit=0x10000; 1586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(start<minStart) { 1616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org start=minStart; 1626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(start<limit) { // Else: Another range entirely in a known mixed-value block. 1646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(start&0x3f) { 1656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Mixed-value block of 64 code points. 1666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org start>>=6; 1676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org bmpBlockBits[start&0x3f]|=0x10001<<(start>>6); 1686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org start=(start+1)<<6; // Round up to the next block boundary. 1696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org minStart=start; // Ignore further ranges in this block. 1706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(start<limit) { 1726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(start<(limit&~0x3f)) { 1736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Multiple all-ones blocks of 64 code points each. 1746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org set32x64Bits(bmpBlockBits, start>>6, limit>>6); 1756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(limit&0x3f) { 1786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Mixed-value block of 64 code points. 1796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org limit>>=6; 1806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org bmpBlockBits[limit&0x3f]|=0x10001<<(limit>>6); 1816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org limit=(limit+1)<<6; // Round up to the next block boundary. 1826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org minStart=limit; // Ignore further ranges in this block. 1836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(limit==0x10000) { 1886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 1896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org start=list[listIndex++]; 1926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(listIndex<listLength) { 1936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org limit=list[listIndex++]; 1946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { 1956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org limit=0x110000; 1966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 1996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/* 2016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Override some bits and bytes to the result of contains(FFFD) 2026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * for faster validity checking at runtime. 2036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * No need to set 0 values where they were reset to 0 in the constructor 2046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * and not modified by initBits(). 2056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * (asciiBytes[] trail bytes, table7FF[] 0..7F, bmpBlockBits[] 0..7FF) 2066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Need to set 0 values for surrogates D800..DFFF. 2076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 2086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgvoid BMPSet::overrideIllegal() { 2096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org uint32_t bits, mask; 2106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t i; 2116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(containsSlow(0xfffd, list4kStarts[0xf], list4kStarts[0x10])) { 2136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // contains(FFFD)==TRUE 2146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for(i=0x80; i<0xc0; ++i) { 2156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org asciiBytes[i]=1; 2166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org bits=3; // Lead bytes 0xC0 and 0xC1. 2196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for(i=0; i<64; ++i) { 2206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org table7FF[i]|=bits; 2216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org bits=1; // Lead byte 0xE0. 2246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for(i=0; i<32; ++i) { // First half of 4k block. 2256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org bmpBlockBits[i]|=bits; 2266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org mask=~(0x10001<<0xd); // Lead byte 0xED. 2296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org bits=1<<0xd; 2306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for(i=32; i<64; ++i) { // Second half of 4k block. 2316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org bmpBlockBits[i]=(bmpBlockBits[i]&mask)|bits; 2326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { 2346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // contains(FFFD)==FALSE 2356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org mask=~(0x10001<<0xd); // Lead byte 0xED. 2366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for(i=32; i<64; ++i) { // Second half of 4k block. 2376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org bmpBlockBits[i]&=mask; 2386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 2416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgint32_t BMPSet::findCodePoint(UChar32 c, int32_t lo, int32_t hi) const { 2436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org /* Examples: 2446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org findCodePoint(c) 2456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org set list[] c=0 1 3 4 7 8 2466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org === ============== =========== 2476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org [] [110000] 0 0 0 0 0 0 2486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org [\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2 2496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org [\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2 2506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org [:Any:] [0, 110000] 1 1 1 1 1 1 2516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 2526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Return the smallest i such that c < list[i]. Assume 2546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // list[len - 1] == HIGH and that c is legal (0..HIGH-1). 2556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (c < list[lo]) 2566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return lo; 2576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // High runner test. c is often after the last range, so an 2586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // initial check for this condition pays off. 2596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (lo >= hi || c >= list[hi-1]) 2606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return hi; 2616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // invariant: c >= list[lo] 2626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // invariant: c < list[hi] 2636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for (;;) { 2646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t i = (lo + hi) >> 1; 2656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (i == lo) { 2666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; // Found! 2676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else if (c < list[i]) { 2686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org hi = i; 2696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { 2706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org lo = i; 2716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return hi; 2746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 2756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgUBool 2776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgBMPSet::contains(UChar32 c) const { 2786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if((uint32_t)c<=0x7f) { 2796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return (UBool)asciiBytes[c]; 2806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else if((uint32_t)c<=0x7ff) { 2816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return (UBool)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0); 2826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else if((uint32_t)c<0xd800 || (c>=0xe000 && c<=0xffff)) { 2836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int lead=c>>12; 2846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001; 2856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(twoBits<=1) { 2866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // All 64 code points with the same bits 15..6 2876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // are either in the set or not. 2886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return (UBool)twoBits; 2896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { 2906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Look up the code point in its 4k block of code points. 2916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]); 2926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else if((uint32_t)c<=0x10ffff) { 2946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // surrogate or supplementary code point 2956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return containsSlow(c, list4kStarts[0xd], list4kStarts[0x11]); 2966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { 2976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Out-of-range code points get FALSE, consistent with long-standing 2986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // behavior of UnicodeSet::contains(c). 2996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return FALSE; 3006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 3026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/* 3046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Check for sufficient length for trail unit for each surrogate pair. 3056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Handle single surrogates as surrogate code points as usual in ICU. 3066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 3076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgconst UChar * 3086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgBMPSet::span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const { 3096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UChar c, c2; 3106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(spanCondition) { 3126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // span 3136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org do { 3146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org c=*s; 3156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(c<=0x7f) { 3166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(!asciiBytes[c]) { 3176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 3186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else if(c<=0x7ff) { 3206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) { 3216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 3226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else if(c<0xd800 || c>=0xe000) { 3246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int lead=c>>12; 3256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001; 3266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(twoBits<=1) { 3276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // All 64 code points with the same bits 15..6 3286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // are either in the set or not. 3296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(twoBits==0) { 3306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 3316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { 3336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Look up the code point in its 4k block of code points. 3346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) { 3356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 3366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) { 3396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // surrogate code point 3406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) { 3416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 3426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { 3446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // surrogate pair 3456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(!containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) { 3466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 3476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ++s; 3496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } while(++s<limit); 3516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { 3526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // span not 3536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org do { 3546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org c=*s; 3556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(c<=0x7f) { 3566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(asciiBytes[c]) { 3576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 3586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else if(c<=0x7ff) { 3606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) { 3616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 3626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else if(c<0xd800 || c>=0xe000) { 3646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int lead=c>>12; 3656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001; 3666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(twoBits<=1) { 3676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // All 64 code points with the same bits 15..6 3686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // are either in the set or not. 3696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(twoBits!=0) { 3706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 3716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { 3736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Look up the code point in its 4k block of code points. 3746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) { 3756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 3766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) { 3796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // surrogate code point 3806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) { 3816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 3826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { 3846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // surrogate pair 3856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) { 3866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 3876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ++s; 3896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } while(++s<limit); 3916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return s; 3936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 3946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/* Symmetrical with span(). */ 3966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgconst UChar * 3976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgBMPSet::spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const { 3986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UChar c, c2; 3996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(spanCondition) { 4016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // span 4026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for(;;) { 4036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org c=*(--limit); 4046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(c<=0x7f) { 4056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(!asciiBytes[c]) { 4066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 4076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else if(c<=0x7ff) { 4096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) { 4106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 4116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else if(c<0xd800 || c>=0xe000) { 4136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int lead=c>>12; 4146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001; 4156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(twoBits<=1) { 4166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // All 64 code points with the same bits 15..6 4176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // are either in the set or not. 4186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(twoBits==0) { 4196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 4206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { 4226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Look up the code point in its 4k block of code points. 4236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) { 4246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 4256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) { 4286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // surrogate code point 4296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) { 4306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 4316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { 4336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // surrogate pair 4346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(!containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) { 4356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 4366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org --limit; 4386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(s==limit) { 4406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return s; 4416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { 4446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // span not 4456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for(;;) { 4466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org c=*(--limit); 4476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(c<=0x7f) { 4486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(asciiBytes[c]) { 4496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 4506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else if(c<=0x7ff) { 4526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) { 4536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 4546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else if(c<0xd800 || c>=0xe000) { 4566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int lead=c>>12; 4576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001; 4586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(twoBits<=1) { 4596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // All 64 code points with the same bits 15..6 4606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // are either in the set or not. 4616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(twoBits!=0) { 4626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 4636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { 4656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Look up the code point in its 4k block of code points. 4666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) { 4676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 4686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) { 4716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // surrogate code point 4726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) { 4736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 4746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { 4766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // surrogate pair 4776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) { 4786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 4796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org --limit; 4816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(s==limit) { 4836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return s; 4846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return limit+1; 4886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 4896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/* 4916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Precheck for sufficient trail bytes at end of string only once per span. 4926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Check validity. 4936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 4946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgconst uint8_t * 4956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgBMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const { 4966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const uint8_t *limit=s+length; 4976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org uint8_t b=*s; 4986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if((int8_t)b>=0) { 4996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Initial all-ASCII span. 5006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(spanCondition) { 5016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org do { 5026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(!asciiBytes[b] || ++s==limit) { 5036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return s; 5046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org b=*s; 5066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } while((int8_t)b>=0); 5076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { 5086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org do { 5096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(asciiBytes[b] || ++s==limit) { 5106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return s; 5116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org b=*s; 5136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } while((int8_t)b>=0); 5146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org length=(int32_t)(limit-s); 5166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(spanCondition!=USET_SPAN_NOT_CONTAINED) { 5196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values. 5206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const uint8_t *limit0=limit; 5236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org /* 5256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Make sure that the last 1/2/3/4-byte sequence before limit is complete 5266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * or runs into a lead byte. 5276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * In the span loop compare s with limit only once 5286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * per multi-byte character. 5296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * 5306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Give a trailing illegal sequence the same value as the result of contains(FFFD), 5316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * including it if that is part of the span, otherwise set limit0 to before 5326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * the truncated sequence. 5336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 5346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org b=*(limit-1); 5356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if((int8_t)b<0) { 5366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // b>=0x80: lead or trail byte 5376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(b<0xc0) { 5386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // single trail byte, check for preceding 3- or 4-byte lead byte 5396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(length>=2 && (b=*(limit-2))>=0xe0) { 5406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org limit-=2; 5416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(asciiBytes[0x80]!=spanCondition) { 5426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org limit0=limit; 5436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else if(b<0xc0 && b>=0x80 && length>=3 && (b=*(limit-3))>=0xf0) { 5456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // 4-byte lead byte with only two trail bytes 5466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org limit-=3; 5476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(asciiBytes[0x80]!=spanCondition) { 5486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org limit0=limit; 5496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { 5526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // lead byte with no trail bytes 5536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org --limit; 5546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(asciiBytes[0x80]!=spanCondition) { 5556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org limit0=limit; 5566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org uint8_t t1, t2, t3; 5616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org while(s<limit) { 5636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org b=*s; 5646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(b<0xc0) { 5656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // ASCII; or trail bytes with the result of contains(FFFD). 5666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(spanCondition) { 5676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org do { 5686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(!asciiBytes[b]) { 5696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return s; 5706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else if(++s==limit) { 5716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return limit0; 5726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org b=*s; 5746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } while(b<0xc0); 5756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { 5766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org do { 5776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(asciiBytes[b]) { 5786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return s; 5796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else if(++s==limit) { 5806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return limit0; 5816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org b=*s; 5836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } while(b<0xc0); 5846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ++s; // Advance past the lead byte. 5876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(b>=0xe0) { 5886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(b<0xf0) { 5896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if( /* handle U+0000..U+FFFF inline */ 5906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org (t1=(uint8_t)(s[0]-0x80)) <= 0x3f && 5916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org (t2=(uint8_t)(s[1]-0x80)) <= 0x3f 5926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ) { 5936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org b&=0xf; 5946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org uint32_t twoBits=(bmpBlockBits[t1]>>b)&0x10001; 5956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(twoBits<=1) { 5966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // All 64 code points with this lead byte and middle trail byte 5976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // are either in the set or not. 5986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(twoBits!=(uint32_t)spanCondition) { 5996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return s-1; 6006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 6016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { 6026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Look up the code point in its 4k block of code points. 6036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UChar32 c=(b<<12)|(t1<<6)|t2; 6046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(containsSlow(c, list4kStarts[b], list4kStarts[b+1]) != spanCondition) { 6056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return s-1; 6066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 6076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 6086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org s+=2; 6096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org continue; 6106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 6116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else if( /* handle U+10000..U+10FFFF inline */ 6126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org (t1=(uint8_t)(s[0]-0x80)) <= 0x3f && 6136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org (t2=(uint8_t)(s[1]-0x80)) <= 0x3f && 6146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org (t3=(uint8_t)(s[2]-0x80)) <= 0x3f 6156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ) { 6166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Give an illegal sequence the same value as the result of contains(FFFD). 6176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UChar32 c=((UChar32)(b-0xf0)<<18)|((UChar32)t1<<12)|(t2<<6)|t3; 6186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if( ( (0x10000<=c && c<=0x10ffff) ? 6196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) : 6206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org asciiBytes[0x80] 6216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ) != spanCondition 6226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ) { 6236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return s-1; 6246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 6256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org s+=3; 6266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org continue; 6276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 6286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else /* 0xc0<=b<0xe0 */ { 6296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if( /* handle U+0000..U+07FF inline */ 6306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org (t1=(uint8_t)(*s-0x80)) <= 0x3f 6316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ) { 6326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if((USetSpanCondition)((table7FF[t1]&((uint32_t)1<<(b&0x1f)))!=0) != spanCondition) { 6336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return s-1; 6346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 6356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ++s; 6366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org continue; 6376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 6386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 6396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 6406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Give an illegal sequence the same value as the result of contains(FFFD). 6416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Handle each byte of an illegal sequence separately to simplify the code; 6426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // no need to optimize error handling. 6436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(asciiBytes[0x80]!=spanCondition) { 6446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return s-1; 6456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 6466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 6476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 6486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return limit0; 6496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 6506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 6516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/* 6526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * While going backwards through UTF-8 optimize only for ASCII. 6536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Unlike UTF-16, UTF-8 is not forward-backward symmetrical, that is, it is not 6546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * possible to tell from the last byte in a multi-byte sequence how many 6556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * preceding bytes there should be. Therefore, going backwards through UTF-8 6566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * is much harder than going forward. 6576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 6586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgint32_t 6596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgBMPSet::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const { 6606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(spanCondition!=USET_SPAN_NOT_CONTAINED) { 6616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values. 6626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 6636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 6646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org uint8_t b; 6656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 6666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org do { 6676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org b=s[--length]; 6686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if((int8_t)b>=0) { 6696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // ASCII sub-span 6706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(spanCondition) { 6716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org do { 6726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(!asciiBytes[b]) { 6736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return length+1; 6746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else if(length==0) { 6756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return 0; 6766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 6776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org b=s[--length]; 6786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } while((int8_t)b>=0); 6796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { 6806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org do { 6816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(asciiBytes[b]) { 6826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return length+1; 6836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else if(length==0) { 6846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return 0; 6856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 6866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org b=s[--length]; 6876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } while((int8_t)b>=0); 6886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 6896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 6906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 6916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t prev=length; 6926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UChar32 c; 6936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // trail byte: collect a multi-byte character 6946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // (or lead byte in last-trail position) 6956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org c=utf8_prevCharSafeBody(s, 0, &length, b, -3); 6966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // c is a valid code point, not ASCII, not a surrogate 6976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(c<=0x7ff) { 6986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if((USetSpanCondition)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) != spanCondition) { 6996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return prev+1; 7006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 7016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else if(c<=0xffff) { 7026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int lead=c>>12; 7036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001; 7046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(twoBits<=1) { 7056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // All 64 code points with the same bits 15..6 7066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // are either in the set or not. 7076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(twoBits!=(uint32_t)spanCondition) { 7086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return prev+1; 7096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 7106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { 7116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Look up the code point in its 4k block of code points. 7126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]) != spanCondition) { 7136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return prev+1; 7146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 7156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 7166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { 7176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) != spanCondition) { 7186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return prev+1; 7196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 7206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 7216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } while(length>0); 7226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return 0; 7236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 7246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 7256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgU_NAMESPACE_END 726