1ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 2ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru****************************************************************************** 3ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* 454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius* Copyright (C) 2007-2012, International Business Machines 5ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* Corporation and others. All Rights Reserved. 6ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* 7ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru****************************************************************************** 8ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* file name: bmpset.cpp 9ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* encoding: US-ASCII 10ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* tab size: 8 (not used) 11ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* indentation:4 12ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* 13ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* created on: 2007jan29 14ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* created by: Markus W. Scherer 15ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*/ 16ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 17ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/utypes.h" 18ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/uniset.h" 19103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius#include "unicode/utf8.h" 20103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius#include "unicode/utf16.h" 21ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "cmemory.h" 22ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "bmpset.h" 23103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius#include "uassert.h" 24ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 25ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_NAMESPACE_BEGIN 26ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 27ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruBMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) : 28ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru list(parentList), listLength(parentListLength) { 29ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uprv_memset(asciiBytes, 0, sizeof(asciiBytes)); 30ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uprv_memset(table7FF, 0, sizeof(table7FF)); 31ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uprv_memset(bmpBlockBits, 0, sizeof(bmpBlockBits)); 32ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 33ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /* 34ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Set the list indexes for binary searches for 35ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * U+0800, U+1000, U+2000, .., U+F000, U+10000. 36ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * U+0800 is the first 3-byte-UTF-8 code point. Lower code points are 37ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * looked up in the bit tables. 38ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * The last pair of indexes is for finding supplementary code points. 39ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 40ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru list4kStarts[0]=findCodePoint(0x800, 0, listLength-1); 41ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t i; 42ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(i=1; i<=0x10; ++i) { 43ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru list4kStarts[i]=findCodePoint(i<<12, list4kStarts[i-1], listLength-1); 44ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 45ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru list4kStarts[0x11]=listLength-1; 46ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 47ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru initBits(); 48ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru overrideIllegal(); 49ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 50ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 51ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruBMPSet::BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength) : 52ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru list(newParentList), listLength(newParentListLength) { 53ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uprv_memcpy(asciiBytes, otherBMPSet.asciiBytes, sizeof(asciiBytes)); 54ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uprv_memcpy(table7FF, otherBMPSet.table7FF, sizeof(table7FF)); 55ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uprv_memcpy(bmpBlockBits, otherBMPSet.bmpBlockBits, sizeof(bmpBlockBits)); 56ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uprv_memcpy(list4kStarts, otherBMPSet.list4kStarts, sizeof(list4kStarts)); 57ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 58ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 59ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruBMPSet::~BMPSet() { 60ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 61ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 62ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 63ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Set bits in a bit rectangle in "vertical" bit organization. 64ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * start<limit<=0x800 65ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 66ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic void set32x64Bits(uint32_t table[64], int32_t start, int32_t limit) { 67103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius U_ASSERT(start<limit); 68103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius U_ASSERT(limit<=0x800); 69103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius 70103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius int32_t lead=start>>6; // Named for UTF-8 2-byte lead byte with upper 5 bits. 71103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius int32_t trail=start&0x3f; // Named for UTF-8 2-byte trail byte with lower 6 bits. 72ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 73ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Set one bit indicating an all-one block. 74ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint32_t bits=(uint32_t)1<<lead; 75ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if((start+1)==limit) { // Single-character shortcut. 76ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru table[trail]|=bits; 77ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return; 78ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 79ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 80ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t limitLead=limit>>6; 81ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t limitTrail=limit&0x3f; 82ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 83ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(lead==limitLead) { 84ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Partial vertical bit column. 85ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru while(trail<limitTrail) { 86ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru table[trail++]|=bits; 87ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 88ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 89ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Partial vertical bit column, 90ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // followed by a bit rectangle, 91ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // followed by another partial vertical bit column. 92ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(trail>0) { 93ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru do { 94ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru table[trail++]|=bits; 95ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } while(trail<64); 96ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ++lead; 97ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 98ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(lead<limitLead) { 99ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru bits=~((1<<lead)-1); 100ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(limitLead<0x20) { 101ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru bits&=(1<<limitLead)-1; 102ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 103ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(trail=0; trail<64; ++trail) { 104ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru table[trail]|=bits; 105ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 106ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 107103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // limit<=0x800. If limit==0x800 then limitLead=32 and limitTrail=0. 108103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // In that case, bits=1<<limitLead is undefined but the bits value 109103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // is not used because trail<limitTrail is already false. 11054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius bits=(uint32_t)1<<((limitLead == 0x20) ? (limitLead - 1) : limitLead); 111ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(trail=0; trail<limitTrail; ++trail) { 112ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru table[trail]|=bits; 113ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 114ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 115ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 116ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 117ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruvoid BMPSet::initBits() { 118ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UChar32 start, limit; 119ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t listIndex=0; 120ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 121ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Set asciiBytes[]. 122ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru do { 123ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru start=list[listIndex++]; 124ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(listIndex<listLength) { 125ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru limit=list[listIndex++]; 126ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 127ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru limit=0x110000; 128ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 129ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(start>=0x80) { 130ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 131ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 132ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru do { 133ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru asciiBytes[start++]=1; 134ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } while(start<limit && start<0x80); 135ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } while(limit<=0x80); 136ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 137ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Set table7FF[]. 138ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru while(start<0x800) { 139ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru set32x64Bits(table7FF, start, limit<=0x800 ? limit : 0x800); 140ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(limit>0x800) { 141ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru start=0x800; 142ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 143ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 144ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 145ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru start=list[listIndex++]; 146ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(listIndex<listLength) { 147ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru limit=list[listIndex++]; 148ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 149ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru limit=0x110000; 150ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 151ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 152ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 153ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Set bmpBlockBits[]. 154ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t minStart=0x800; 155ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru while(start<0x10000) { 156ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(limit>0x10000) { 157ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru limit=0x10000; 158ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 159ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 160ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(start<minStart) { 161ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru start=minStart; 162ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 163ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(start<limit) { // Else: Another range entirely in a known mixed-value block. 164ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(start&0x3f) { 165ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Mixed-value block of 64 code points. 166ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru start>>=6; 167ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru bmpBlockBits[start&0x3f]|=0x10001<<(start>>6); 168ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru start=(start+1)<<6; // Round up to the next block boundary. 169ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru minStart=start; // Ignore further ranges in this block. 170ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 171ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(start<limit) { 172ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(start<(limit&~0x3f)) { 173ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Multiple all-ones blocks of 64 code points each. 174ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru set32x64Bits(bmpBlockBits, start>>6, limit>>6); 175ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 176ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 177ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(limit&0x3f) { 178ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Mixed-value block of 64 code points. 179ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru limit>>=6; 180ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru bmpBlockBits[limit&0x3f]|=0x10001<<(limit>>6); 181ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru limit=(limit+1)<<6; // Round up to the next block boundary. 182ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru minStart=limit; // Ignore further ranges in this block. 183ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 184ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 185ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 186ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 187ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(limit==0x10000) { 188ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 189ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 190ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 191ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru start=list[listIndex++]; 192ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(listIndex<listLength) { 193ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru limit=list[listIndex++]; 194ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 195ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru limit=0x110000; 196ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 197ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 198ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 199ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 200ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 201ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Override some bits and bytes to the result of contains(FFFD) 202ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * for faster validity checking at runtime. 203ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * No need to set 0 values where they were reset to 0 in the constructor 204ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * and not modified by initBits(). 205ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * (asciiBytes[] trail bytes, table7FF[] 0..7F, bmpBlockBits[] 0..7FF) 206ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Need to set 0 values for surrogates D800..DFFF. 207ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 208ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruvoid BMPSet::overrideIllegal() { 209ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint32_t bits, mask; 210ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t i; 211ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 212ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(containsSlow(0xfffd, list4kStarts[0xf], list4kStarts[0x10])) { 213ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // contains(FFFD)==TRUE 214ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(i=0x80; i<0xc0; ++i) { 215ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru asciiBytes[i]=1; 216ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 217ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 218ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru bits=3; // Lead bytes 0xC0 and 0xC1. 219ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(i=0; i<64; ++i) { 220ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru table7FF[i]|=bits; 221ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 222ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 223ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru bits=1; // Lead byte 0xE0. 224ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(i=0; i<32; ++i) { // First half of 4k block. 225ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru bmpBlockBits[i]|=bits; 226ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 227ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 228ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru mask=~(0x10001<<0xd); // Lead byte 0xED. 229ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru bits=1<<0xd; 230ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(i=32; i<64; ++i) { // Second half of 4k block. 231ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru bmpBlockBits[i]=(bmpBlockBits[i]&mask)|bits; 232ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 233ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 234ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // contains(FFFD)==FALSE 235ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru mask=~(0x10001<<0xd); // Lead byte 0xED. 236ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(i=32; i<64; ++i) { // Second half of 4k block. 237ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru bmpBlockBits[i]&=mask; 238ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 239ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 240ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 241ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 242ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t BMPSet::findCodePoint(UChar32 c, int32_t lo, int32_t hi) const { 243ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /* Examples: 244ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru findCodePoint(c) 245ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru set list[] c=0 1 3 4 7 8 246ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru === ============== =========== 247ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru [] [110000] 0 0 0 0 0 0 248ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru [\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2 249ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru [\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2 250ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru [:Any:] [0, 110000] 1 1 1 1 1 1 251ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 252ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 253ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Return the smallest i such that c < list[i]. Assume 254ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // list[len - 1] == HIGH and that c is legal (0..HIGH-1). 255ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (c < list[lo]) 256ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return lo; 257ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // High runner test. c is often after the last range, so an 258ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // initial check for this condition pays off. 259ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (lo >= hi || c >= list[hi-1]) 260ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return hi; 261ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // invariant: c >= list[lo] 262ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // invariant: c < list[hi] 263ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for (;;) { 264ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t i = (lo + hi) >> 1; 265ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (i == lo) { 266ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; // Found! 267ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else if (c < list[i]) { 268ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru hi = i; 269ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 270ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru lo = i; 271ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 272ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 273ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return hi; 274ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 275ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 276ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUBool 277ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruBMPSet::contains(UChar32 c) const { 278ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if((uint32_t)c<=0x7f) { 279ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return (UBool)asciiBytes[c]; 280ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else if((uint32_t)c<=0x7ff) { 281ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return (UBool)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0); 282ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else if((uint32_t)c<0xd800 || (c>=0xe000 && c<=0xffff)) { 283ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int lead=c>>12; 284ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001; 285ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(twoBits<=1) { 286ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // All 64 code points with the same bits 15..6 287ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // are either in the set or not. 288ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return (UBool)twoBits; 289ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 290ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Look up the code point in its 4k block of code points. 291ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]); 292ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 293ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else if((uint32_t)c<=0x10ffff) { 294ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // surrogate or supplementary code point 295ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return containsSlow(c, list4kStarts[0xd], list4kStarts[0x11]); 296ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 297ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Out-of-range code points get FALSE, consistent with long-standing 298ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // behavior of UnicodeSet::contains(c). 299ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return FALSE; 300ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 301ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 302ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 303ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 304ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Check for sufficient length for trail unit for each surrogate pair. 305ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Handle single surrogates as surrogate code points as usual in ICU. 306ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 307ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruconst UChar * 308ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruBMPSet::span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const { 309ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UChar c, c2; 310ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 311ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanCondition) { 312ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // span 313ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru do { 314ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru c=*s; 315ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(c<=0x7f) { 316ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(!asciiBytes[c]) { 317ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 318ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 319ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else if(c<=0x7ff) { 320ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) { 321ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 322ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 323ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else if(c<0xd800 || c>=0xe000) { 324ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int lead=c>>12; 325ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001; 326ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(twoBits<=1) { 327ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // All 64 code points with the same bits 15..6 328ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // are either in the set or not. 329ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(twoBits==0) { 330ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 331ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 332ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 333ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Look up the code point in its 4k block of code points. 334ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) { 335ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 336ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 337ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 338ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) { 339ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // surrogate code point 340ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) { 341ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 342ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 343ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 344ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // surrogate pair 345ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(!containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) { 346ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 347ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 348ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ++s; 349ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 350ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } while(++s<limit); 351ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 352ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // span not 353ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru do { 354ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru c=*s; 355ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(c<=0x7f) { 356ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(asciiBytes[c]) { 357ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 358ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 359ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else if(c<=0x7ff) { 360ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) { 361ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 362ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 363ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else if(c<0xd800 || c>=0xe000) { 364ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int lead=c>>12; 365ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001; 366ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(twoBits<=1) { 367ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // All 64 code points with the same bits 15..6 368ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // are either in the set or not. 369ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(twoBits!=0) { 370ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 371ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 372ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 373ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Look up the code point in its 4k block of code points. 374ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) { 375ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 376ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 377ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 378ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) { 379ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // surrogate code point 380ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) { 381ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 382ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 383ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 384ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // surrogate pair 385ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) { 386ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 387ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 388ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ++s; 389ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 390ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } while(++s<limit); 391ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 392ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return s; 393ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 394ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 395ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* Symmetrical with span(). */ 396ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruconst UChar * 397ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruBMPSet::spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const { 398ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UChar c, c2; 399ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 400ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanCondition) { 401ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // span 402ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(;;) { 403ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru c=*(--limit); 404ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(c<=0x7f) { 405ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(!asciiBytes[c]) { 406ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 407ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 408ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else if(c<=0x7ff) { 409ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) { 410ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 411ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 412ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else if(c<0xd800 || c>=0xe000) { 413ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int lead=c>>12; 414ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001; 415ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(twoBits<=1) { 416ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // All 64 code points with the same bits 15..6 417ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // are either in the set or not. 418ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(twoBits==0) { 419ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 420ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 421ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 422ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Look up the code point in its 4k block of code points. 423ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) { 424ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 425ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 426ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 427ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) { 428ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // surrogate code point 429ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) { 430ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 431ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 432ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 433ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // surrogate pair 434ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(!containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) { 435ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 436ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 437ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru --limit; 438ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 439ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(s==limit) { 440ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return s; 441ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 442ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 443ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 444ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // span not 445ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(;;) { 446ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru c=*(--limit); 447ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(c<=0x7f) { 448ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(asciiBytes[c]) { 449ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 450ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 451ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else if(c<=0x7ff) { 452ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) { 453ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 454ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 455ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else if(c<0xd800 || c>=0xe000) { 456ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int lead=c>>12; 457ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001; 458ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(twoBits<=1) { 459ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // All 64 code points with the same bits 15..6 460ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // are either in the set or not. 461ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(twoBits!=0) { 462ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 463ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 464ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 465ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Look up the code point in its 4k block of code points. 466ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) { 467ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 468ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 469ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 470ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) { 471ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // surrogate code point 472ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) { 473ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 474ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 475ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 476ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // surrogate pair 477ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) { 478ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 479ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 480ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru --limit; 481ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 482ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(s==limit) { 483ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return s; 484ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 485ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 486ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 487ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return limit+1; 488ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 489ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 490ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 491ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Precheck for sufficient trail bytes at end of string only once per span. 492ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Check validity. 493ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 494ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruconst uint8_t * 495ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruBMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const { 496ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const uint8_t *limit=s+length; 497ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint8_t b=*s; 498ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if((int8_t)b>=0) { 499ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Initial all-ASCII span. 500ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanCondition) { 501ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru do { 502ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(!asciiBytes[b] || ++s==limit) { 503ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return s; 504ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 505ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru b=*s; 506ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } while((int8_t)b>=0); 507ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 508ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru do { 509ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(asciiBytes[b] || ++s==limit) { 510ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return s; 511ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 512ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru b=*s; 513ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } while((int8_t)b>=0); 514ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 515ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru length=(int32_t)(limit-s); 516ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 517ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 518ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanCondition!=USET_SPAN_NOT_CONTAINED) { 519ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values. 520ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 521ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 522ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const uint8_t *limit0=limit; 523ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 524ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /* 525ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Make sure that the last 1/2/3/4-byte sequence before limit is complete 526ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * or runs into a lead byte. 527ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * In the span loop compare s with limit only once 528ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * per multi-byte character. 529ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 530ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Give a trailing illegal sequence the same value as the result of contains(FFFD), 531ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * including it if that is part of the span, otherwise set limit0 to before 532ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * the truncated sequence. 533ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 534ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru b=*(limit-1); 535ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if((int8_t)b<0) { 536ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // b>=0x80: lead or trail byte 537ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(b<0xc0) { 538ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // single trail byte, check for preceding 3- or 4-byte lead byte 539ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(length>=2 && (b=*(limit-2))>=0xe0) { 540ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru limit-=2; 541ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(asciiBytes[0x80]!=spanCondition) { 542ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru limit0=limit; 543ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 544ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else if(b<0xc0 && b>=0x80 && length>=3 && (b=*(limit-3))>=0xf0) { 545ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // 4-byte lead byte with only two trail bytes 546ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru limit-=3; 547ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(asciiBytes[0x80]!=spanCondition) { 548ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru limit0=limit; 549ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 550ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 551ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 552ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // lead byte with no trail bytes 553ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru --limit; 554ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(asciiBytes[0x80]!=spanCondition) { 555ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru limit0=limit; 556ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 557ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 558ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 559ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 560ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint8_t t1, t2, t3; 561ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 562ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru while(s<limit) { 563ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru b=*s; 564ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(b<0xc0) { 565ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // ASCII; or trail bytes with the result of contains(FFFD). 566ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanCondition) { 567ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru do { 568ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(!asciiBytes[b]) { 569ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return s; 570ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else if(++s==limit) { 571ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return limit0; 572ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 573ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru b=*s; 574ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } while(b<0xc0); 575ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 576ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru do { 577ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(asciiBytes[b]) { 578ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return s; 579ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else if(++s==limit) { 580ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return limit0; 581ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 582ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru b=*s; 583ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } while(b<0xc0); 584ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 585ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 586ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ++s; // Advance past the lead byte. 587ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(b>=0xe0) { 588ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(b<0xf0) { 589ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if( /* handle U+0000..U+FFFF inline */ 590ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru (t1=(uint8_t)(s[0]-0x80)) <= 0x3f && 591ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru (t2=(uint8_t)(s[1]-0x80)) <= 0x3f 592ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ) { 593ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru b&=0xf; 594ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint32_t twoBits=(bmpBlockBits[t1]>>b)&0x10001; 595ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(twoBits<=1) { 596ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // All 64 code points with this lead byte and middle trail byte 597ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // are either in the set or not. 598ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(twoBits!=(uint32_t)spanCondition) { 599ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return s-1; 600ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 601ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 602ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Look up the code point in its 4k block of code points. 603ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UChar32 c=(b<<12)|(t1<<6)|t2; 604ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(containsSlow(c, list4kStarts[b], list4kStarts[b+1]) != spanCondition) { 605ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return s-1; 606ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 607ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 608ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru s+=2; 609ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru continue; 610ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 611ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else if( /* handle U+10000..U+10FFFF inline */ 612ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru (t1=(uint8_t)(s[0]-0x80)) <= 0x3f && 613ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru (t2=(uint8_t)(s[1]-0x80)) <= 0x3f && 614ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru (t3=(uint8_t)(s[2]-0x80)) <= 0x3f 615ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ) { 616ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Give an illegal sequence the same value as the result of contains(FFFD). 617ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UChar32 c=((UChar32)(b-0xf0)<<18)|((UChar32)t1<<12)|(t2<<6)|t3; 618ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if( ( (0x10000<=c && c<=0x10ffff) ? 619ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) : 620ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru asciiBytes[0x80] 621ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ) != spanCondition 622ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ) { 623ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return s-1; 624ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 625ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru s+=3; 626ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru continue; 627ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 628ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else /* 0xc0<=b<0xe0 */ { 629ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if( /* handle U+0000..U+07FF inline */ 630ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru (t1=(uint8_t)(*s-0x80)) <= 0x3f 631ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ) { 63285bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho if((USetSpanCondition)((table7FF[t1]&((uint32_t)1<<(b&0x1f)))!=0) != spanCondition) { 633ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return s-1; 634ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 635ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ++s; 636ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru continue; 637ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 638ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 639ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 640ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Give an illegal sequence the same value as the result of contains(FFFD). 641ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Handle each byte of an illegal sequence separately to simplify the code; 642ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // no need to optimize error handling. 643ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(asciiBytes[0x80]!=spanCondition) { 644ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return s-1; 645ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 646ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 647ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 648ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return limit0; 649ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 650ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 651ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 652ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * While going backwards through UTF-8 optimize only for ASCII. 653ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Unlike UTF-16, UTF-8 is not forward-backward symmetrical, that is, it is not 654ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * possible to tell from the last byte in a multi-byte sequence how many 655ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * preceding bytes there should be. Therefore, going backwards through UTF-8 656ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * is much harder than going forward. 657ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 658ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t 659ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruBMPSet::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const { 660ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanCondition!=USET_SPAN_NOT_CONTAINED) { 661ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values. 662ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 663ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 664ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint8_t b; 665ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 666ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru do { 667ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru b=s[--length]; 668ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if((int8_t)b>=0) { 669ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // ASCII sub-span 670ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(spanCondition) { 671ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru do { 672ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(!asciiBytes[b]) { 673ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return length+1; 674ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else if(length==0) { 675ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 0; 676ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 677ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru b=s[--length]; 678ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } while((int8_t)b>=0); 679ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 680ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru do { 681ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(asciiBytes[b]) { 682ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return length+1; 683ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else if(length==0) { 684ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 0; 685ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 686ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru b=s[--length]; 687ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } while((int8_t)b>=0); 688ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 689ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 690ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 691ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t prev=length; 692ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UChar32 c; 6938393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius // trail byte: collect a multi-byte character 6948393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius // (or lead byte in last-trail position) 6958393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius c=utf8_prevCharSafeBody(s, 0, &length, b, -3); 696ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // c is a valid code point, not ASCII, not a surrogate 697ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(c<=0x7ff) { 69885bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho if((USetSpanCondition)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) != spanCondition) { 699ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return prev+1; 700ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 701ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else if(c<=0xffff) { 702ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int lead=c>>12; 703ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001; 704ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(twoBits<=1) { 705ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // All 64 code points with the same bits 15..6 706ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // are either in the set or not. 707ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(twoBits!=(uint32_t)spanCondition) { 708ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return prev+1; 709ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 710ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 711ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Look up the code point in its 4k block of code points. 712ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]) != spanCondition) { 713ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return prev+1; 714ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 715ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 716ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 717ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) != spanCondition) { 718ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return prev+1; 719ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 720ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 721ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } while(length>0); 722ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 0; 723ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 724ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 725ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_NAMESPACE_END 726