1ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 2ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru****************************************************************************** 3ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* 4ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* Copyright (C) 2007, International Business Machines 5ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* Corporation and others. All Rights Reserved. 6ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* 7ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru****************************************************************************** 8ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* file name: bmpset.h 9ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* encoding: US-ASCII 10ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* tab size: 8 (not used) 11ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* indentation:4 12ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* 13ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* created on: 2007jan29 14ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* created by: Markus W. Scherer 15ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*/ 16ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 17ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#ifndef __BMPSET_H__ 18ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define __BMPSET_H__ 19ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 20ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/utypes.h" 21ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/uniset.h" 22ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 23ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_NAMESPACE_BEGIN 24ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 25ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 26ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Helper class for frozen UnicodeSets, implements contains() and span() 27ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * optimized for BMP code points. Structured to be UTF-8-friendly. 28ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 29ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * ASCII: Look up bytes. 30ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 2-byte characters: Bits organized vertically. 31ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 3-byte characters: Use zero/one/mixed data per 64-block in U+0000..U+FFFF, 32ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * with mixed for illegal ranges. 33ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Supplementary characters: Call contains() on the parent set. 34ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 35ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruclass BMPSet : public UMemory { 36ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querupublic: 37ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru BMPSet(const int32_t *parentList, int32_t parentListLength); 38ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength); 39ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru virtual ~BMPSet(); 40ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 41ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru virtual UBool contains(UChar32 c) const; 42ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 43ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /* 44ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Span the initial substring for which each character c has spanCondition==contains(c). 45ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * It must be s<limit and spanCondition==0 or 1. 46ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * @return The string pointer which limits the span. 47ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 48ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const UChar *span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const; 49ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 50ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /* 51ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Span the trailing substring for which each character c has spanCondition==contains(c). 52ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * It must be s<limit and spanCondition==0 or 1. 53ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * @return The string pointer which starts the span. 54ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 55ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const UChar *spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const; 56ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 57ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /* 58ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Span the initial substring for which each character c has spanCondition==contains(c). 59ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * It must be length>0 and spanCondition==0 or 1. 60ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * @return The string pointer which limits the span. 61ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 62ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const uint8_t *spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const; 63ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 64ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /* 65ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Span the trailing substring for which each character c has spanCondition==contains(c). 66ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * It must be length>0 and spanCondition==0 or 1. 67ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * @return The start of the span. 68ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 69ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const; 70ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 71ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruprivate: 72ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru void initBits(); 73ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru void overrideIllegal(); 74ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 75ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /** 76ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Same as UnicodeSet::findCodePoint(UChar32 c) const except that the 77ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * binary search is restricted for finding code points in a certain range. 78ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 79ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * For restricting the search for finding in the range start..end, 80ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * pass in 81ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * lo=findCodePoint(start) and 82ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * hi=findCodePoint(end) 83ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * with 0<=lo<=hi<len. 84ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * findCodePoint(c) defaults to lo=0 and hi=len-1. 85ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 86ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * @param c a character in a subrange of MIN_VALUE..MAX_VALUE 87ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * @param lo The lowest index to be returned. 88ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * @param hi The highest index to be returned. 89ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * @return the smallest integer i in the range lo..hi, 90ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * inclusive, such that c < list[i] 91ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 92ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t findCodePoint(UChar32 c, int32_t lo, int32_t hi) const; 93ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 94ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru inline UBool containsSlow(UChar32 c, int32_t lo, int32_t hi) const; 95ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 96ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /* 97ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * One byte per ASCII character, or trail byte in lead position. 98ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 0 or 1 for ASCII characters. 99ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * The value for trail bytes is the result of contains(FFFD) 100ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * for faster validity checking at runtime. 101ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 102ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UBool asciiBytes[0xc0]; 103ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 104ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /* 105ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * One bit per code point from U+0000..U+07FF. 106ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * The bits are organized vertically; consecutive code points 107ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * correspond to the same bit positions in consecutive table words. 108ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * With code point parts 109ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * lead=c{10..6} 110ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * trail=c{5..0} 111ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * it is set.contains(c)==(table7FF[trail] bit lead) 112ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 113ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Bits for 0..7F (non-shortest forms) are set to the result of contains(FFFD) 114ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * for faster validity checking at runtime. 115ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 116ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint32_t table7FF[64]; 117ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 118ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /* 119ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * One bit per 64 BMP code points. 120ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * The bits are organized vertically; consecutive 64-code point blocks 121ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * correspond to the same bit position in consecutive table words. 122ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * With code point parts 123ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * lead=c{15..12} 124ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * t1=c{11..6} 125ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * test bits (lead+16) and lead in bmpBlockBits[t1]. 126ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * If the upper bit is 0, then the lower bit indicates if contains(c) 127ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * for all code points in the 64-block. 128ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * If the upper bit is 1, then the block is mixed and set.contains(c) 129ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * must be called. 130ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 131ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Bits for 0..7FF (non-shortest forms) and D800..DFFF are set to 132ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * the result of contains(FFFD) for faster validity checking at runtime. 133ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 134ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint32_t bmpBlockBits[64]; 135ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 136ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /* 137ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Inversion list indexes for restricted binary searches in 138ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * findCodePoint(), from 139ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * findCodePoint(U+0800, U+1000, U+2000, .., U+F000, U+10000). 140ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * U+0800 is the first 3-byte-UTF-8 code point. Code points below U+0800 are 141ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * always looked up in the bit tables. 142ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * The last pair of indexes is for finding supplementary code points. 143ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 144ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t list4kStarts[18]; 145ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 146ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /* 147ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * The inversion list of the parent set, for the slower contains() implementation 148ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * for mixed BMP blocks and for supplementary code points. 149ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * The list is terminated with list[listLength-1]=0x110000. 150ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 151ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const int32_t *list; 152ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t listLength; 153ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}; 154ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 155ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruinline UBool BMPSet::containsSlow(UChar32 c, int32_t lo, int32_t hi) const { 156ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return (UBool)(findCodePoint(c, lo, hi) & 1); 157ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 158ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 159ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_NAMESPACE_END 160ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 161ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#endif 162