1ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 2ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru********************************************************************** 3ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* Copyright (C) 2007, International Business Machines 4ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* Corporation and others. All Rights Reserved. 5ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru********************************************************************** 6ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* file name: bitset.cpp 7ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* encoding: US-ASCII 8ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* tab size: 8 (not used) 9ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* indentation:4 10ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* 11ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* created on: 2007jan15 12ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* created by: Markus Scherer 13ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* 14ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* Idea for a "compiled", fast, read-only (immutable) version of a UnicodeSet 15ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* using a folded bit set consisting of a 1k-entry index table and a 16ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* compacted array of 64-bit words. 17ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* Uses a simple hash table for compaction. 18ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* Uses the original set for supplementary code points. 19ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*/ 20ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 21ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/utypes.h" 22ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicont.h" 23ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 24ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 25ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Hash table for up to 1k 64-bit words, for 1 bit per BMP code point. 26ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Hashes 64-bit words and maps them to 16-bit integers which are 27ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * assigned in order of new incoming words for subsequent storage 28ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * in a contiguous array. 29ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 30ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustruct BMPBitHash : public UObject { 31ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int64_t keys[0x800]; // 2k 32ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint16_t values[0x800]; 33ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint16_t reverse[0x400]; 34ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint16_t count; 35ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const int32_t prime=1301; // Less than 2k. 36ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 37ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru BMPBitHash() : count(0) { 38ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Fill values[] with 0xffff. 39ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uprv_memset(values, 0xff, sizeof(values)); 40ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 41ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 42ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /* 43ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Map a key to an integer count. 44ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Map at most 1k=0x400 different keys with this data structure. 45ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 46ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint16_t map(int64_t key) { 47ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t hash=(int32_t)(key>>55)&0x1ff; 48ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru hash^=(int32_t)(key>>44)&0x7ff; 49ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru hash^=(int32_t)(key>>33)&0x7ff; 50ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru hash^=(int32_t)(key>>22)&0x7ff; 51ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru hash^=(int32_t)(key>>11)&0x7ff; 52ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru hash^=(int32_t)key&0x7ff; 53ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(;;) { 54ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(values[hash]==0xffff) { 55ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Unused slot. 56ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru keys[hash]=key; 57ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru reverse[count]=hash; 58ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return values[hash]=count++; 59ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else if(keys[hash]==key) { 60ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Found a slot with this key. 61ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return values[hash]; 62ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 63ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Used slot with a different key, move to another slot. 64ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru hash=(hash+prime)&0x7ff; 65ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 66ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 67ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 68ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 69ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint16_t countKeys() const { return count; } 70ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 71ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /* 72ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Invert the hash map: Fill an array of length countKeys() with the keys 73ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * indexed by their mapped values. 74ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 75ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru void invert(int64_t *k) const { 76ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint16_t i; 77ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 78ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(i=0; i<count; ++i) { 79ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru k[i]=keys[reverse[i]]; 80ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 81ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 82ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}; 83ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 84ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruclass BitSet : public UObject, public UnicodeContainable { 85ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querupublic: 86ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru BitSet(const UnicodeSet &set, UErrorCode &errorCode) : bits(shortBits), restSet(set.clone()) { 87ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(U_FAILURE(errorCode)) { 88ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return; 89ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 90ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru BMPBitHash *bitHash=new BMPBitHash; 91ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(bitHash==NULL || restSet==NULL) { 92ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru errorCode=U_MEMORY_ALLOCATION_ERROR; 93ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return; 94ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 95ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 96ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UnicodeSetIterator iter(set); 97ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int64_t b; 98ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UChar32 start, end; 99ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t prevIndex, i, j; 100ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 101ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru b=0; // Not necessary but makes compilers happy. 102ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru prevIndex=-1; 103ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(;;) { 104ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(iter.nextRange() && !iter.isString()) { 105ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru start=iter.getCodepoint(); 106ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru end=iter.getCodepointEnd(); 107ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 108ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru start=0x10000; 109ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 110ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru i=start>>6; 111ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(prevIndex!=i) { 112ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Finish the end of the previous range. 113ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(prevIndex<0) { 114ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru prevIndex=0; 115ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 116ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru index[prevIndex++]=bitHash->map(b); 117ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 118ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Fill all-zero entries between ranges. 119ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(prevIndex<i) { 120ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint16_t zero=bitHash->map(0); 121ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru do { 122ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru index[prevIndex++]=zero; 123ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } while(prevIndex<i); 124ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 125ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru b=0; 126ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 127ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(start>0xffff) { 128ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 129ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 130ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru b|=~((INT64_C(1)<<(start&0x3f))-1); 131ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru j=end>>6; 132ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(i<j) { 133ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Set bits for the start of the range. 134ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru index[i++]=bitHash->map(b); 135ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Fill all-one entries inside the range. 136ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(i<j) { 137ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint16_t all=bitHash->map(INT64_C(0xffffffffffffffff)); 138ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru do { 139ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru index[i++]=all; 140ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } while(i<j); 141ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 142ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru b=INT64_C(0xffffffffffffffff); 143ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 144ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /* i==j */ 145ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru b&=(INT64_C(1)<<(end&0x3f))-1; 146ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru prevIndex=j; 147ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 148ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 149ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(bitHash->countKeys()>LENGTHOF(shortBits)) { 150ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru bits=(int64_t *)uprv_malloc(bitHash->countKeys()*8); 151ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 152ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(bits!=NULL) { 153ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru bitHash->invert(bits); 154ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 155ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru bits=shortBits; 156ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru errorCode=U_MEMORY_ALLOCATION_ERROR; 157ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return; 158ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 159ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 160ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru latin1Set[0]=(uint32_t)bits[0]; 161ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru latin1Set[1]=(uint32_t)(bits[0]>>32); 162ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru latin1Set[2]=(uint32_t)bits[1]; 163ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru latin1Set[3]=(uint32_t)(bits[1]>>32); 164ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru latin1Set[4]=(uint32_t)bits[2]; 165ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru latin1Set[5]=(uint32_t)(bits[2]>>32); 166ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru latin1Set[6]=(uint32_t)bits[3]; 167ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru latin1Set[7]=(uint32_t)(bits[3]>>32); 168ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 169ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru restSet.remove(0, 0xffff); 170ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 171ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 172ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ~BitSet() { 173ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(bits!=shortBits) { 174ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uprv_free(bits); 175ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 176ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru delete restSet; 177ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 178ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 179ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UBool contains(UChar32 c) const { 180ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if((uint32_t)c<=0xff) { 181ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return (UBool)((latin1Set[c>>5]&((uint32_t)1<<(c&0x1f)))!=0); 182ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else if((uint32_t)c<0xffff) { 183ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return (UBool)((bits[c>>6]&(INT64_C(1)<<(c&0x3f)))!=0); 184ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 185ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return restSet->contains(c); 186ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 187ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 188ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 189ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruprivate: 190ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint16_t index[0x400]; 191ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int64_t shortBits[32]; 192ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int64_t *bits; 193ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 194ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint32_t latin1Bits[8]; 195ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 196ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UnicodeSet *restSet; 197ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}; 198