1ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 2ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru********************************************************************** 3f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius* Copyright (C) 2014, International Business Machines 4ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* Corporation and others. All Rights Reserved. 5ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru********************************************************************** 6ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* file name: bitset.cpp 7ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* encoding: US-ASCII 8ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* tab size: 8 (not used) 9ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* indentation:4 10ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* 11ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* created on: 2007jan15 12ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* created by: Markus Scherer 13ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* 14ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* Idea for a "compiled", fast, read-only (immutable) version of a UnicodeSet 15ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* using a folded bit set consisting of a 1k-entry index table and a 16ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* compacted array of 64-bit words. 17ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* Uses a simple hash table for compaction. 18ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* Uses the original set for supplementary code points. 19ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*/ 20ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 21ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/utypes.h" 22ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicont.h" 23f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius#include "cmemory.h" // for UPRV_LENGTHOF 24ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 25ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 26ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Hash table for up to 1k 64-bit words, for 1 bit per BMP code point. 27ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Hashes 64-bit words and maps them to 16-bit integers which are 28ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * assigned in order of new incoming words for subsequent storage 29ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * in a contiguous array. 30ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 31ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustruct BMPBitHash : public UObject { 32ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int64_t keys[0x800]; // 2k 33ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint16_t values[0x800]; 34ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint16_t reverse[0x400]; 35ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint16_t count; 36ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const int32_t prime=1301; // Less than 2k. 37ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 38ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru BMPBitHash() : count(0) { 39ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Fill values[] with 0xffff. 40ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uprv_memset(values, 0xff, sizeof(values)); 41ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 42ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 43ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /* 44ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Map a key to an integer count. 45ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Map at most 1k=0x400 different keys with this data structure. 46ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 47ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint16_t map(int64_t key) { 48ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t hash=(int32_t)(key>>55)&0x1ff; 49ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru hash^=(int32_t)(key>>44)&0x7ff; 50ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru hash^=(int32_t)(key>>33)&0x7ff; 51ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru hash^=(int32_t)(key>>22)&0x7ff; 52ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru hash^=(int32_t)(key>>11)&0x7ff; 53ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru hash^=(int32_t)key&0x7ff; 54ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(;;) { 55ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(values[hash]==0xffff) { 56ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Unused slot. 57ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru keys[hash]=key; 58ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru reverse[count]=hash; 59ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return values[hash]=count++; 60ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else if(keys[hash]==key) { 61ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Found a slot with this key. 62ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return values[hash]; 63ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 64ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Used slot with a different key, move to another slot. 65ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru hash=(hash+prime)&0x7ff; 66ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 67ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 68ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 69ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 70ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint16_t countKeys() const { return count; } 71ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 72ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /* 73ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Invert the hash map: Fill an array of length countKeys() with the keys 74ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * indexed by their mapped values. 75ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 76ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru void invert(int64_t *k) const { 77ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint16_t i; 78ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 79ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(i=0; i<count; ++i) { 80ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru k[i]=keys[reverse[i]]; 81ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 82ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 83ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}; 84ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 85ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruclass BitSet : public UObject, public UnicodeContainable { 86ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querupublic: 87ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru BitSet(const UnicodeSet &set, UErrorCode &errorCode) : bits(shortBits), restSet(set.clone()) { 88ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(U_FAILURE(errorCode)) { 89ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return; 90ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 91ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru BMPBitHash *bitHash=new BMPBitHash; 92ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(bitHash==NULL || restSet==NULL) { 93ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru errorCode=U_MEMORY_ALLOCATION_ERROR; 94ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return; 95ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 96ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 97ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UnicodeSetIterator iter(set); 98ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int64_t b; 99ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UChar32 start, end; 100ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t prevIndex, i, j; 101ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 102ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru b=0; // Not necessary but makes compilers happy. 103ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru prevIndex=-1; 104ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(;;) { 105ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(iter.nextRange() && !iter.isString()) { 106ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru start=iter.getCodepoint(); 107ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru end=iter.getCodepointEnd(); 108ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 109ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru start=0x10000; 110ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 111ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru i=start>>6; 112ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(prevIndex!=i) { 113ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Finish the end of the previous range. 114ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(prevIndex<0) { 115ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru prevIndex=0; 116ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 117ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru index[prevIndex++]=bitHash->map(b); 118ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 119ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Fill all-zero entries between ranges. 120ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(prevIndex<i) { 121ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint16_t zero=bitHash->map(0); 122ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru do { 123ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru index[prevIndex++]=zero; 124ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } while(prevIndex<i); 125ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 126ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru b=0; 127ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 128ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(start>0xffff) { 129ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 130ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 131ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru b|=~((INT64_C(1)<<(start&0x3f))-1); 132ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru j=end>>6; 133ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(i<j) { 134ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Set bits for the start of the range. 135ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru index[i++]=bitHash->map(b); 136ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Fill all-one entries inside the range. 137ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(i<j) { 138ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint16_t all=bitHash->map(INT64_C(0xffffffffffffffff)); 139ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru do { 140ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru index[i++]=all; 141ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } while(i<j); 142ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 143ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru b=INT64_C(0xffffffffffffffff); 144ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 145ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /* i==j */ 146ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru b&=(INT64_C(1)<<(end&0x3f))-1; 147ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru prevIndex=j; 148ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 149ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 150f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius if(bitHash->countKeys()>UPRV_LENGTHOF(shortBits)) { 151ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru bits=(int64_t *)uprv_malloc(bitHash->countKeys()*8); 152ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 153ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(bits!=NULL) { 154ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru bitHash->invert(bits); 155ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 156ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru bits=shortBits; 157ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru errorCode=U_MEMORY_ALLOCATION_ERROR; 158ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return; 159ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 160ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 161ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru latin1Set[0]=(uint32_t)bits[0]; 162ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru latin1Set[1]=(uint32_t)(bits[0]>>32); 163ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru latin1Set[2]=(uint32_t)bits[1]; 164ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru latin1Set[3]=(uint32_t)(bits[1]>>32); 165ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru latin1Set[4]=(uint32_t)bits[2]; 166ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru latin1Set[5]=(uint32_t)(bits[2]>>32); 167ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru latin1Set[6]=(uint32_t)bits[3]; 168ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru latin1Set[7]=(uint32_t)(bits[3]>>32); 169ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 170ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru restSet.remove(0, 0xffff); 171ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 172ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 173ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ~BitSet() { 174ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(bits!=shortBits) { 175ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uprv_free(bits); 176ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 177ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru delete restSet; 178ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 179ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 180ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UBool contains(UChar32 c) const { 181ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if((uint32_t)c<=0xff) { 182ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return (UBool)((latin1Set[c>>5]&((uint32_t)1<<(c&0x1f)))!=0); 183ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else if((uint32_t)c<0xffff) { 184ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return (UBool)((bits[c>>6]&(INT64_C(1)<<(c&0x3f)))!=0); 185ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 186ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return restSet->contains(c); 187ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 188ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 189ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 190ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruprivate: 191ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint16_t index[0x400]; 192ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int64_t shortBits[32]; 193ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int64_t *bits; 194ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 195ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint32_t latin1Bits[8]; 196ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 197ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UnicodeSet *restSet; 198ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}; 199