1ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*
2ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru**********************************************************************
3ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   Copyright (C) 2007, International Business Machines
4ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   Corporation and others.  All Rights Reserved.
5ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru**********************************************************************
6ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   file name:  bitset.cpp
7ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   encoding:   US-ASCII
8ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   tab size:   8 (not used)
9ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   indentation:4
10ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*
11ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   created on: 2007jan15
12ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   created by: Markus Scherer
13ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*
14ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   Idea for a "compiled", fast, read-only (immutable) version of a UnicodeSet
15ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   using a folded bit set consisting of a 1k-entry index table and a
16ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   compacted array of 64-bit words.
17ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   Uses a simple hash table for compaction.
18ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   Uses the original set for supplementary code points.
19ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*/
20ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
21ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/utypes.h"
22ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicont.h"
23ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
24ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*
25ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Hash table for up to 1k 64-bit words, for 1 bit per BMP code point.
26ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Hashes 64-bit words and maps them to 16-bit integers which are
27ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * assigned in order of new incoming words for subsequent storage
28ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * in a contiguous array.
29ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */
30ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustruct BMPBitHash : public UObject {
31ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int64_t keys[0x800];  // 2k
32ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    uint16_t values[0x800];
33ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    uint16_t reverse[0x400];
34ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    uint16_t count;
35ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    const int32_t prime=1301;  // Less than 2k.
36ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
37ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    BMPBitHash() : count(0) {
38ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Fill values[] with 0xffff.
39ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        uprv_memset(values, 0xff, sizeof(values));
40ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
41ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
42ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    /*
43ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * Map a key to an integer count.
44ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * Map at most 1k=0x400 different keys with this data structure.
45ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     */
46ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    uint16_t map(int64_t key) {
47ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        int32_t hash=(int32_t)(key>>55)&0x1ff;
48ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        hash^=(int32_t)(key>>44)&0x7ff;
49ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        hash^=(int32_t)(key>>33)&0x7ff;
50ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        hash^=(int32_t)(key>>22)&0x7ff;
51ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        hash^=(int32_t)(key>>11)&0x7ff;
52ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        hash^=(int32_t)key&0x7ff;
53ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        for(;;) {
54ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(values[hash]==0xffff) {
55ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Unused slot.
56ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                keys[hash]=key;
57ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                reverse[count]=hash;
58ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                return values[hash]=count++;
59ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            } else if(keys[hash]==key) {
60ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Found a slot with this key.
61ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                return values[hash];
62ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            } else {
63ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Used slot with a different key, move to another slot.
64ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                hash=(hash+prime)&0x7ff;
65ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
66ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
67ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
68ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
69ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    uint16_t countKeys() const { return count; }
70ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
71ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    /*
72ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * Invert the hash map: Fill an array of length countKeys() with the keys
73ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * indexed by their mapped values.
74ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     */
75ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    void invert(int64_t *k) const {
76ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        uint16_t i;
77ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
78ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        for(i=0; i<count; ++i) {
79ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            k[i]=keys[reverse[i]];
80ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
81ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
82ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru};
83ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
84ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruclass BitSet : public UObject, public UnicodeContainable {
85ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querupublic:
86ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    BitSet(const UnicodeSet &set, UErrorCode &errorCode) : bits(shortBits), restSet(set.clone()) {
87ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(U_FAILURE(errorCode)) {
88ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            return;
89ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
90ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        BMPBitHash *bitHash=new BMPBitHash;
91ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(bitHash==NULL || restSet==NULL) {
92ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            errorCode=U_MEMORY_ALLOCATION_ERROR;
93ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            return;
94ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
95ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
96ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        UnicodeSetIterator iter(set);
97ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        int64_t b;
98ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        UChar32 start, end;
99ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        int32_t prevIndex, i, j;
100ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
101ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        b=0;  // Not necessary but makes compilers happy.
102ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        prevIndex=-1;
103ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        for(;;) {
104ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(iter.nextRange() && !iter.isString()) {
105ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                start=iter.getCodepoint();
106ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                end=iter.getCodepointEnd();
107ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            } else {
108ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                start=0x10000;
109ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
110ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            i=start>>6;
111ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(prevIndex!=i) {
112ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Finish the end of the previous range.
113ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(prevIndex<0) {
114ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    prevIndex=0;
115ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                } else {
116ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    index[prevIndex++]=bitHash->map(b);
117ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
118ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Fill all-zero entries between ranges.
119ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(prevIndex<i) {
120ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    uint16_t zero=bitHash->map(0);
121ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    do {
122ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        index[prevIndex++]=zero;
123ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    } while(prevIndex<i);
124ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
125ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                b=0;
126ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
127ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(start>0xffff) {
128ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                break;
129ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
130ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            b|=~((INT64_C(1)<<(start&0x3f))-1);
131ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            j=end>>6;
132ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(i<j) {
133ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Set bits for the start of the range.
134ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                index[i++]=bitHash->map(b);
135ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Fill all-one entries inside the range.
136ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(i<j) {
137ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    uint16_t all=bitHash->map(INT64_C(0xffffffffffffffff));
138ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    do {
139ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        index[i++]=all;
140ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    } while(i<j);
141ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
142ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                b=INT64_C(0xffffffffffffffff);
143ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
144ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            /* i==j */
145ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            b&=(INT64_C(1)<<(end&0x3f))-1;
146ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            prevIndex=j;
147ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
148ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
149ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(bitHash->countKeys()>LENGTHOF(shortBits)) {
150ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            bits=(int64_t *)uprv_malloc(bitHash->countKeys()*8);
151ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
152ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(bits!=NULL) {
153ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            bitHash->invert(bits);
154ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        } else {
155ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            bits=shortBits;
156ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            errorCode=U_MEMORY_ALLOCATION_ERROR;
157ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            return;
158ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
159ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
160ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        latin1Set[0]=(uint32_t)bits[0];
161ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        latin1Set[1]=(uint32_t)(bits[0]>>32);
162ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        latin1Set[2]=(uint32_t)bits[1];
163ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        latin1Set[3]=(uint32_t)(bits[1]>>32);
164ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        latin1Set[4]=(uint32_t)bits[2];
165ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        latin1Set[5]=(uint32_t)(bits[2]>>32);
166ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        latin1Set[6]=(uint32_t)bits[3];
167ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        latin1Set[7]=(uint32_t)(bits[3]>>32);
168ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
169ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        restSet.remove(0, 0xffff);
170ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
171ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
172ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    ~BitSet() {
173ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(bits!=shortBits) {
174ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            uprv_free(bits);
175ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
176ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        delete restSet;
177ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
178ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
179ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UBool contains(UChar32 c) const {
180ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if((uint32_t)c<=0xff) {
181ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            return (UBool)((latin1Set[c>>5]&((uint32_t)1<<(c&0x1f)))!=0);
182ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        } else if((uint32_t)c<0xffff) {
183ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            return (UBool)((bits[c>>6]&(INT64_C(1)<<(c&0x3f)))!=0);
184ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        } else {
185ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            return restSet->contains(c);
186ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
187ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
188ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
189ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruprivate:
190ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    uint16_t index[0x400];
191ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int64_t shortBits[32];
192ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int64_t *bits;
193ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
194ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    uint32_t latin1Bits[8];
195ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
196ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UnicodeSet *restSet;
197ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru};
198