1/*
2**********************************************************************
3*   Copyright (C) 2014, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5**********************************************************************
6*   file name:  bitset.cpp
7*   encoding:   US-ASCII
8*   tab size:   8 (not used)
9*   indentation:4
10*
11*   created on: 2007jan15
12*   created by: Markus Scherer
13*
14*   Idea for a "compiled", fast, read-only (immutable) version of a UnicodeSet
15*   using a folded bit set consisting of a 1k-entry index table and a
16*   compacted array of 64-bit words.
17*   Uses a simple hash table for compaction.
18*   Uses the original set for supplementary code points.
19*/
20
21#include "unicode/utypes.h"
22#include "unicont.h"
23#include "cmemory.h" // for UPRV_LENGTHOF
24
25/*
26 * Hash table for up to 1k 64-bit words, for 1 bit per BMP code point.
27 * Hashes 64-bit words and maps them to 16-bit integers which are
28 * assigned in order of new incoming words for subsequent storage
29 * in a contiguous array.
30 */
31struct BMPBitHash : public UObject {
32    int64_t keys[0x800];  // 2k
33    uint16_t values[0x800];
34    uint16_t reverse[0x400];
35    uint16_t count;
36    const int32_t prime=1301;  // Less than 2k.
37
38    BMPBitHash() : count(0) {
39        // Fill values[] with 0xffff.
40        uprv_memset(values, 0xff, sizeof(values));
41    }
42
43    /*
44     * Map a key to an integer count.
45     * Map at most 1k=0x400 different keys with this data structure.
46     */
47    uint16_t map(int64_t key) {
48        int32_t hash=(int32_t)(key>>55)&0x1ff;
49        hash^=(int32_t)(key>>44)&0x7ff;
50        hash^=(int32_t)(key>>33)&0x7ff;
51        hash^=(int32_t)(key>>22)&0x7ff;
52        hash^=(int32_t)(key>>11)&0x7ff;
53        hash^=(int32_t)key&0x7ff;
54        for(;;) {
55            if(values[hash]==0xffff) {
56                // Unused slot.
57                keys[hash]=key;
58                reverse[count]=hash;
59                return values[hash]=count++;
60            } else if(keys[hash]==key) {
61                // Found a slot with this key.
62                return values[hash];
63            } else {
64                // Used slot with a different key, move to another slot.
65                hash=(hash+prime)&0x7ff;
66            }
67        }
68    }
69
70    uint16_t countKeys() const { return count; }
71
72    /*
73     * Invert the hash map: Fill an array of length countKeys() with the keys
74     * indexed by their mapped values.
75     */
76    void invert(int64_t *k) const {
77        uint16_t i;
78
79        for(i=0; i<count; ++i) {
80            k[i]=keys[reverse[i]];
81        }
82    }
83};
84
85class BitSet : public UObject, public UnicodeContainable {
86public:
87    BitSet(const UnicodeSet &set, UErrorCode &errorCode) : bits(shortBits), restSet(set.clone()) {
88        if(U_FAILURE(errorCode)) {
89            return;
90        }
91        BMPBitHash *bitHash=new BMPBitHash;
92        if(bitHash==NULL || restSet==NULL) {
93            errorCode=U_MEMORY_ALLOCATION_ERROR;
94            return;
95        }
96
97        UnicodeSetIterator iter(set);
98        int64_t b;
99        UChar32 start, end;
100        int32_t prevIndex, i, j;
101
102        b=0;  // Not necessary but makes compilers happy.
103        prevIndex=-1;
104        for(;;) {
105            if(iter.nextRange() && !iter.isString()) {
106                start=iter.getCodepoint();
107                end=iter.getCodepointEnd();
108            } else {
109                start=0x10000;
110            }
111            i=start>>6;
112            if(prevIndex!=i) {
113                // Finish the end of the previous range.
114                if(prevIndex<0) {
115                    prevIndex=0;
116                } else {
117                    index[prevIndex++]=bitHash->map(b);
118                }
119                // Fill all-zero entries between ranges.
120                if(prevIndex<i) {
121                    uint16_t zero=bitHash->map(0);
122                    do {
123                        index[prevIndex++]=zero;
124                    } while(prevIndex<i);
125                }
126                b=0;
127            }
128            if(start>0xffff) {
129                break;
130            }
131            b|=~((INT64_C(1)<<(start&0x3f))-1);
132            j=end>>6;
133            if(i<j) {
134                // Set bits for the start of the range.
135                index[i++]=bitHash->map(b);
136                // Fill all-one entries inside the range.
137                if(i<j) {
138                    uint16_t all=bitHash->map(INT64_C(0xffffffffffffffff));
139                    do {
140                        index[i++]=all;
141                    } while(i<j);
142                }
143                b=INT64_C(0xffffffffffffffff);
144            }
145            /* i==j */
146            b&=(INT64_C(1)<<(end&0x3f))-1;
147            prevIndex=j;
148        }
149
150        if(bitHash->countKeys()>UPRV_LENGTHOF(shortBits)) {
151            bits=(int64_t *)uprv_malloc(bitHash->countKeys()*8);
152        }
153        if(bits!=NULL) {
154            bitHash->invert(bits);
155        } else {
156            bits=shortBits;
157            errorCode=U_MEMORY_ALLOCATION_ERROR;
158            return;
159        }
160
161        latin1Set[0]=(uint32_t)bits[0];
162        latin1Set[1]=(uint32_t)(bits[0]>>32);
163        latin1Set[2]=(uint32_t)bits[1];
164        latin1Set[3]=(uint32_t)(bits[1]>>32);
165        latin1Set[4]=(uint32_t)bits[2];
166        latin1Set[5]=(uint32_t)(bits[2]>>32);
167        latin1Set[6]=(uint32_t)bits[3];
168        latin1Set[7]=(uint32_t)(bits[3]>>32);
169
170        restSet.remove(0, 0xffff);
171    }
172
173    ~BitSet() {
174        if(bits!=shortBits) {
175            uprv_free(bits);
176        }
177        delete restSet;
178    }
179
180    UBool contains(UChar32 c) const {
181        if((uint32_t)c<=0xff) {
182            return (UBool)((latin1Set[c>>5]&((uint32_t)1<<(c&0x1f)))!=0);
183        } else if((uint32_t)c<0xffff) {
184            return (UBool)((bits[c>>6]&(INT64_C(1)<<(c&0x3f)))!=0);
185        } else {
186            return restSet->contains(c);
187        }
188    }
189
190private:
191    uint16_t index[0x400];
192    int64_t shortBits[32];
193    int64_t *bits;
194
195    uint32_t latin1Bits[8];
196
197    UnicodeSet *restSet;
198};
199