1/*
2*************************************************************************
3*   Copyright (C) 2016 and later: Unicode, Inc. and others.
4*   License & terms of use: http://www.unicode.org/copyright.html#License
5*************************************************************************
6*************************************************************************
7*   Copyright (C) 2014, International Business Machines
8*   Corporation and others.  All Rights Reserved.
9*************************************************************************
10*   file name:  bitset.cpp
11*   encoding:   US-ASCII
12*   tab size:   8 (not used)
13*   indentation:4
14*
15*   created on: 2007jan15
16*   created by: Markus Scherer
17*
18*   Idea for a "compiled", fast, read-only (immutable) version of a UnicodeSet
19*   using a folded bit set consisting of a 1k-entry index table and a
20*   compacted array of 64-bit words.
21*   Uses a simple hash table for compaction.
22*   Uses the original set for supplementary code points.
23*/
24
25#include "unicode/utypes.h"
26#include "unicont.h"
27#include "cmemory.h" // for UPRV_LENGTHOF
28
29/*
30 * Hash table for up to 1k 64-bit words, for 1 bit per BMP code point.
31 * Hashes 64-bit words and maps them to 16-bit integers which are
32 * assigned in order of new incoming words for subsequent storage
33 * in a contiguous array.
34 */
35struct BMPBitHash : public UObject {
36    int64_t keys[0x800];  // 2k
37    uint16_t values[0x800];
38    uint16_t reverse[0x400];
39    uint16_t count;
40    const int32_t prime=1301;  // Less than 2k.
41
42    BMPBitHash() : count(0) {
43        // Fill values[] with 0xffff.
44        uprv_memset(values, 0xff, sizeof(values));
45    }
46
47    /*
48     * Map a key to an integer count.
49     * Map at most 1k=0x400 different keys with this data structure.
50     */
51    uint16_t map(int64_t key) {
52        int32_t hash=(int32_t)(key>>55)&0x1ff;
53        hash^=(int32_t)(key>>44)&0x7ff;
54        hash^=(int32_t)(key>>33)&0x7ff;
55        hash^=(int32_t)(key>>22)&0x7ff;
56        hash^=(int32_t)(key>>11)&0x7ff;
57        hash^=(int32_t)key&0x7ff;
58        for(;;) {
59            if(values[hash]==0xffff) {
60                // Unused slot.
61                keys[hash]=key;
62                reverse[count]=hash;
63                return values[hash]=count++;
64            } else if(keys[hash]==key) {
65                // Found a slot with this key.
66                return values[hash];
67            } else {
68                // Used slot with a different key, move to another slot.
69                hash=(hash+prime)&0x7ff;
70            }
71        }
72    }
73
74    uint16_t countKeys() const { return count; }
75
76    /*
77     * Invert the hash map: Fill an array of length countKeys() with the keys
78     * indexed by their mapped values.
79     */
80    void invert(int64_t *k) const {
81        uint16_t i;
82
83        for(i=0; i<count; ++i) {
84            k[i]=keys[reverse[i]];
85        }
86    }
87};
88
89class BitSet : public UObject, public UnicodeContainable {
90public:
91    BitSet(const UnicodeSet &set, UErrorCode &errorCode) : bits(shortBits), restSet(set.clone()) {
92        if(U_FAILURE(errorCode)) {
93            return;
94        }
95        BMPBitHash *bitHash=new BMPBitHash;
96        if(bitHash==NULL || restSet==NULL) {
97            errorCode=U_MEMORY_ALLOCATION_ERROR;
98            return;
99        }
100
101        UnicodeSetIterator iter(set);
102        int64_t b;
103        UChar32 start, end;
104        int32_t prevIndex, i, j;
105
106        b=0;  // Not necessary but makes compilers happy.
107        prevIndex=-1;
108        for(;;) {
109            if(iter.nextRange() && !iter.isString()) {
110                start=iter.getCodepoint();
111                end=iter.getCodepointEnd();
112            } else {
113                start=0x10000;
114            }
115            i=start>>6;
116            if(prevIndex!=i) {
117                // Finish the end of the previous range.
118                if(prevIndex<0) {
119                    prevIndex=0;
120                } else {
121                    index[prevIndex++]=bitHash->map(b);
122                }
123                // Fill all-zero entries between ranges.
124                if(prevIndex<i) {
125                    uint16_t zero=bitHash->map(0);
126                    do {
127                        index[prevIndex++]=zero;
128                    } while(prevIndex<i);
129                }
130                b=0;
131            }
132            if(start>0xffff) {
133                break;
134            }
135            b|=~((INT64_C(1)<<(start&0x3f))-1);
136            j=end>>6;
137            if(i<j) {
138                // Set bits for the start of the range.
139                index[i++]=bitHash->map(b);
140                // Fill all-one entries inside the range.
141                if(i<j) {
142                    uint16_t all=bitHash->map(INT64_C(0xffffffffffffffff));
143                    do {
144                        index[i++]=all;
145                    } while(i<j);
146                }
147                b=INT64_C(0xffffffffffffffff);
148            }
149            /* i==j */
150            b&=(INT64_C(1)<<(end&0x3f))-1;
151            prevIndex=j;
152        }
153
154        if(bitHash->countKeys()>UPRV_LENGTHOF(shortBits)) {
155            bits=(int64_t *)uprv_malloc(bitHash->countKeys()*8);
156        }
157        if(bits!=NULL) {
158            bitHash->invert(bits);
159        } else {
160            bits=shortBits;
161            errorCode=U_MEMORY_ALLOCATION_ERROR;
162            return;
163        }
164
165        latin1Set[0]=(uint32_t)bits[0];
166        latin1Set[1]=(uint32_t)(bits[0]>>32);
167        latin1Set[2]=(uint32_t)bits[1];
168        latin1Set[3]=(uint32_t)(bits[1]>>32);
169        latin1Set[4]=(uint32_t)bits[2];
170        latin1Set[5]=(uint32_t)(bits[2]>>32);
171        latin1Set[6]=(uint32_t)bits[3];
172        latin1Set[7]=(uint32_t)(bits[3]>>32);
173
174        restSet.remove(0, 0xffff);
175    }
176
177    ~BitSet() {
178        if(bits!=shortBits) {
179            uprv_free(bits);
180        }
181        delete restSet;
182    }
183
184    UBool contains(UChar32 c) const {
185        if((uint32_t)c<=0xff) {
186            return (UBool)((latin1Set[c>>5]&((uint32_t)1<<(c&0x1f)))!=0);
187        } else if((uint32_t)c<0xffff) {
188            return (UBool)((bits[c>>6]&(INT64_C(1)<<(c&0x3f)))!=0);
189        } else {
190            return restSet->contains(c);
191        }
192    }
193
194private:
195    uint16_t index[0x400];
196    int64_t shortBits[32];
197    int64_t *bits;
198
199    uint32_t latin1Bits[8];
200
201    UnicodeSet *restSet;
202};
203