1ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*
2ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru**********************************************************************
3f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius*   Copyright (C) 2014, International Business Machines
4ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   Corporation and others.  All Rights Reserved.
5ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru**********************************************************************
6ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   file name:  bitset.cpp
7ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   encoding:   US-ASCII
8ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   tab size:   8 (not used)
9ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   indentation:4
10ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*
11ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   created on: 2007jan15
12ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   created by: Markus Scherer
13ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*
14ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   Idea for a "compiled", fast, read-only (immutable) version of a UnicodeSet
15ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   using a folded bit set consisting of a 1k-entry index table and a
16ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   compacted array of 64-bit words.
17ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   Uses a simple hash table for compaction.
18ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   Uses the original set for supplementary code points.
19ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*/
20ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
21ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/utypes.h"
22ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicont.h"
23f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius#include "cmemory.h" // for UPRV_LENGTHOF
24ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
25ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*
26ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Hash table for up to 1k 64-bit words, for 1 bit per BMP code point.
27ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Hashes 64-bit words and maps them to 16-bit integers which are
28ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * assigned in order of new incoming words for subsequent storage
29ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * in a contiguous array.
30ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */
31ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustruct BMPBitHash : public UObject {
32ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int64_t keys[0x800];  // 2k
33ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    uint16_t values[0x800];
34ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    uint16_t reverse[0x400];
35ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    uint16_t count;
36ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    const int32_t prime=1301;  // Less than 2k.
37ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
38ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    BMPBitHash() : count(0) {
39ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Fill values[] with 0xffff.
40ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        uprv_memset(values, 0xff, sizeof(values));
41ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
42ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
43ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    /*
44ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * Map a key to an integer count.
45ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * Map at most 1k=0x400 different keys with this data structure.
46ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     */
47ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    uint16_t map(int64_t key) {
48ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        int32_t hash=(int32_t)(key>>55)&0x1ff;
49ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        hash^=(int32_t)(key>>44)&0x7ff;
50ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        hash^=(int32_t)(key>>33)&0x7ff;
51ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        hash^=(int32_t)(key>>22)&0x7ff;
52ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        hash^=(int32_t)(key>>11)&0x7ff;
53ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        hash^=(int32_t)key&0x7ff;
54ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        for(;;) {
55ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(values[hash]==0xffff) {
56ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Unused slot.
57ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                keys[hash]=key;
58ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                reverse[count]=hash;
59ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                return values[hash]=count++;
60ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            } else if(keys[hash]==key) {
61ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Found a slot with this key.
62ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                return values[hash];
63ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            } else {
64ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Used slot with a different key, move to another slot.
65ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                hash=(hash+prime)&0x7ff;
66ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
67ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
68ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
69ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
70ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    uint16_t countKeys() const { return count; }
71ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
72ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    /*
73ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * Invert the hash map: Fill an array of length countKeys() with the keys
74ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * indexed by their mapped values.
75ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     */
76ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    void invert(int64_t *k) const {
77ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        uint16_t i;
78ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
79ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        for(i=0; i<count; ++i) {
80ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            k[i]=keys[reverse[i]];
81ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
82ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
83ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru};
84ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
85ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruclass BitSet : public UObject, public UnicodeContainable {
86ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querupublic:
87ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    BitSet(const UnicodeSet &set, UErrorCode &errorCode) : bits(shortBits), restSet(set.clone()) {
88ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(U_FAILURE(errorCode)) {
89ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            return;
90ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
91ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        BMPBitHash *bitHash=new BMPBitHash;
92ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(bitHash==NULL || restSet==NULL) {
93ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            errorCode=U_MEMORY_ALLOCATION_ERROR;
94ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            return;
95ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
96ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
97ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        UnicodeSetIterator iter(set);
98ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        int64_t b;
99ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        UChar32 start, end;
100ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        int32_t prevIndex, i, j;
101ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
102ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        b=0;  // Not necessary but makes compilers happy.
103ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        prevIndex=-1;
104ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        for(;;) {
105ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(iter.nextRange() && !iter.isString()) {
106ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                start=iter.getCodepoint();
107ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                end=iter.getCodepointEnd();
108ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            } else {
109ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                start=0x10000;
110ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
111ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            i=start>>6;
112ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(prevIndex!=i) {
113ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Finish the end of the previous range.
114ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(prevIndex<0) {
115ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    prevIndex=0;
116ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                } else {
117ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    index[prevIndex++]=bitHash->map(b);
118ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
119ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Fill all-zero entries between ranges.
120ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(prevIndex<i) {
121ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    uint16_t zero=bitHash->map(0);
122ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    do {
123ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        index[prevIndex++]=zero;
124ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    } while(prevIndex<i);
125ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
126ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                b=0;
127ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
128ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(start>0xffff) {
129ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                break;
130ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
131ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            b|=~((INT64_C(1)<<(start&0x3f))-1);
132ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            j=end>>6;
133ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(i<j) {
134ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Set bits for the start of the range.
135ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                index[i++]=bitHash->map(b);
136ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Fill all-one entries inside the range.
137ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(i<j) {
138ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    uint16_t all=bitHash->map(INT64_C(0xffffffffffffffff));
139ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    do {
140ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        index[i++]=all;
141ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    } while(i<j);
142ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
143ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                b=INT64_C(0xffffffffffffffff);
144ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
145ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            /* i==j */
146ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            b&=(INT64_C(1)<<(end&0x3f))-1;
147ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            prevIndex=j;
148ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
149ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
150f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius        if(bitHash->countKeys()>UPRV_LENGTHOF(shortBits)) {
151ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            bits=(int64_t *)uprv_malloc(bitHash->countKeys()*8);
152ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
153ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(bits!=NULL) {
154ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            bitHash->invert(bits);
155ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        } else {
156ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            bits=shortBits;
157ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            errorCode=U_MEMORY_ALLOCATION_ERROR;
158ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            return;
159ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
160ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
161ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        latin1Set[0]=(uint32_t)bits[0];
162ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        latin1Set[1]=(uint32_t)(bits[0]>>32);
163ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        latin1Set[2]=(uint32_t)bits[1];
164ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        latin1Set[3]=(uint32_t)(bits[1]>>32);
165ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        latin1Set[4]=(uint32_t)bits[2];
166ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        latin1Set[5]=(uint32_t)(bits[2]>>32);
167ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        latin1Set[6]=(uint32_t)bits[3];
168ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        latin1Set[7]=(uint32_t)(bits[3]>>32);
169ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
170ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        restSet.remove(0, 0xffff);
171ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
172ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
173ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    ~BitSet() {
174ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(bits!=shortBits) {
175ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            uprv_free(bits);
176ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
177ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        delete restSet;
178ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
179ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
180ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UBool contains(UChar32 c) const {
181ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if((uint32_t)c<=0xff) {
182ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            return (UBool)((latin1Set[c>>5]&((uint32_t)1<<(c&0x1f)))!=0);
183ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        } else if((uint32_t)c<0xffff) {
184ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            return (UBool)((bits[c>>6]&(INT64_C(1)<<(c&0x3f)))!=0);
185ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        } else {
186ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            return restSet->contains(c);
187ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
188ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
189ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
190ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruprivate:
191ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    uint16_t index[0x400];
192ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int64_t shortBits[32];
193ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int64_t *bits;
194ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
195ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    uint32_t latin1Bits[8];
196ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
197ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UnicodeSet *restSet;
198ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru};
199