1/*
2**********************************************************************
3*   Copyright (C) 2007, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5**********************************************************************
6*   file name:  trieset.cpp
7*   encoding:   US-ASCII
8*   tab size:   8 (not used)
9*   indentation:4
10*
11*   created on: 2007jan15
12*   created by: Markus Scherer
13*
14*   Idea for a "compiled", fast, read-only (immutable) version of a UnicodeSet
15*   using a UTrie with 8-bit (byte) results per code point.
16*   Modifies the trie index to make the BMP linear, and uses the original set
17*   for supplementary code points.
18*/
19
20#include "unicode/utypes.h"
21#include "unicont.h"
22
23#define UTRIE_GET8_LATIN1(trie) ((const uint8_t *)(trie)->data32+UTRIE_DATA_BLOCK_LENGTH)
24
25#define UTRIE_GET8_FROM_LEAD(trie, c16) \
26    ((const uint8_t *)(trie)->data32)[ \
27        ((int32_t)((trie)->index[(c16)>>UTRIE_SHIFT])<<UTRIE_INDEX_SHIFT)+ \
28        ((c16)&UTRIE_MASK) \
29    ]
30
31class TrieSet : public UObject, public UnicodeContainable {
32public:
33    TrieSet(const UnicodeSet &set, UErrorCode &errorCode)
34            : trieData(NULL), latin1(NULL), restSet(set.clone()) {
35        if(U_FAILURE(errorCode)) {
36            return;
37        }
38        if(restSet==NULL) {
39            errorCode=U_MEMORY_ALLOCATION_ERROR;
40            return;
41        }
42
43        UNewTrie *newTrie=utrie_open(NULL, NULL, 0x11000, 0, 0, TRUE);
44        UChar32 start, end;
45
46        UnicodeSetIterator iter(set);
47
48        while(iter.nextRange() && !iter.isString()) {
49            start=iter.getCodepoint();
50            end=iter.getCodepointEnd();
51            if(start>0xffff) {
52                break;
53            }
54            if(end>0xffff) {
55                end=0xffff;
56            }
57            if(!utrie_setRange32(newTrie, start, end+1, TRUE, TRUE)) {
58                errorCode=U_INTERNAL_PROGRAM_ERROR;
59                return;
60            }
61        }
62
63        // Preflight the trie length.
64        int32_t length=utrie_serialize(newTrie, NULL, 0, NULL, 8, &errorCode);
65        if(errorCode!=U_BUFFER_OVERFLOW_ERROR) {
66            return;
67        }
68
69        trieData=(uint32_t *)uprv_malloc(length);
70        if(trieData==NULL) {
71            errorCode=U_MEMORY_ALLOCATION_ERROR;
72            return;
73        }
74
75        errorCode=U_ZERO_ERROR;
76        utrie_serialize(newTrie, trieData, length, NULL, 8, &errorCode);
77        utrie_unserialize(&trie, trieData, length, &errorCode);  // TODO: Implement for 8-bit UTrie!
78
79        if(U_SUCCESS(errorCode)) {
80            // Copy the indexes for surrogate code points into the BMP range
81            // for simple access across the entire BMP.
82            uprv_memcpy((uint16_t *)trie.index+(0xd800>>UTRIE_SHIFT),
83                        trie.index+UTRIE_BMP_INDEX_LENGTH,
84                        (0x800>>UTRIE_SHIFT)*2);
85            latin1=UTRIE_GET8_LATIN1(&trie);
86        }
87
88        restSet.remove(0, 0xffff);
89    }
90
91    ~TrieSet() {
92        uprv_free(trieData);
93        delete restSet;
94    }
95
96    UBool contains(UChar32 c) const {
97        if((uint32_t)c<=0xff) {
98            return (UBool)latin1[c];
99        } else if((uint32_t)c<0xffff) {
100            return (UBool)UTRIE_GET8_FROM_LEAD(&trie, c);
101        } else {
102            return restSet->contains(c);
103        }
104    }
105
106private:
107    uint32_t *trieData;
108    const uint8_t *latin1;
109    UTrie trie;
110    UnicodeSet *restSet;
111};
112