1ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*
2ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru**********************************************************************
3ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   Copyright (C) 2007, International Business Machines
4ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   Corporation and others.  All Rights Reserved.
5ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru**********************************************************************
6ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   file name:  trieset.cpp
7ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   encoding:   US-ASCII
8ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   tab size:   8 (not used)
9ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   indentation:4
10ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*
11ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   created on: 2007jan15
12ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   created by: Markus Scherer
13ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*
14ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   Idea for a "compiled", fast, read-only (immutable) version of a UnicodeSet
15ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   using a UTrie with 8-bit (byte) results per code point.
16ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   Modifies the trie index to make the BMP linear, and uses the original set
17ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   for supplementary code points.
18ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*/
19ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
20ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/utypes.h"
21ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicont.h"
22ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
23ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define UTRIE_GET8_LATIN1(trie) ((const uint8_t *)(trie)->data32+UTRIE_DATA_BLOCK_LENGTH)
24ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
25ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define UTRIE_GET8_FROM_LEAD(trie, c16) \
26ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    ((const uint8_t *)(trie)->data32)[ \
27ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        ((int32_t)((trie)->index[(c16)>>UTRIE_SHIFT])<<UTRIE_INDEX_SHIFT)+ \
28ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        ((c16)&UTRIE_MASK) \
29ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    ]
30ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
31ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruclass TrieSet : public UObject, public UnicodeContainable {
32ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querupublic:
33ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    TrieSet(const UnicodeSet &set, UErrorCode &errorCode)
34ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            : trieData(NULL), latin1(NULL), restSet(set.clone()) {
35ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(U_FAILURE(errorCode)) {
36ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            return;
37ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
38ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(restSet==NULL) {
39ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            errorCode=U_MEMORY_ALLOCATION_ERROR;
40ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            return;
41ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
42ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
43ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        UNewTrie *newTrie=utrie_open(NULL, NULL, 0x11000, 0, 0, TRUE);
44ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        UChar32 start, end;
45ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
46ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        UnicodeSetIterator iter(set);
47ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
48ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        while(iter.nextRange() && !iter.isString()) {
49ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            start=iter.getCodepoint();
50ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            end=iter.getCodepointEnd();
51ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(start>0xffff) {
52ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                break;
53ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
54ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(end>0xffff) {
55ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                end=0xffff;
56ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
57ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(!utrie_setRange32(newTrie, start, end+1, TRUE, TRUE)) {
58ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                errorCode=U_INTERNAL_PROGRAM_ERROR;
59ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                return;
60ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
61ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
62ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
63ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Preflight the trie length.
64ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        int32_t length=utrie_serialize(newTrie, NULL, 0, NULL, 8, &errorCode);
65ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(errorCode!=U_BUFFER_OVERFLOW_ERROR) {
66ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            return;
67ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
68ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
69ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        trieData=(uint32_t *)uprv_malloc(length);
70ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(trieData==NULL) {
71ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            errorCode=U_MEMORY_ALLOCATION_ERROR;
72ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            return;
73ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
74ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
75ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        errorCode=U_ZERO_ERROR;
76ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        utrie_serialize(newTrie, trieData, length, NULL, 8, &errorCode);
77ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        utrie_unserialize(&trie, trieData, length, &errorCode);  // TODO: Implement for 8-bit UTrie!
78ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
79ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(U_SUCCESS(errorCode)) {
80ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // Copy the indexes for surrogate code points into the BMP range
81ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // for simple access across the entire BMP.
82ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            uprv_memcpy((uint16_t *)trie.index+(0xd800>>UTRIE_SHIFT),
83ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        trie.index+UTRIE_BMP_INDEX_LENGTH,
84ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        (0x800>>UTRIE_SHIFT)*2);
85ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            latin1=UTRIE_GET8_LATIN1(&trie);
86ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
87ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
88ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        restSet.remove(0, 0xffff);
89ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
90ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
91ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    ~TrieSet() {
92ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        uprv_free(trieData);
93ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        delete restSet;
94ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
95ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
96ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UBool contains(UChar32 c) const {
97ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if((uint32_t)c<=0xff) {
98ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            return (UBool)latin1[c];
99ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        } else if((uint32_t)c<0xffff) {
100ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            return (UBool)UTRIE_GET8_FROM_LEAD(&trie, c);
101ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        } else {
102ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            return restSet->contains(c);
103ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
104ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
105ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
106ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruprivate:
107ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    uint32_t *trieData;
108ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    const uint8_t *latin1;
109ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UTrie trie;
110ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UnicodeSet *restSet;
111ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru};
112