1/* 2********************************************************************** 3* Copyright (C) 2007, International Business Machines 4* Corporation and others. All Rights Reserved. 5********************************************************************** 6* file name: trieset.cpp 7* encoding: US-ASCII 8* tab size: 8 (not used) 9* indentation:4 10* 11* created on: 2007jan15 12* created by: Markus Scherer 13* 14* Idea for a "compiled", fast, read-only (immutable) version of a UnicodeSet 15* using a UTrie with 8-bit (byte) results per code point. 16* Modifies the trie index to make the BMP linear, and uses the original set 17* for supplementary code points. 18*/ 19 20#include "unicode/utypes.h" 21#include "unicont.h" 22 23#define UTRIE_GET8_LATIN1(trie) ((const uint8_t *)(trie)->data32+UTRIE_DATA_BLOCK_LENGTH) 24 25#define UTRIE_GET8_FROM_LEAD(trie, c16) \ 26 ((const uint8_t *)(trie)->data32)[ \ 27 ((int32_t)((trie)->index[(c16)>>UTRIE_SHIFT])<<UTRIE_INDEX_SHIFT)+ \ 28 ((c16)&UTRIE_MASK) \ 29 ] 30 31class TrieSet : public UObject, public UnicodeContainable { 32public: 33 TrieSet(const UnicodeSet &set, UErrorCode &errorCode) 34 : trieData(NULL), latin1(NULL), restSet(set.clone()) { 35 if(U_FAILURE(errorCode)) { 36 return; 37 } 38 if(restSet==NULL) { 39 errorCode=U_MEMORY_ALLOCATION_ERROR; 40 return; 41 } 42 43 UNewTrie *newTrie=utrie_open(NULL, NULL, 0x11000, 0, 0, TRUE); 44 UChar32 start, end; 45 46 UnicodeSetIterator iter(set); 47 48 while(iter.nextRange() && !iter.isString()) { 49 start=iter.getCodepoint(); 50 end=iter.getCodepointEnd(); 51 if(start>0xffff) { 52 break; 53 } 54 if(end>0xffff) { 55 end=0xffff; 56 } 57 if(!utrie_setRange32(newTrie, start, end+1, TRUE, TRUE)) { 58 errorCode=U_INTERNAL_PROGRAM_ERROR; 59 return; 60 } 61 } 62 63 // Preflight the trie length. 64 int32_t length=utrie_serialize(newTrie, NULL, 0, NULL, 8, &errorCode); 65 if(errorCode!=U_BUFFER_OVERFLOW_ERROR) { 66 return; 67 } 68 69 trieData=(uint32_t *)uprv_malloc(length); 70 if(trieData==NULL) { 71 errorCode=U_MEMORY_ALLOCATION_ERROR; 72 return; 73 } 74 75 errorCode=U_ZERO_ERROR; 76 utrie_serialize(newTrie, trieData, length, NULL, 8, &errorCode); 77 utrie_unserialize(&trie, trieData, length, &errorCode); // TODO: Implement for 8-bit UTrie! 78 79 if(U_SUCCESS(errorCode)) { 80 // Copy the indexes for surrogate code points into the BMP range 81 // for simple access across the entire BMP. 82 uprv_memcpy((uint16_t *)trie.index+(0xd800>>UTRIE_SHIFT), 83 trie.index+UTRIE_BMP_INDEX_LENGTH, 84 (0x800>>UTRIE_SHIFT)*2); 85 latin1=UTRIE_GET8_LATIN1(&trie); 86 } 87 88 restSet.remove(0, 0xffff); 89 } 90 91 ~TrieSet() { 92 uprv_free(trieData); 93 delete restSet; 94 } 95 96 UBool contains(UChar32 c) const { 97 if((uint32_t)c<=0xff) { 98 return (UBool)latin1[c]; 99 } else if((uint32_t)c<0xffff) { 100 return (UBool)UTRIE_GET8_FROM_LEAD(&trie, c); 101 } else { 102 return restSet->contains(c); 103 } 104 } 105 106private: 107 uint32_t *trieData; 108 const uint8_t *latin1; 109 UTrie trie; 110 UnicodeSet *restSet; 111}; 112