1ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 2ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru********************************************************************** 3ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* Copyright (C) 2007, International Business Machines 4ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* Corporation and others. All Rights Reserved. 5ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru********************************************************************** 6ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* file name: trieset.cpp 7ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* encoding: US-ASCII 8ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* tab size: 8 (not used) 9ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* indentation:4 10ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* 11ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* created on: 2007jan15 12ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* created by: Markus Scherer 13ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* 14ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* Idea for a "compiled", fast, read-only (immutable) version of a UnicodeSet 15ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* using a UTrie with 8-bit (byte) results per code point. 16ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* Modifies the trie index to make the BMP linear, and uses the original set 17ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* for supplementary code points. 18ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*/ 19ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 20ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/utypes.h" 21ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicont.h" 22ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 23ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define UTRIE_GET8_LATIN1(trie) ((const uint8_t *)(trie)->data32+UTRIE_DATA_BLOCK_LENGTH) 24ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 25ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define UTRIE_GET8_FROM_LEAD(trie, c16) \ 26ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ((const uint8_t *)(trie)->data32)[ \ 27ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ((int32_t)((trie)->index[(c16)>>UTRIE_SHIFT])<<UTRIE_INDEX_SHIFT)+ \ 28ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ((c16)&UTRIE_MASK) \ 29ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ] 30ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 31ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruclass TrieSet : public UObject, public UnicodeContainable { 32ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querupublic: 33ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru TrieSet(const UnicodeSet &set, UErrorCode &errorCode) 34ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru : trieData(NULL), latin1(NULL), restSet(set.clone()) { 35ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(U_FAILURE(errorCode)) { 36ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return; 37ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 38ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(restSet==NULL) { 39ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru errorCode=U_MEMORY_ALLOCATION_ERROR; 40ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return; 41ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 42ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 43ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UNewTrie *newTrie=utrie_open(NULL, NULL, 0x11000, 0, 0, TRUE); 44ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UChar32 start, end; 45ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 46ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UnicodeSetIterator iter(set); 47ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 48ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru while(iter.nextRange() && !iter.isString()) { 49ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru start=iter.getCodepoint(); 50ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru end=iter.getCodepointEnd(); 51ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(start>0xffff) { 52ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 53ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 54ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(end>0xffff) { 55ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru end=0xffff; 56ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 57ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(!utrie_setRange32(newTrie, start, end+1, TRUE, TRUE)) { 58ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru errorCode=U_INTERNAL_PROGRAM_ERROR; 59ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return; 60ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 61ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 62ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 63ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Preflight the trie length. 64ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t length=utrie_serialize(newTrie, NULL, 0, NULL, 8, &errorCode); 65ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(errorCode!=U_BUFFER_OVERFLOW_ERROR) { 66ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return; 67ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 68ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 69ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru trieData=(uint32_t *)uprv_malloc(length); 70ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(trieData==NULL) { 71ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru errorCode=U_MEMORY_ALLOCATION_ERROR; 72ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return; 73ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 74ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 75ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru errorCode=U_ZERO_ERROR; 76ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utrie_serialize(newTrie, trieData, length, NULL, 8, &errorCode); 77ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utrie_unserialize(&trie, trieData, length, &errorCode); // TODO: Implement for 8-bit UTrie! 78ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 79ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(U_SUCCESS(errorCode)) { 80ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Copy the indexes for surrogate code points into the BMP range 81ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // for simple access across the entire BMP. 82ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uprv_memcpy((uint16_t *)trie.index+(0xd800>>UTRIE_SHIFT), 83ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru trie.index+UTRIE_BMP_INDEX_LENGTH, 84ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru (0x800>>UTRIE_SHIFT)*2); 85ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru latin1=UTRIE_GET8_LATIN1(&trie); 86ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 87ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 88ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru restSet.remove(0, 0xffff); 89ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 90ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 91ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ~TrieSet() { 92ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uprv_free(trieData); 93ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru delete restSet; 94ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 95ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 96ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UBool contains(UChar32 c) const { 97ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if((uint32_t)c<=0xff) { 98ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return (UBool)latin1[c]; 99ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else if((uint32_t)c<0xffff) { 100ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return (UBool)UTRIE_GET8_FROM_LEAD(&trie, c); 101ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 102ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return restSet->contains(c); 103ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 104ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 105ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 106ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruprivate: 107ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint32_t *trieData; 108ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const uint8_t *latin1; 109ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UTrie trie; 110ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UnicodeSet *restSet; 111ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}; 112