1/* 2******************************************************************************* 3* Copyright (C) 2014, International Business Machines 4* Corporation and others. All Rights Reserved. 5******************************************************************************* 6* dictionarydata.h 7* 8* created on: 2012may31 9* created by: Markus W. Scherer & Maxime Serrano 10*/ 11 12#include "dictionarydata.h" 13#include "unicode/ucharstrie.h" 14#include "unicode/bytestrie.h" 15#include "unicode/udata.h" 16#include "cmemory.h" 17 18#if !UCONFIG_NO_BREAK_ITERATION 19 20U_NAMESPACE_BEGIN 21 22const int32_t DictionaryData::TRIE_TYPE_BYTES = 0; 23const int32_t DictionaryData::TRIE_TYPE_UCHARS = 1; 24const int32_t DictionaryData::TRIE_TYPE_MASK = 7; 25const int32_t DictionaryData::TRIE_HAS_VALUES = 8; 26 27const int32_t DictionaryData::TRANSFORM_NONE = 0; 28const int32_t DictionaryData::TRANSFORM_TYPE_OFFSET = 0x1000000; 29const int32_t DictionaryData::TRANSFORM_TYPE_MASK = 0x7f000000; 30const int32_t DictionaryData::TRANSFORM_OFFSET_MASK = 0x1fffff; 31 32DictionaryMatcher::~DictionaryMatcher() { 33} 34 35UCharsDictionaryMatcher::~UCharsDictionaryMatcher() { 36 udata_close(file); 37} 38 39int32_t UCharsDictionaryMatcher::getType() const { 40 return DictionaryData::TRIE_TYPE_UCHARS; 41} 42 43int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit, 44 int32_t *lengths, int32_t *cpLengths, int32_t *values, 45 int32_t *prefix) const { 46 47 UCharsTrie uct(characters); 48 int32_t startingTextIndex = utext_getNativeIndex(text); 49 int32_t wordCount = 0; 50 int32_t codePointsMatched = 0; 51 52 for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) { 53 UStringTrieResult result = (codePointsMatched == 0) ? uct.first(c) : uct.next(c); 54 int32_t lengthMatched = utext_getNativeIndex(text) - startingTextIndex; 55 codePointsMatched += 1; 56 if (USTRINGTRIE_HAS_VALUE(result)) { 57 if (wordCount < limit) { 58 if (values != NULL) { 59 values[wordCount] = uct.getValue(); 60 } 61 if (lengths != NULL) { 62 lengths[wordCount] = lengthMatched; 63 } 64 if (cpLengths != NULL) { 65 cpLengths[wordCount] = codePointsMatched; 66 } 67 ++wordCount; 68 } 69 if (result == USTRINGTRIE_FINAL_VALUE) { 70 break; 71 } 72 } 73 else if (result == USTRINGTRIE_NO_MATCH) { 74 break; 75 } 76 if (lengthMatched >= maxLength) { 77 break; 78 } 79 } 80 81 if (prefix != NULL) { 82 *prefix = codePointsMatched; 83 } 84 return wordCount; 85} 86 87BytesDictionaryMatcher::~BytesDictionaryMatcher() { 88 udata_close(file); 89} 90 91UChar32 BytesDictionaryMatcher::transform(UChar32 c) const { 92 if ((transformConstant & DictionaryData::TRANSFORM_TYPE_MASK) == DictionaryData::TRANSFORM_TYPE_OFFSET) { 93 if (c == 0x200D) { 94 return 0xFF; 95 } else if (c == 0x200C) { 96 return 0xFE; 97 } 98 int32_t delta = c - (transformConstant & DictionaryData::TRANSFORM_OFFSET_MASK); 99 if (delta < 0 || 0xFD < delta) { 100 return U_SENTINEL; 101 } 102 return (UChar32)delta; 103 } 104 return c; 105} 106 107int32_t BytesDictionaryMatcher::getType() const { 108 return DictionaryData::TRIE_TYPE_BYTES; 109} 110 111int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit, 112 int32_t *lengths, int32_t *cpLengths, int32_t *values, 113 int32_t *prefix) const { 114 BytesTrie bt(characters); 115 int32_t startingTextIndex = utext_getNativeIndex(text); 116 int32_t wordCount = 0; 117 int32_t codePointsMatched = 0; 118 119 for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) { 120 UStringTrieResult result = (codePointsMatched == 0) ? bt.first(transform(c)) : bt.next(transform(c)); 121 int32_t lengthMatched = utext_getNativeIndex(text) - startingTextIndex; 122 codePointsMatched += 1; 123 if (USTRINGTRIE_HAS_VALUE(result)) { 124 if (wordCount < limit) { 125 if (values != NULL) { 126 values[wordCount] = bt.getValue(); 127 } 128 if (lengths != NULL) { 129 lengths[wordCount] = lengthMatched; 130 } 131 if (cpLengths != NULL) { 132 cpLengths[wordCount] = codePointsMatched; 133 } 134 ++wordCount; 135 } 136 if (result == USTRINGTRIE_FINAL_VALUE) { 137 break; 138 } 139 } 140 else if (result == USTRINGTRIE_NO_MATCH) { 141 break; 142 } 143 if (lengthMatched >= maxLength) { 144 break; 145 } 146 } 147 148 if (prefix != NULL) { 149 *prefix = codePointsMatched; 150 } 151 return wordCount; 152} 153 154 155U_NAMESPACE_END 156 157U_NAMESPACE_USE 158 159U_CAPI int32_t U_EXPORT2 160udict_swap(const UDataSwapper *ds, const void *inData, int32_t length, 161 void *outData, UErrorCode *pErrorCode) { 162 const UDataInfo *pInfo; 163 int32_t headerSize; 164 const uint8_t *inBytes; 165 uint8_t *outBytes; 166 const int32_t *inIndexes; 167 int32_t indexes[DictionaryData::IX_COUNT]; 168 int32_t i, offset, size; 169 170 headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode); 171 if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) return 0; 172 pInfo = (const UDataInfo *)((const char *)inData + 4); 173 if (!(pInfo->dataFormat[0] == 0x44 && 174 pInfo->dataFormat[1] == 0x69 && 175 pInfo->dataFormat[2] == 0x63 && 176 pInfo->dataFormat[3] == 0x74 && 177 pInfo->formatVersion[0] == 1)) { 178 udata_printError(ds, "udict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as dictionary data\n", 179 pInfo->dataFormat[0], pInfo->dataFormat[1], pInfo->dataFormat[2], pInfo->dataFormat[3], pInfo->formatVersion[0]); 180 *pErrorCode = U_UNSUPPORTED_ERROR; 181 return 0; 182 } 183 184 inBytes = (const uint8_t *)inData + headerSize; 185 outBytes = (uint8_t *)outData + headerSize; 186 187 inIndexes = (const int32_t *)inBytes; 188 if (length >= 0) { 189 length -= headerSize; 190 if (length < (int32_t)(sizeof(indexes))) { 191 udata_printError(ds, "udict_swap(): too few bytes (%d after header) for dictionary data\n", length); 192 *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR; 193 return 0; 194 } 195 } 196 197 for (i = 0; i < DictionaryData::IX_COUNT; i++) { 198 indexes[i] = udata_readInt32(ds, inIndexes[i]); 199 } 200 201 size = indexes[DictionaryData::IX_TOTAL_SIZE]; 202 203 if (length >= 0) { 204 if (length < size) { 205 udata_printError(ds, "udict_swap(): too few bytes (%d after header) for all of dictionary data\n", length); 206 *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR; 207 return 0; 208 } 209 210 if (inBytes != outBytes) { 211 uprv_memcpy(outBytes, inBytes, size); 212 } 213 214 offset = 0; 215 ds->swapArray32(ds, inBytes, sizeof(indexes), outBytes, pErrorCode); 216 offset = (int32_t)sizeof(indexes); 217 int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK; 218 int32_t nextOffset = indexes[DictionaryData::IX_RESERVED1_OFFSET]; 219 220 if (trieType == DictionaryData::TRIE_TYPE_UCHARS) { 221 ds->swapArray16(ds, inBytes + offset, nextOffset - offset, outBytes + offset, pErrorCode); 222 } else if (trieType == DictionaryData::TRIE_TYPE_BYTES) { 223 // nothing to do 224 } else { 225 udata_printError(ds, "udict_swap(): unknown trie type!\n"); 226 *pErrorCode = U_UNSUPPORTED_ERROR; 227 return 0; 228 } 229 230 // these next two sections are empty in the current format, 231 // but may be used later. 232 offset = nextOffset; 233 nextOffset = indexes[DictionaryData::IX_RESERVED2_OFFSET]; 234 offset = nextOffset; 235 nextOffset = indexes[DictionaryData::IX_TOTAL_SIZE]; 236 offset = nextOffset; 237 } 238 return headerSize + size; 239} 240#endif 241