1/* 2******************************************************************************* 3* Copyright (C) 2013, International Business Machines 4* Corporation and others. All Rights Reserved. 5******************************************************************************* 6* dictionarydata.h 7* 8* created on: 2012may31 9* created by: Markus W. Scherer & Maxime Serrano 10*/ 11 12#include "dictionarydata.h" 13#include "unicode/ucharstrie.h" 14#include "unicode/bytestrie.h" 15#include "unicode/udata.h" 16#include "cmemory.h" 17 18#if !UCONFIG_NO_BREAK_ITERATION 19 20U_NAMESPACE_BEGIN 21 22const int32_t DictionaryData::TRIE_TYPE_BYTES = 0; 23const int32_t DictionaryData::TRIE_TYPE_UCHARS = 1; 24const int32_t DictionaryData::TRIE_TYPE_MASK = 7; 25const int32_t DictionaryData::TRIE_HAS_VALUES = 8; 26 27const int32_t DictionaryData::TRANSFORM_NONE = 0; 28const int32_t DictionaryData::TRANSFORM_TYPE_OFFSET = 0x1000000; 29const int32_t DictionaryData::TRANSFORM_TYPE_MASK = 0x7f000000; 30const int32_t DictionaryData::TRANSFORM_OFFSET_MASK = 0x1fffff; 31 32DictionaryMatcher::~DictionaryMatcher() { 33} 34 35UCharsDictionaryMatcher::~UCharsDictionaryMatcher() { 36 udata_close(file); 37} 38 39int32_t UCharsDictionaryMatcher::getType() const { 40 return DictionaryData::TRIE_TYPE_UCHARS; 41} 42 43int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t *lengths, int32_t &count, int32_t limit, int32_t *values) const { 44 UCharsTrie uct(characters); 45 UChar32 c = utext_next32(text); 46 if (c < 0) { 47 return 0; 48 } 49 UStringTrieResult result = uct.first(c); 50 int32_t numChars = 1; 51 count = 0; 52 for (;;) { 53 if (USTRINGTRIE_HAS_VALUE(result)) { 54 if (count < limit) { 55 if (values != NULL) { 56 values[count] = uct.getValue(); 57 } 58 lengths[count++] = numChars; 59 } 60 if (result == USTRINGTRIE_FINAL_VALUE) { 61 break; 62 } 63 } 64 else if (result == USTRINGTRIE_NO_MATCH) { 65 break; 66 } 67 68 // TODO: why do we have a text limit if the UText knows its length? 69 if (numChars >= maxLength) { 70 break; 71 } 72 73 c = utext_next32(text); 74 if (c < 0) { 75 break; 76 } 77 ++numChars; 78 result = uct.next(c); 79 } 80 return numChars; 81} 82 83BytesDictionaryMatcher::~BytesDictionaryMatcher() { 84 udata_close(file); 85} 86 87UChar32 BytesDictionaryMatcher::transform(UChar32 c) const { 88 if ((transformConstant & DictionaryData::TRANSFORM_TYPE_MASK) == DictionaryData::TRANSFORM_TYPE_OFFSET) { 89 if (c == 0x200D) { 90 return 0xFF; 91 } else if (c == 0x200C) { 92 return 0xFE; 93 } 94 int32_t delta = c - (transformConstant & DictionaryData::TRANSFORM_OFFSET_MASK); 95 if (delta < 0 || 0xFD < delta) { 96 return U_SENTINEL; 97 } 98 return (UChar32)delta; 99 } 100 return c; 101} 102 103int32_t BytesDictionaryMatcher::getType() const { 104 return DictionaryData::TRIE_TYPE_BYTES; 105} 106 107int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t *lengths, int32_t &count, int32_t limit, int32_t *values) const { 108 BytesTrie bt(characters); 109 UChar32 c = utext_next32(text); 110 if (c < 0) { 111 return 0; 112 } 113 UStringTrieResult result = bt.first(transform(c)); 114 int32_t numChars = 1; 115 count = 0; 116 for (;;) { 117 if (USTRINGTRIE_HAS_VALUE(result)) { 118 if (count < limit) { 119 if (values != NULL) { 120 values[count] = bt.getValue(); 121 } 122 lengths[count++] = numChars; 123 } 124 if (result == USTRINGTRIE_FINAL_VALUE) { 125 break; 126 } 127 } 128 else if (result == USTRINGTRIE_NO_MATCH) { 129 break; 130 } 131 132 // TODO: why do we have a text limit if the UText knows its length? 133 if (numChars >= maxLength) { 134 break; 135 } 136 137 c = utext_next32(text); 138 if (c < 0) { 139 break; 140 } 141 ++numChars; 142 result = bt.next(transform(c)); 143 } 144 return numChars; 145} 146 147 148U_NAMESPACE_END 149 150U_NAMESPACE_USE 151 152U_CAPI int32_t U_EXPORT2 153udict_swap(const UDataSwapper *ds, const void *inData, int32_t length, 154 void *outData, UErrorCode *pErrorCode) { 155 const UDataInfo *pInfo; 156 int32_t headerSize; 157 const uint8_t *inBytes; 158 uint8_t *outBytes; 159 const int32_t *inIndexes; 160 int32_t indexes[DictionaryData::IX_COUNT]; 161 int32_t i, offset, size; 162 163 headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode); 164 if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) return 0; 165 pInfo = (const UDataInfo *)((const char *)inData + 4); 166 if (!(pInfo->dataFormat[0] == 0x44 && 167 pInfo->dataFormat[1] == 0x69 && 168 pInfo->dataFormat[2] == 0x63 && 169 pInfo->dataFormat[3] == 0x74 && 170 pInfo->formatVersion[0] == 1)) { 171 udata_printError(ds, "udict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as dictionary data\n", 172 pInfo->dataFormat[0], pInfo->dataFormat[1], pInfo->dataFormat[2], pInfo->dataFormat[3], pInfo->formatVersion[0]); 173 *pErrorCode = U_UNSUPPORTED_ERROR; 174 return 0; 175 } 176 177 inBytes = (const uint8_t *)inData + headerSize; 178 outBytes = (uint8_t *)outData + headerSize; 179 180 inIndexes = (const int32_t *)inBytes; 181 if (length >= 0) { 182 length -= headerSize; 183 if (length < (int32_t)(sizeof(indexes))) { 184 udata_printError(ds, "udict_swap(): too few bytes (%d after header) for dictionary data\n", length); 185 *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR; 186 return 0; 187 } 188 } 189 190 for (i = 0; i < DictionaryData::IX_COUNT; i++) { 191 indexes[i] = udata_readInt32(ds, inIndexes[i]); 192 } 193 194 size = indexes[DictionaryData::IX_TOTAL_SIZE]; 195 196 if (length >= 0) { 197 if (length < size) { 198 udata_printError(ds, "udict_swap(): too few bytes (%d after header) for all of dictionary data\n", length); 199 *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR; 200 return 0; 201 } 202 203 if (inBytes != outBytes) { 204 uprv_memcpy(outBytes, inBytes, size); 205 } 206 207 offset = 0; 208 ds->swapArray32(ds, inBytes, sizeof(indexes), outBytes, pErrorCode); 209 offset = (int32_t)sizeof(indexes); 210 int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK; 211 int32_t nextOffset = indexes[DictionaryData::IX_RESERVED1_OFFSET]; 212 213 if (trieType == DictionaryData::TRIE_TYPE_UCHARS) { 214 ds->swapArray16(ds, inBytes + offset, nextOffset - offset, outBytes + offset, pErrorCode); 215 } else if (trieType == DictionaryData::TRIE_TYPE_BYTES) { 216 // nothing to do 217 } else { 218 udata_printError(ds, "udict_swap(): unknown trie type!\n"); 219 *pErrorCode = U_UNSUPPORTED_ERROR; 220 return 0; 221 } 222 223 // these next two sections are empty in the current format, 224 // but may be used later. 225 offset = nextOffset; 226 nextOffset = indexes[DictionaryData::IX_RESERVED2_OFFSET]; 227 offset = nextOffset; 228 nextOffset = indexes[DictionaryData::IX_TOTAL_SIZE]; 229 offset = nextOffset; 230 } 231 return headerSize + size; 232} 233#endif 234