1/* 2******************************************************************************* 3* Copyright (C) 2012, International Business Machines 4* Corporation and others. All Rights Reserved. 5******************************************************************************* 6* dictionarydata.h 7* 8* created on: 2012may31 9* created by: Markus W. Scherer & Maxime Serrano 10*/ 11 12#include "dictionarydata.h" 13#include "unicode/ucharstrie.h" 14#include "unicode/bytestrie.h" 15#include "unicode/udata.h" 16#include "cmemory.h" 17 18#if !UCONFIG_NO_BREAK_ITERATION 19 20U_NAMESPACE_BEGIN 21 22#ifndef CYGWINMSVC /* On Cygwin/MSVC, the error redefinition of symbols occurs.*/ 23const int32_t DictionaryData::TRIE_TYPE_BYTES; 24const int32_t DictionaryData::TRIE_TYPE_UCHARS; 25#endif 26 27DictionaryMatcher::~DictionaryMatcher() { 28} 29 30UCharsDictionaryMatcher::~UCharsDictionaryMatcher() { 31 udata_close(file); 32} 33 34int32_t UCharsDictionaryMatcher::getType() const { 35 return DictionaryData::TRIE_TYPE_UCHARS; 36} 37 38int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t *lengths, int32_t &count, int32_t limit, int32_t *values) const { 39 UCharsTrie uct(characters); 40 UChar32 c = utext_next32(text); 41 if (c < 0) { 42 return 0; 43 } 44 UStringTrieResult result = uct.first(c); 45 int32_t numChars = 1; 46 count = 0; 47 for (;;) { 48 if (USTRINGTRIE_HAS_VALUE(result)) { 49 if (count < limit) { 50 if (values != NULL) { 51 values[count] = uct.getValue(); 52 } 53 lengths[count++] = numChars; 54 } 55 if (result == USTRINGTRIE_FINAL_VALUE) { 56 break; 57 } 58 } 59 else if (result == USTRINGTRIE_NO_MATCH) { 60 break; 61 } 62 63 // TODO: why do we have a text limit if the UText knows its length? 64 if (numChars >= maxLength) { 65 break; 66 } 67 68 c = utext_next32(text); 69 if (c < 0) { 70 break; 71 } 72 ++numChars; 73 result = uct.next(c); 74 } 75 return numChars; 76} 77 78BytesDictionaryMatcher::~BytesDictionaryMatcher() { 79 udata_close(file); 80} 81 82UChar32 BytesDictionaryMatcher::transform(UChar32 c) const { 83 if ((transformConstant & DictionaryData::TRANSFORM_TYPE_MASK) == DictionaryData::TRANSFORM_TYPE_OFFSET) { 84 if (c == 0x200D) { 85 return 0xFF; 86 } else if (c == 0x200C) { 87 return 0xFE; 88 } 89 int32_t delta = c - (transformConstant & DictionaryData::TRANSFORM_OFFSET_MASK); 90 if (delta < 0 || 0xFD < delta) { 91 return U_SENTINEL; 92 } 93 return (UChar32)delta; 94 } 95 return c; 96} 97 98int32_t BytesDictionaryMatcher::getType() const { 99 return DictionaryData::TRIE_TYPE_BYTES; 100} 101 102int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t *lengths, int32_t &count, int32_t limit, int32_t *values) const { 103 BytesTrie bt(characters); 104 UChar32 c = utext_next32(text); 105 if (c < 0) { 106 return 0; 107 } 108 UStringTrieResult result = bt.first(transform(c)); 109 int32_t numChars = 1; 110 count = 0; 111 for (;;) { 112 if (USTRINGTRIE_HAS_VALUE(result)) { 113 if (count < limit) { 114 if (values != NULL) { 115 values[count] = bt.getValue(); 116 } 117 lengths[count++] = numChars; 118 } 119 if (result == USTRINGTRIE_FINAL_VALUE) { 120 break; 121 } 122 } 123 else if (result == USTRINGTRIE_NO_MATCH) { 124 break; 125 } 126 127 // TODO: why do we have a text limit if the UText knows its length? 128 if (numChars >= maxLength) { 129 break; 130 } 131 132 c = utext_next32(text); 133 if (c < 0) { 134 break; 135 } 136 ++numChars; 137 result = bt.next(transform(c)); 138 } 139 return numChars; 140} 141 142 143U_NAMESPACE_END 144 145U_NAMESPACE_USE 146 147U_CAPI int32_t U_EXPORT2 148udict_swap(const UDataSwapper *ds, const void *inData, int32_t length, 149 void *outData, UErrorCode *pErrorCode) { 150 const UDataInfo *pInfo; 151 int32_t headerSize; 152 const uint8_t *inBytes; 153 uint8_t *outBytes; 154 const int32_t *inIndexes; 155 int32_t indexes[DictionaryData::IX_COUNT]; 156 int32_t i, offset, size; 157 158 headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode); 159 if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) return 0; 160 pInfo = (const UDataInfo *)((const char *)inData + 4); 161 if (!(pInfo->dataFormat[0] == 0x44 && 162 pInfo->dataFormat[1] == 0x69 && 163 pInfo->dataFormat[2] == 0x63 && 164 pInfo->dataFormat[3] == 0x74 && 165 pInfo->formatVersion[0] == 1)) { 166 udata_printError(ds, "udict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as dictionary data\n", 167 pInfo->dataFormat[0], pInfo->dataFormat[1], pInfo->dataFormat[2], pInfo->dataFormat[3], pInfo->formatVersion[0]); 168 *pErrorCode = U_UNSUPPORTED_ERROR; 169 return 0; 170 } 171 172 inBytes = (const uint8_t *)inData + headerSize; 173 outBytes = (uint8_t *)outData + headerSize; 174 175 inIndexes = (const int32_t *)inBytes; 176 if (length >= 0) { 177 length -= headerSize; 178 if (length < (int32_t)(sizeof(indexes))) { 179 udata_printError(ds, "udict_swap(): too few bytes (%d after header) for dictionary data\n", length); 180 *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR; 181 return 0; 182 } 183 } 184 185 for (i = 0; i < DictionaryData::IX_COUNT; i++) { 186 indexes[i] = udata_readInt32(ds, inIndexes[i]); 187 } 188 189 size = indexes[DictionaryData::IX_TOTAL_SIZE]; 190 191 if (length >= 0) { 192 if (length < size) { 193 udata_printError(ds, "udict_swap(): too few bytes (%d after header) for all of dictionary data\n", length); 194 *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR; 195 return 0; 196 } 197 198 if (inBytes != outBytes) { 199 uprv_memcpy(outBytes, inBytes, size); 200 } 201 202 offset = 0; 203 ds->swapArray32(ds, inBytes, sizeof(indexes), outBytes, pErrorCode); 204 offset = (int32_t)sizeof(indexes); 205 int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK; 206 int32_t nextOffset = indexes[DictionaryData::IX_RESERVED1_OFFSET]; 207 208 if (trieType == DictionaryData::TRIE_TYPE_UCHARS) { 209 ds->swapArray16(ds, inBytes + offset, nextOffset - offset, outBytes + offset, pErrorCode); 210 } else if (trieType == DictionaryData::TRIE_TYPE_BYTES) { 211 // nothing to do 212 } else { 213 udata_printError(ds, "udict_swap(): unknown trie type!\n"); 214 *pErrorCode = U_UNSUPPORTED_ERROR; 215 return 0; 216 } 217 218 // these next two sections are empty in the current format, 219 // but may be used later. 220 offset = nextOffset; 221 nextOffset = indexes[DictionaryData::IX_RESERVED2_OFFSET]; 222 offset = nextOffset; 223 nextOffset = indexes[DictionaryData::IX_TOTAL_SIZE]; 224 offset = nextOffset; 225 } 226 return headerSize + size; 227} 228#endif 229