1/*
2*******************************************************************************
3* Copyright (C) 2013, International Business Machines
4* Corporation and others.  All Rights Reserved.
5*******************************************************************************
* dictionarydata.cpp
7*
8* created on: 2012may31
9* created by: Markus W. Scherer & Maxime Serrano
10*/
11
12#include "dictionarydata.h"
13#include "unicode/ucharstrie.h"
14#include "unicode/bytestrie.h"
15#include "unicode/udata.h"
16#include "cmemory.h"
17
18#if !UCONFIG_NO_BREAK_ITERATION
19
20U_NAMESPACE_BEGIN
21
22const int32_t  DictionaryData::TRIE_TYPE_BYTES = 0;
23const int32_t  DictionaryData::TRIE_TYPE_UCHARS = 1;
24const int32_t  DictionaryData::TRIE_TYPE_MASK = 7;
25const int32_t  DictionaryData::TRIE_HAS_VALUES = 8;
26
27const int32_t  DictionaryData::TRANSFORM_NONE = 0;
28const int32_t  DictionaryData::TRANSFORM_TYPE_OFFSET = 0x1000000;
29const int32_t  DictionaryData::TRANSFORM_TYPE_MASK = 0x7f000000;
30const int32_t  DictionaryData::TRANSFORM_OFFSET_MASK = 0x1fffff;
31
32DictionaryMatcher::~DictionaryMatcher() {
33}
34
35UCharsDictionaryMatcher::~UCharsDictionaryMatcher() {
36    udata_close(file);
37}
38
39int32_t UCharsDictionaryMatcher::getType() const {
40    return DictionaryData::TRIE_TYPE_UCHARS;
41}
42
43int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t *lengths, int32_t &count, int32_t limit, int32_t *values) const {
44    UCharsTrie uct(characters);
45    UChar32 c = utext_next32(text);
46    if (c < 0) {
47        return 0;
48    }
49    UStringTrieResult result = uct.first(c);
50    int32_t numChars = 1;
51    count = 0;
52    for (;;) {
53        if (USTRINGTRIE_HAS_VALUE(result)) {
54            if (count < limit) {
55                if (values != NULL) {
56                    values[count] = uct.getValue();
57                }
58                lengths[count++] = numChars;
59            }
60            if (result == USTRINGTRIE_FINAL_VALUE) {
61                break;
62            }
63        }
64        else if (result == USTRINGTRIE_NO_MATCH) {
65            break;
66        }
67
68        // TODO: why do we have a text limit if the UText knows its length?
69        if (numChars >= maxLength) {
70            break;
71        }
72
73        c = utext_next32(text);
74        if (c < 0) {
75            break;
76        }
77        ++numChars;
78        result = uct.next(c);
79    }
80    return numChars;
81}
82
83BytesDictionaryMatcher::~BytesDictionaryMatcher() {
84    udata_close(file);
85}
86
87UChar32 BytesDictionaryMatcher::transform(UChar32 c) const {
88    if ((transformConstant & DictionaryData::TRANSFORM_TYPE_MASK) == DictionaryData::TRANSFORM_TYPE_OFFSET) {
89        if (c == 0x200D) {
90            return 0xFF;
91        } else if (c == 0x200C) {
92            return 0xFE;
93        }
94        int32_t delta = c - (transformConstant & DictionaryData::TRANSFORM_OFFSET_MASK);
95        if (delta < 0 || 0xFD < delta) {
96            return U_SENTINEL;
97        }
98        return (UChar32)delta;
99    }
100    return c;
101}
102
103int32_t BytesDictionaryMatcher::getType() const {
104    return DictionaryData::TRIE_TYPE_BYTES;
105}
106
107int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t *lengths, int32_t &count, int32_t limit, int32_t *values) const {
108    BytesTrie bt(characters);
109    UChar32 c = utext_next32(text);
110    if (c < 0) {
111        return 0;
112    }
113    UStringTrieResult result = bt.first(transform(c));
114    int32_t numChars = 1;
115    count = 0;
116    for (;;) {
117        if (USTRINGTRIE_HAS_VALUE(result)) {
118            if (count < limit) {
119                if (values != NULL) {
120                    values[count] = bt.getValue();
121            }
122                lengths[count++] = numChars;
123            }
124            if (result == USTRINGTRIE_FINAL_VALUE) {
125                break;
126            }
127        }
128        else if (result == USTRINGTRIE_NO_MATCH) {
129            break;
130        }
131
132        // TODO: why do we have a text limit if the UText knows its length?
133        if (numChars >= maxLength) {
134            break;
135        }
136
137        c = utext_next32(text);
138        if (c < 0) {
139            break;
140        }
141        ++numChars;
142        result = bt.next(transform(c));
143    }
144    return numChars;
145}
146
147
148U_NAMESPACE_END
149
150U_NAMESPACE_USE
151
152U_CAPI int32_t U_EXPORT2
153udict_swap(const UDataSwapper *ds, const void *inData, int32_t length,
154           void *outData, UErrorCode *pErrorCode) {
155    const UDataInfo *pInfo;
156    int32_t headerSize;
157    const uint8_t *inBytes;
158    uint8_t *outBytes;
159    const int32_t *inIndexes;
160    int32_t indexes[DictionaryData::IX_COUNT];
161    int32_t i, offset, size;
162
163    headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
164    if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) return 0;
165    pInfo = (const UDataInfo *)((const char *)inData + 4);
166    if (!(pInfo->dataFormat[0] == 0x44 &&
167          pInfo->dataFormat[1] == 0x69 &&
168          pInfo->dataFormat[2] == 0x63 &&
169          pInfo->dataFormat[3] == 0x74 &&
170          pInfo->formatVersion[0] == 1)) {
171        udata_printError(ds, "udict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as dictionary data\n",
172                         pInfo->dataFormat[0], pInfo->dataFormat[1], pInfo->dataFormat[2], pInfo->dataFormat[3], pInfo->formatVersion[0]);
173        *pErrorCode = U_UNSUPPORTED_ERROR;
174        return 0;
175    }
176
177    inBytes = (const uint8_t *)inData + headerSize;
178    outBytes = (uint8_t *)outData + headerSize;
179
180    inIndexes = (const int32_t *)inBytes;
181    if (length >= 0) {
182        length -= headerSize;
183        if (length < (int32_t)(sizeof(indexes))) {
184            udata_printError(ds, "udict_swap(): too few bytes (%d after header) for dictionary data\n", length);
185            *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
186            return 0;
187        }
188    }
189
190    for (i = 0; i < DictionaryData::IX_COUNT; i++) {
191        indexes[i] = udata_readInt32(ds, inIndexes[i]);
192    }
193
194    size = indexes[DictionaryData::IX_TOTAL_SIZE];
195
196    if (length >= 0) {
197        if (length < size) {
198            udata_printError(ds, "udict_swap(): too few bytes (%d after header) for all of dictionary data\n", length);
199            *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
200            return 0;
201        }
202
203        if (inBytes != outBytes) {
204            uprv_memcpy(outBytes, inBytes, size);
205        }
206
207        offset = 0;
208        ds->swapArray32(ds, inBytes, sizeof(indexes), outBytes, pErrorCode);
209        offset = (int32_t)sizeof(indexes);
210        int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
211        int32_t nextOffset = indexes[DictionaryData::IX_RESERVED1_OFFSET];
212
213        if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
214            ds->swapArray16(ds, inBytes + offset, nextOffset - offset, outBytes + offset, pErrorCode);
215        } else if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
216            // nothing to do
217        } else {
218            udata_printError(ds, "udict_swap(): unknown trie type!\n");
219            *pErrorCode = U_UNSUPPORTED_ERROR;
220            return 0;
221        }
222
223        // these next two sections are empty in the current format,
224        // but may be used later.
225        offset = nextOffset;
226        nextOffset = indexes[DictionaryData::IX_RESERVED2_OFFSET];
227        offset = nextOffset;
228        nextOffset = indexes[DictionaryData::IX_TOTAL_SIZE];
229        offset = nextOffset;
230    }
231    return headerSize + size;
232}
233#endif
234