1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5* Copyright (C) 2014-2016, International Business Machines
6* Corporation and others.  All Rights Reserved.
7*******************************************************************************
8* dictionarydata.h
9*
10* created on: 2012may31
11* created by: Markus W. Scherer & Maxime Serrano
12*/
13
14#include "dictionarydata.h"
15#include "unicode/ucharstrie.h"
16#include "unicode/bytestrie.h"
17#include "unicode/udata.h"
18#include "cmemory.h"
19
20#if !UCONFIG_NO_BREAK_ITERATION
21
22U_NAMESPACE_BEGIN
23
24const int32_t  DictionaryData::TRIE_TYPE_BYTES = 0;
25const int32_t  DictionaryData::TRIE_TYPE_UCHARS = 1;
26const int32_t  DictionaryData::TRIE_TYPE_MASK = 7;
27const int32_t  DictionaryData::TRIE_HAS_VALUES = 8;
28
29const int32_t  DictionaryData::TRANSFORM_NONE = 0;
30const int32_t  DictionaryData::TRANSFORM_TYPE_OFFSET = 0x1000000;
31const int32_t  DictionaryData::TRANSFORM_TYPE_MASK = 0x7f000000;
32const int32_t  DictionaryData::TRANSFORM_OFFSET_MASK = 0x1fffff;
33
34DictionaryMatcher::~DictionaryMatcher() {
35}
36
37UCharsDictionaryMatcher::~UCharsDictionaryMatcher() {
38    udata_close(file);
39}
40
41int32_t UCharsDictionaryMatcher::getType() const {
42    return DictionaryData::TRIE_TYPE_UCHARS;
43}
44
45int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
46                            int32_t *lengths, int32_t *cpLengths, int32_t *values,
47                            int32_t *prefix) const {
48
49    UCharsTrie uct(characters);
50    int32_t startingTextIndex = (int32_t)utext_getNativeIndex(text);
51    int32_t wordCount = 0;
52    int32_t codePointsMatched = 0;
53
54    for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) {
55        UStringTrieResult result = (codePointsMatched == 0) ? uct.first(c) : uct.next(c);
56        int32_t lengthMatched = (int32_t)utext_getNativeIndex(text) - startingTextIndex;
57        codePointsMatched += 1;
58        if (USTRINGTRIE_HAS_VALUE(result)) {
59            if (wordCount < limit) {
60                if (values != NULL) {
61                    values[wordCount] = uct.getValue();
62                }
63                if (lengths != NULL) {
64                    lengths[wordCount] = lengthMatched;
65                }
66                if (cpLengths != NULL) {
67                    cpLengths[wordCount] = codePointsMatched;
68                }
69                ++wordCount;
70            }
71            if (result == USTRINGTRIE_FINAL_VALUE) {
72                break;
73            }
74        }
75        else if (result == USTRINGTRIE_NO_MATCH) {
76            break;
77        }
78        if (lengthMatched >= maxLength) {
79            break;
80        }
81    }
82
83    if (prefix != NULL) {
84        *prefix = codePointsMatched;
85    }
86    return wordCount;
87}
88
89BytesDictionaryMatcher::~BytesDictionaryMatcher() {
90    udata_close(file);
91}
92
93UChar32 BytesDictionaryMatcher::transform(UChar32 c) const {
94    if ((transformConstant & DictionaryData::TRANSFORM_TYPE_MASK) == DictionaryData::TRANSFORM_TYPE_OFFSET) {
95        if (c == 0x200D) {
96            return 0xFF;
97        } else if (c == 0x200C) {
98            return 0xFE;
99        }
100        int32_t delta = c - (transformConstant & DictionaryData::TRANSFORM_OFFSET_MASK);
101        if (delta < 0 || 0xFD < delta) {
102            return U_SENTINEL;
103        }
104        return (UChar32)delta;
105    }
106    return c;
107}
108
109int32_t BytesDictionaryMatcher::getType() const {
110    return DictionaryData::TRIE_TYPE_BYTES;
111}
112
113int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
114                            int32_t *lengths, int32_t *cpLengths, int32_t *values,
115                            int32_t *prefix) const {
116    BytesTrie bt(characters);
117    int32_t startingTextIndex = (int32_t)utext_getNativeIndex(text);
118    int32_t wordCount = 0;
119    int32_t codePointsMatched = 0;
120
121    for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) {
122        UStringTrieResult result = (codePointsMatched == 0) ? bt.first(transform(c)) : bt.next(transform(c));
123        int32_t lengthMatched = (int32_t)utext_getNativeIndex(text) - startingTextIndex;
124        codePointsMatched += 1;
125        if (USTRINGTRIE_HAS_VALUE(result)) {
126            if (wordCount < limit) {
127                if (values != NULL) {
128                    values[wordCount] = bt.getValue();
129                }
130                if (lengths != NULL) {
131                    lengths[wordCount] = lengthMatched;
132                }
133                if (cpLengths != NULL) {
134                    cpLengths[wordCount] = codePointsMatched;
135                }
136                ++wordCount;
137            }
138            if (result == USTRINGTRIE_FINAL_VALUE) {
139                break;
140            }
141        }
142        else if (result == USTRINGTRIE_NO_MATCH) {
143            break;
144        }
145        if (lengthMatched >= maxLength) {
146            break;
147        }
148    }
149
150    if (prefix != NULL) {
151        *prefix = codePointsMatched;
152    }
153    return wordCount;
154}
155
156
157U_NAMESPACE_END
158
159U_NAMESPACE_USE
160
161U_CAPI int32_t U_EXPORT2
162udict_swap(const UDataSwapper *ds, const void *inData, int32_t length,
163           void *outData, UErrorCode *pErrorCode) {
164    const UDataInfo *pInfo;
165    int32_t headerSize;
166    const uint8_t *inBytes;
167    uint8_t *outBytes;
168    const int32_t *inIndexes;
169    int32_t indexes[DictionaryData::IX_COUNT];
170    int32_t i, offset, size;
171
172    headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
173    if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) return 0;
174    pInfo = (const UDataInfo *)((const char *)inData + 4);
175    if (!(pInfo->dataFormat[0] == 0x44 &&
176          pInfo->dataFormat[1] == 0x69 &&
177          pInfo->dataFormat[2] == 0x63 &&
178          pInfo->dataFormat[3] == 0x74 &&
179          pInfo->formatVersion[0] == 1)) {
180        udata_printError(ds, "udict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as dictionary data\n",
181                         pInfo->dataFormat[0], pInfo->dataFormat[1], pInfo->dataFormat[2], pInfo->dataFormat[3], pInfo->formatVersion[0]);
182        *pErrorCode = U_UNSUPPORTED_ERROR;
183        return 0;
184    }
185
186    inBytes = (const uint8_t *)inData + headerSize;
187    outBytes = (uint8_t *)outData + headerSize;
188
189    inIndexes = (const int32_t *)inBytes;
190    if (length >= 0) {
191        length -= headerSize;
192        if (length < (int32_t)(sizeof(indexes))) {
193            udata_printError(ds, "udict_swap(): too few bytes (%d after header) for dictionary data\n", length);
194            *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
195            return 0;
196        }
197    }
198
199    for (i = 0; i < DictionaryData::IX_COUNT; i++) {
200        indexes[i] = udata_readInt32(ds, inIndexes[i]);
201    }
202
203    size = indexes[DictionaryData::IX_TOTAL_SIZE];
204
205    if (length >= 0) {
206        if (length < size) {
207            udata_printError(ds, "udict_swap(): too few bytes (%d after header) for all of dictionary data\n", length);
208            *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
209            return 0;
210        }
211
212        if (inBytes != outBytes) {
213            uprv_memcpy(outBytes, inBytes, size);
214        }
215
216        offset = 0;
217        ds->swapArray32(ds, inBytes, sizeof(indexes), outBytes, pErrorCode);
218        offset = (int32_t)sizeof(indexes);
219        int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
220        int32_t nextOffset = indexes[DictionaryData::IX_RESERVED1_OFFSET];
221
222        if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
223            ds->swapArray16(ds, inBytes + offset, nextOffset - offset, outBytes + offset, pErrorCode);
224        } else if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
225            // nothing to do
226        } else {
227            udata_printError(ds, "udict_swap(): unknown trie type!\n");
228            *pErrorCode = U_UNSUPPORTED_ERROR;
229            return 0;
230        }
231
232        // these next two sections are empty in the current format,
233        // but may be used later.
234        offset = nextOffset;
235        nextOffset = indexes[DictionaryData::IX_RESERVED2_OFFSET];
236        offset = nextOffset;
237        nextOffset = indexes[DictionaryData::IX_TOTAL_SIZE];
238        offset = nextOffset;
239    }
240    return headerSize + size;
241}
242#endif
243