1/*
2*******************************************************************************
3* Copyright (C) 2014, International Business Machines
4* Corporation and others.  All Rights Reserved.
5*******************************************************************************
* dictionarydata.cpp
7*
8* created on: 2012may31
9* created by: Markus W. Scherer & Maxime Serrano
10*/
11
12#include "dictionarydata.h"
13#include "unicode/ucharstrie.h"
14#include "unicode/bytestrie.h"
15#include "unicode/udata.h"
16#include "cmemory.h"
17
18#if !UCONFIG_NO_BREAK_ITERATION
19
20U_NAMESPACE_BEGIN
21
22const int32_t  DictionaryData::TRIE_TYPE_BYTES = 0;
23const int32_t  DictionaryData::TRIE_TYPE_UCHARS = 1;
24const int32_t  DictionaryData::TRIE_TYPE_MASK = 7;
25const int32_t  DictionaryData::TRIE_HAS_VALUES = 8;
26
27const int32_t  DictionaryData::TRANSFORM_NONE = 0;
28const int32_t  DictionaryData::TRANSFORM_TYPE_OFFSET = 0x1000000;
29const int32_t  DictionaryData::TRANSFORM_TYPE_MASK = 0x7f000000;
30const int32_t  DictionaryData::TRANSFORM_OFFSET_MASK = 0x1fffff;
31
32DictionaryMatcher::~DictionaryMatcher() {
33}
34
35UCharsDictionaryMatcher::~UCharsDictionaryMatcher() {
36    udata_close(file);
37}
38
39int32_t UCharsDictionaryMatcher::getType() const {
40    return DictionaryData::TRIE_TYPE_UCHARS;
41}
42
43int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
44                            int32_t *lengths, int32_t *cpLengths, int32_t *values,
45                            int32_t *prefix) const {
46
47    UCharsTrie uct(characters);
48    int32_t startingTextIndex = utext_getNativeIndex(text);
49    int32_t wordCount = 0;
50    int32_t codePointsMatched = 0;
51
52    for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) {
53        UStringTrieResult result = (codePointsMatched == 0) ? uct.first(c) : uct.next(c);
54        int32_t lengthMatched = utext_getNativeIndex(text) - startingTextIndex;
55        codePointsMatched += 1;
56        if (USTRINGTRIE_HAS_VALUE(result)) {
57            if (wordCount < limit) {
58                if (values != NULL) {
59                    values[wordCount] = uct.getValue();
60                }
61                if (lengths != NULL) {
62                    lengths[wordCount] = lengthMatched;
63                }
64                if (cpLengths != NULL) {
65                    cpLengths[wordCount] = codePointsMatched;
66                }
67                ++wordCount;
68            }
69            if (result == USTRINGTRIE_FINAL_VALUE) {
70                break;
71            }
72        }
73        else if (result == USTRINGTRIE_NO_MATCH) {
74            break;
75        }
76        if (lengthMatched >= maxLength) {
77            break;
78        }
79    }
80
81    if (prefix != NULL) {
82        *prefix = codePointsMatched;
83    }
84    return wordCount;
85}
86
87BytesDictionaryMatcher::~BytesDictionaryMatcher() {
88    udata_close(file);
89}
90
91UChar32 BytesDictionaryMatcher::transform(UChar32 c) const {
92    if ((transformConstant & DictionaryData::TRANSFORM_TYPE_MASK) == DictionaryData::TRANSFORM_TYPE_OFFSET) {
93        if (c == 0x200D) {
94            return 0xFF;
95        } else if (c == 0x200C) {
96            return 0xFE;
97        }
98        int32_t delta = c - (transformConstant & DictionaryData::TRANSFORM_OFFSET_MASK);
99        if (delta < 0 || 0xFD < delta) {
100            return U_SENTINEL;
101        }
102        return (UChar32)delta;
103    }
104    return c;
105}
106
107int32_t BytesDictionaryMatcher::getType() const {
108    return DictionaryData::TRIE_TYPE_BYTES;
109}
110
111int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
112                            int32_t *lengths, int32_t *cpLengths, int32_t *values,
113                            int32_t *prefix) const {
114    BytesTrie bt(characters);
115    int32_t startingTextIndex = utext_getNativeIndex(text);
116    int32_t wordCount = 0;
117    int32_t codePointsMatched = 0;
118
119    for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) {
120        UStringTrieResult result = (codePointsMatched == 0) ? bt.first(transform(c)) : bt.next(transform(c));
121        int32_t lengthMatched = utext_getNativeIndex(text) - startingTextIndex;
122        codePointsMatched += 1;
123        if (USTRINGTRIE_HAS_VALUE(result)) {
124            if (wordCount < limit) {
125                if (values != NULL) {
126                    values[wordCount] = bt.getValue();
127                }
128                if (lengths != NULL) {
129                    lengths[wordCount] = lengthMatched;
130                }
131                if (cpLengths != NULL) {
132                    cpLengths[wordCount] = codePointsMatched;
133                }
134                ++wordCount;
135            }
136            if (result == USTRINGTRIE_FINAL_VALUE) {
137                break;
138            }
139        }
140        else if (result == USTRINGTRIE_NO_MATCH) {
141            break;
142        }
143        if (lengthMatched >= maxLength) {
144            break;
145        }
146    }
147
148    if (prefix != NULL) {
149        *prefix = codePointsMatched;
150    }
151    return wordCount;
152}
153
154
155U_NAMESPACE_END
156
157U_NAMESPACE_USE
158
159U_CAPI int32_t U_EXPORT2
160udict_swap(const UDataSwapper *ds, const void *inData, int32_t length,
161           void *outData, UErrorCode *pErrorCode) {
162    const UDataInfo *pInfo;
163    int32_t headerSize;
164    const uint8_t *inBytes;
165    uint8_t *outBytes;
166    const int32_t *inIndexes;
167    int32_t indexes[DictionaryData::IX_COUNT];
168    int32_t i, offset, size;
169
170    headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
171    if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) return 0;
172    pInfo = (const UDataInfo *)((const char *)inData + 4);
173    if (!(pInfo->dataFormat[0] == 0x44 &&
174          pInfo->dataFormat[1] == 0x69 &&
175          pInfo->dataFormat[2] == 0x63 &&
176          pInfo->dataFormat[3] == 0x74 &&
177          pInfo->formatVersion[0] == 1)) {
178        udata_printError(ds, "udict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as dictionary data\n",
179                         pInfo->dataFormat[0], pInfo->dataFormat[1], pInfo->dataFormat[2], pInfo->dataFormat[3], pInfo->formatVersion[0]);
180        *pErrorCode = U_UNSUPPORTED_ERROR;
181        return 0;
182    }
183
184    inBytes = (const uint8_t *)inData + headerSize;
185    outBytes = (uint8_t *)outData + headerSize;
186
187    inIndexes = (const int32_t *)inBytes;
188    if (length >= 0) {
189        length -= headerSize;
190        if (length < (int32_t)(sizeof(indexes))) {
191            udata_printError(ds, "udict_swap(): too few bytes (%d after header) for dictionary data\n", length);
192            *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
193            return 0;
194        }
195    }
196
197    for (i = 0; i < DictionaryData::IX_COUNT; i++) {
198        indexes[i] = udata_readInt32(ds, inIndexes[i]);
199    }
200
201    size = indexes[DictionaryData::IX_TOTAL_SIZE];
202
203    if (length >= 0) {
204        if (length < size) {
205            udata_printError(ds, "udict_swap(): too few bytes (%d after header) for all of dictionary data\n", length);
206            *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
207            return 0;
208        }
209
210        if (inBytes != outBytes) {
211            uprv_memcpy(outBytes, inBytes, size);
212        }
213
214        offset = 0;
215        ds->swapArray32(ds, inBytes, sizeof(indexes), outBytes, pErrorCode);
216        offset = (int32_t)sizeof(indexes);
217        int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
218        int32_t nextOffset = indexes[DictionaryData::IX_RESERVED1_OFFSET];
219
220        if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
221            ds->swapArray16(ds, inBytes + offset, nextOffset - offset, outBytes + offset, pErrorCode);
222        } else if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
223            // nothing to do
224        } else {
225            udata_printError(ds, "udict_swap(): unknown trie type!\n");
226            *pErrorCode = U_UNSUPPORTED_ERROR;
227            return 0;
228        }
229
230        // these next two sections are empty in the current format,
231        // but may be used later.
232        offset = nextOffset;
233        nextOffset = indexes[DictionaryData::IX_RESERVED2_OFFSET];
234        offset = nextOffset;
235        nextOffset = indexes[DictionaryData::IX_TOTAL_SIZE];
236        offset = nextOffset;
237    }
238    return headerSize + size;
239}
240#endif
241