1/*
2 ************************************************************************************
3 * Copyright (C) 2006-2015, International Business Machines Corporation
4 * and others. All Rights Reserved.
5 ************************************************************************************
6 */
7
8#include "unicode/utypes.h"
9
10#if !UCONFIG_NO_BREAK_ITERATION
11
12#include "brkeng.h"
13#include "dictbe.h"
14#include "unicode/uchar.h"
15#include "unicode/uniset.h"
16#include "unicode/chariter.h"
17#include "unicode/ures.h"
18#include "unicode/udata.h"
19#include "unicode/putil.h"
20#include "unicode/ustring.h"
21#include "unicode/uscript.h"
22#include "unicode/ucharstrie.h"
23#include "unicode/bytestrie.h"
24#include "charstr.h"
25#include "dictionarydata.h"
26#include "mutex.h"
27#include "uvector.h"
28#include "umutex.h"
29#include "uresimp.h"
30#include "ubrkimpl.h"
31
32U_NAMESPACE_BEGIN
33
34/*
35 ******************************************************************
36 */
37
38LanguageBreakEngine::LanguageBreakEngine() {
39}
40
41LanguageBreakEngine::~LanguageBreakEngine() {
42}
43
44/*
45 ******************************************************************
46 */
47
48LanguageBreakFactory::LanguageBreakFactory() {
49}
50
51LanguageBreakFactory::~LanguageBreakFactory() {
52}
53
54/*
55 ******************************************************************
56 */
57
58UnhandledEngine::UnhandledEngine(UErrorCode &/*status*/) {
59    for (int32_t i = 0; i < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0])); ++i) {
60        fHandled[i] = 0;
61    }
62}
63
64UnhandledEngine::~UnhandledEngine() {
65    for (int32_t i = 0; i < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0])); ++i) {
66        if (fHandled[i] != 0) {
67            delete fHandled[i];
68        }
69    }
70}
71
72UBool
73UnhandledEngine::handles(UChar32 c, int32_t breakType) const {
74    return (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))
75        && fHandled[breakType] != 0 && fHandled[breakType]->contains(c));
76}
77
78int32_t
79UnhandledEngine::findBreaks( UText *text,
80                                 int32_t startPos,
81                                 int32_t endPos,
82                                 UBool reverse,
83                                 int32_t breakType,
84                                 UStack &/*foundBreaks*/ ) const {
85    if (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))) {
86        UChar32 c = utext_current32(text);
87        if (reverse) {
88            while((int32_t)utext_getNativeIndex(text) > startPos && fHandled[breakType]->contains(c)) {
89                c = utext_previous32(text);
90            }
91        }
92        else {
93            while((int32_t)utext_getNativeIndex(text) < endPos && fHandled[breakType]->contains(c)) {
94                utext_next32(text);            // TODO:  recast loop to work with post-increment operations.
95                c = utext_current32(text);
96            }
97        }
98    }
99    return 0;
100}
101
102void
103UnhandledEngine::handleCharacter(UChar32 c, int32_t breakType) {
104    if (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))) {
105        if (fHandled[breakType] == 0) {
106            fHandled[breakType] = new UnicodeSet();
107            if (fHandled[breakType] == 0) {
108                return;
109            }
110        }
111        if (!fHandled[breakType]->contains(c)) {
112            UErrorCode status = U_ZERO_ERROR;
113            // Apply the entire script of the character.
114            int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT);
115            fHandled[breakType]->applyIntPropertyValue(UCHAR_SCRIPT, script, status);
116        }
117    }
118}
119
120/*
121 ******************************************************************
122 */
123
124ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/*status*/) {
125    fEngines = 0;
126}
127
128ICULanguageBreakFactory::~ICULanguageBreakFactory() {
129    if (fEngines != 0) {
130        delete fEngines;
131    }
132}
133
134U_NAMESPACE_END
135U_CDECL_BEGIN
136static void U_CALLCONV _deleteEngine(void *obj) {
137    delete (const icu::LanguageBreakEngine *) obj;
138}
139U_CDECL_END
140U_NAMESPACE_BEGIN
141
142static UMutex gBreakEngineMutex = U_MUTEX_INITIALIZER;
143
144const LanguageBreakEngine *
145ICULanguageBreakFactory::getEngineFor(UChar32 c, int32_t breakType) {
146    const LanguageBreakEngine *lbe = NULL;
147    UErrorCode  status = U_ZERO_ERROR;
148
149    Mutex m(&gBreakEngineMutex);
150
151    if (fEngines == NULL) {
152        UStack  *engines = new UStack(_deleteEngine, NULL, status);
153        if (U_FAILURE(status) || engines == NULL) {
154            // Note: no way to return error code to caller.
155            delete engines;
156            return NULL;
157        }
158        fEngines = engines;
159    } else {
160        int32_t i = fEngines->size();
161        while (--i >= 0) {
162            lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
163            if (lbe != NULL && lbe->handles(c, breakType)) {
164                return lbe;
165            }
166        }
167    }
168
169    // We didn't find an engine. Create one.
170    lbe = loadEngineFor(c, breakType);
171    if (lbe != NULL) {
172        fEngines->push((void *)lbe, status);
173    }
174    return lbe;
175}
176
177const LanguageBreakEngine *
178ICULanguageBreakFactory::loadEngineFor(UChar32 c, int32_t breakType) {
179    UErrorCode status = U_ZERO_ERROR;
180    UScriptCode code = uscript_getScript(c, &status);
181    if (U_SUCCESS(status)) {
182        DictionaryMatcher *m = loadDictionaryMatcherFor(code, breakType);
183        if (m != NULL) {
184            const LanguageBreakEngine *engine = NULL;
185            switch(code) {
186            case USCRIPT_THAI:
187                engine = new ThaiBreakEngine(m, status);
188                break;
189            case USCRIPT_LAO:
190                engine = new LaoBreakEngine(m, status);
191                break;
192            case USCRIPT_MYANMAR:
193                engine = new BurmeseBreakEngine(m, status);
194                break;
195            case USCRIPT_KHMER:
196                engine = new KhmerBreakEngine(m, status);
197                break;
198
199#if !UCONFIG_NO_NORMALIZATION
200                // CJK not available w/o normalization
201            case USCRIPT_HANGUL:
202                engine = new CjkBreakEngine(m, kKorean, status);
203                break;
204
205            // use same BreakEngine and dictionary for both Chinese and Japanese
206            case USCRIPT_HIRAGANA:
207            case USCRIPT_KATAKANA:
208            case USCRIPT_HAN:
209                engine = new CjkBreakEngine(m, kChineseJapanese, status);
210                break;
211#if 0
212            // TODO: Have to get some characters with script=common handled
213            // by CjkBreakEngine (e.g. U+309B). Simply subjecting
214            // them to CjkBreakEngine does not work. The engine has to
215            // special-case them.
216            case USCRIPT_COMMON:
217            {
218                UBlockCode block = ublock_getCode(code);
219                if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA)
220                   engine = new CjkBreakEngine(dict, kChineseJapanese, status);
221                break;
222            }
223#endif
224#endif
225
226            default:
227                break;
228            }
229            if (engine == NULL) {
230                delete m;
231            }
232            else if (U_FAILURE(status)) {
233                delete engine;
234                engine = NULL;
235            }
236            return engine;
237        }
238    }
239    return NULL;
240}
241
242DictionaryMatcher *
243ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script, int32_t /* brkType */) {
244    UErrorCode status = U_ZERO_ERROR;
245    // open root from brkitr tree.
246    UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status);
247    b = ures_getByKeyWithFallback(b, "dictionaries", b, &status);
248    int32_t dictnlength = 0;
249    const UChar *dictfname =
250        ures_getStringByKeyWithFallback(b, uscript_getShortName(script), &dictnlength, &status);
251    if (U_FAILURE(status)) {
252        ures_close(b);
253        return NULL;
254    }
255    CharString dictnbuf;
256    CharString ext;
257    const UChar *extStart = u_memrchr(dictfname, 0x002e, dictnlength);  // last dot
258    if (extStart != NULL) {
259        int32_t len = (int32_t)(extStart - dictfname);
260        ext.appendInvariantChars(UnicodeString(FALSE, extStart + 1, dictnlength - len - 1), status);
261        dictnlength = len;
262    }
263    dictnbuf.appendInvariantChars(UnicodeString(FALSE, dictfname, dictnlength), status);
264    ures_close(b);
265
266    UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext.data(), dictnbuf.data(), &status);
267    if (U_SUCCESS(status)) {
268        // build trie
269        const uint8_t *data = (const uint8_t *)udata_getMemory(file);
270        const int32_t *indexes = (const int32_t *)data;
271        const int32_t offset = indexes[DictionaryData::IX_STRING_TRIE_OFFSET];
272        const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
273        DictionaryMatcher *m = NULL;
274        if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
275            const int32_t transform = indexes[DictionaryData::IX_TRANSFORM];
276            const char *characters = (const char *)(data + offset);
277            m = new BytesDictionaryMatcher(characters, transform, file);
278        }
279        else if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
280            const UChar *characters = (const UChar *)(data + offset);
281            m = new UCharsDictionaryMatcher(characters, file);
282        }
283        if (m == NULL) {
284            // no matcher exists to take ownership - either we are an invalid
285            // type or memory allocation failed
286            udata_close(file);
287        }
288        return m;
289    } else if (dictfname != NULL) {
290        // we don't have a dictionary matcher.
291        // returning NULL here will cause us to fail to find a dictionary break engine, as expected
292        status = U_ZERO_ERROR;
293        return NULL;
294    }
295    return NULL;
296}
297
298U_NAMESPACE_END
299
300#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
301