1/*
2 ************************************************************************************
3 * Copyright (C) 2006-2014, International Business Machines Corporation
4 * and others. All Rights Reserved.
5 ************************************************************************************
6 */
7
8#include "unicode/utypes.h"
9
10#if !UCONFIG_NO_BREAK_ITERATION
11
12#include "brkeng.h"
13#include "dictbe.h"
14#include "unicode/uchar.h"
15#include "unicode/uniset.h"
16#include "unicode/chariter.h"
17#include "unicode/ures.h"
18#include "unicode/udata.h"
19#include "unicode/putil.h"
20#include "unicode/ustring.h"
21#include "unicode/uscript.h"
22#include "unicode/ucharstrie.h"
23#include "unicode/bytestrie.h"
24#include "charstr.h"
25#include "dictionarydata.h"
26#include "uvector.h"
27#include "umutex.h"
28#include "uresimp.h"
29#include "ubrkimpl.h"
30
31U_NAMESPACE_BEGIN
32
33/*
34 ******************************************************************
35 */
36
37LanguageBreakEngine::LanguageBreakEngine() {
38}
39
40LanguageBreakEngine::~LanguageBreakEngine() {
41}
42
43/*
44 ******************************************************************
45 */
46
47LanguageBreakFactory::LanguageBreakFactory() {
48}
49
50LanguageBreakFactory::~LanguageBreakFactory() {
51}
52
53/*
54 ******************************************************************
55 */
56
57UnhandledEngine::UnhandledEngine(UErrorCode &/*status*/) {
58    for (int32_t i = 0; i < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0])); ++i) {
59        fHandled[i] = 0;
60    }
61}
62
63UnhandledEngine::~UnhandledEngine() {
64    for (int32_t i = 0; i < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0])); ++i) {
65        if (fHandled[i] != 0) {
66            delete fHandled[i];
67        }
68    }
69}
70
71UBool
72UnhandledEngine::handles(UChar32 c, int32_t breakType) const {
73    return (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))
74        && fHandled[breakType] != 0 && fHandled[breakType]->contains(c));
75}
76
77int32_t
78UnhandledEngine::findBreaks( UText *text,
79                                 int32_t startPos,
80                                 int32_t endPos,
81                                 UBool reverse,
82                                 int32_t breakType,
83                                 UStack &/*foundBreaks*/ ) const {
84    if (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))) {
85        UChar32 c = utext_current32(text);
86        if (reverse) {
87            while((int32_t)utext_getNativeIndex(text) > startPos && fHandled[breakType]->contains(c)) {
88                c = utext_previous32(text);
89            }
90        }
91        else {
92            while((int32_t)utext_getNativeIndex(text) < endPos && fHandled[breakType]->contains(c)) {
93                utext_next32(text);            // TODO:  recast loop to work with post-increment operations.
94                c = utext_current32(text);
95            }
96        }
97    }
98    return 0;
99}
100
101void
102UnhandledEngine::handleCharacter(UChar32 c, int32_t breakType) {
103    if (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))) {
104        if (fHandled[breakType] == 0) {
105            fHandled[breakType] = new UnicodeSet();
106            if (fHandled[breakType] == 0) {
107                return;
108            }
109        }
110        if (!fHandled[breakType]->contains(c)) {
111            UErrorCode status = U_ZERO_ERROR;
112            // Apply the entire script of the character.
113            int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT);
114            fHandled[breakType]->applyIntPropertyValue(UCHAR_SCRIPT, script, status);
115        }
116    }
117}
118
119/*
120 ******************************************************************
121 */
122
123ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/*status*/) {
124    fEngines = 0;
125}
126
127ICULanguageBreakFactory::~ICULanguageBreakFactory() {
128    if (fEngines != 0) {
129        delete fEngines;
130    }
131}
132
133U_NAMESPACE_END
134U_CDECL_BEGIN
135static void U_CALLCONV _deleteEngine(void *obj) {
136    delete (const icu::LanguageBreakEngine *) obj;
137}
138U_CDECL_END
139U_NAMESPACE_BEGIN
140
141const LanguageBreakEngine *
142ICULanguageBreakFactory::getEngineFor(UChar32 c, int32_t breakType) {
143    UBool       needsInit;
144    int32_t     i;
145    const LanguageBreakEngine *lbe = NULL;
146    UErrorCode  status = U_ZERO_ERROR;
147
148    // TODO: The global mutex should not be used.
149    // The global mutex should only be used for short periods.
150    // A ICULanguageBreakFactory specific mutex should be used.
151    umtx_lock(NULL);
152    needsInit = (UBool)(fEngines == NULL);
153    if (!needsInit) {
154        i = fEngines->size();
155        while (--i >= 0) {
156            lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
157            if (lbe != NULL && lbe->handles(c, breakType)) {
158                break;
159            }
160            lbe = NULL;
161        }
162    }
163    umtx_unlock(NULL);
164
165    if (lbe != NULL) {
166        return lbe;
167    }
168
169    if (needsInit) {
170        UStack  *engines = new UStack(_deleteEngine, NULL, status);
171        if (U_SUCCESS(status) && engines == NULL) {
172            status = U_MEMORY_ALLOCATION_ERROR;
173        }
174        else if (U_FAILURE(status)) {
175            delete engines;
176            engines = NULL;
177        }
178        else {
179            umtx_lock(NULL);
180            if (fEngines == NULL) {
181                fEngines = engines;
182                engines = NULL;
183            }
184            umtx_unlock(NULL);
185            delete engines;
186        }
187    }
188
189    if (fEngines == NULL) {
190        return NULL;
191    }
192
193    // We didn't find an engine the first time through, or there was no
194    // stack. Create an engine.
195    const LanguageBreakEngine *newlbe = loadEngineFor(c, breakType);
196
197    // Now get the lock, and see if someone else has created it in the
198    // meantime
199    umtx_lock(NULL);
200    i = fEngines->size();
201    while (--i >= 0) {
202        lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
203        if (lbe != NULL && lbe->handles(c, breakType)) {
204            break;
205        }
206        lbe = NULL;
207    }
208    if (lbe == NULL && newlbe != NULL) {
209        fEngines->push((void *)newlbe, status);
210        lbe = newlbe;
211        newlbe = NULL;
212    }
213    umtx_unlock(NULL);
214
215    delete newlbe;
216
217    return lbe;
218}
219
220const LanguageBreakEngine *
221ICULanguageBreakFactory::loadEngineFor(UChar32 c, int32_t breakType) {
222    UErrorCode status = U_ZERO_ERROR;
223    UScriptCode code = uscript_getScript(c, &status);
224    if (U_SUCCESS(status)) {
225        DictionaryMatcher *m = loadDictionaryMatcherFor(code, breakType);
226        if (m != NULL) {
227            const LanguageBreakEngine *engine = NULL;
228            switch(code) {
229            case USCRIPT_THAI:
230                engine = new ThaiBreakEngine(m, status);
231                break;
232            case USCRIPT_LAO:
233                engine = new LaoBreakEngine(m, status);
234                break;
235            case USCRIPT_MYANMAR:
236                engine = new BurmeseBreakEngine(m, status);
237                break;
238            case USCRIPT_KHMER:
239                engine = new KhmerBreakEngine(m, status);
240                break;
241
242#if !UCONFIG_NO_NORMALIZATION
243                // CJK not available w/o normalization
244            case USCRIPT_HANGUL:
245                engine = new CjkBreakEngine(m, kKorean, status);
246                break;
247
248            // use same BreakEngine and dictionary for both Chinese and Japanese
249            case USCRIPT_HIRAGANA:
250            case USCRIPT_KATAKANA:
251            case USCRIPT_HAN:
252                engine = new CjkBreakEngine(m, kChineseJapanese, status);
253                break;
254#if 0
255            // TODO: Have to get some characters with script=common handled
256            // by CjkBreakEngine (e.g. U+309B). Simply subjecting
257            // them to CjkBreakEngine does not work. The engine has to
258            // special-case them.
259            case USCRIPT_COMMON:
260            {
261                UBlockCode block = ublock_getCode(code);
262                if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA)
263                   engine = new CjkBreakEngine(dict, kChineseJapanese, status);
264                break;
265            }
266#endif
267#endif
268
269            default:
270                break;
271            }
272            if (engine == NULL) {
273                delete m;
274            }
275            else if (U_FAILURE(status)) {
276                delete engine;
277                engine = NULL;
278            }
279            return engine;
280        }
281    }
282    return NULL;
283}
284
285DictionaryMatcher *
286ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script, int32_t /* brkType */) {
287    UErrorCode status = U_ZERO_ERROR;
288    // open root from brkitr tree.
289    UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status);
290    b = ures_getByKeyWithFallback(b, "dictionaries", b, &status);
291    int32_t dictnlength = 0;
292    const UChar *dictfname =
293        ures_getStringByKeyWithFallback(b, uscript_getShortName(script), &dictnlength, &status);
294    if (U_FAILURE(status)) {
295        ures_close(b);
296        return NULL;
297    }
298    CharString dictnbuf;
299    CharString ext;
300    const UChar *extStart = u_memrchr(dictfname, 0x002e, dictnlength);  // last dot
301    if (extStart != NULL) {
302        int32_t len = (int32_t)(extStart - dictfname);
303        ext.appendInvariantChars(UnicodeString(FALSE, extStart + 1, dictnlength - len - 1), status);
304        dictnlength = len;
305    }
306    dictnbuf.appendInvariantChars(UnicodeString(FALSE, dictfname, dictnlength), status);
307    ures_close(b);
308
309    UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext.data(), dictnbuf.data(), &status);
310    if (U_SUCCESS(status)) {
311        // build trie
312        const uint8_t *data = (const uint8_t *)udata_getMemory(file);
313        const int32_t *indexes = (const int32_t *)data;
314        const int32_t offset = indexes[DictionaryData::IX_STRING_TRIE_OFFSET];
315        const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
316        DictionaryMatcher *m = NULL;
317        if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
318            const int32_t transform = indexes[DictionaryData::IX_TRANSFORM];
319            const char *characters = (const char *)(data + offset);
320            m = new BytesDictionaryMatcher(characters, transform, file);
321        }
322        else if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
323            const UChar *characters = (const UChar *)(data + offset);
324            m = new UCharsDictionaryMatcher(characters, file);
325        }
326        if (m == NULL) {
327            // no matcher exists to take ownership - either we are an invalid
328            // type or memory allocation failed
329            udata_close(file);
330        }
331        return m;
332    } else if (dictfname != NULL) {
333        // we don't have a dictionary matcher.
334        // returning NULL here will cause us to fail to find a dictionary break engine, as expected
335        status = U_ZERO_ERROR;
336        return NULL;
337    }
338    return NULL;
339}
340
341U_NAMESPACE_END
342
343#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
344