1/*
2 ************************************************************************************
3 * Copyright (C) 2006-2013, International Business Machines Corporation
4 * and others. All Rights Reserved.
5 ************************************************************************************
6 */
7
8#include "unicode/utypes.h"
9
10#if !UCONFIG_NO_BREAK_ITERATION
11
12#include "brkeng.h"
13#include "dictbe.h"
14#include "unicode/uchar.h"
15#include "unicode/uniset.h"
16#include "unicode/chariter.h"
17#include "unicode/ures.h"
18#include "unicode/udata.h"
19#include "unicode/putil.h"
20#include "unicode/ustring.h"
21#include "unicode/uscript.h"
22#include "unicode/ucharstrie.h"
23#include "unicode/bytestrie.h"
24#include "charstr.h"
25#include "dictionarydata.h"
26#include "uvector.h"
27#include "umutex.h"
28#include "uresimp.h"
29#include "ubrkimpl.h"
30
31U_NAMESPACE_BEGIN
32
33/*
34 ******************************************************************
35 */
36
37LanguageBreakEngine::LanguageBreakEngine() {
38}
39
40LanguageBreakEngine::~LanguageBreakEngine() {
41}
42
43/*
44 ******************************************************************
45 */
46
47LanguageBreakFactory::LanguageBreakFactory() {
48}
49
50LanguageBreakFactory::~LanguageBreakFactory() {
51}
52
53/*
54 ******************************************************************
55 */
56
57UnhandledEngine::UnhandledEngine(UErrorCode &/*status*/) {
58    for (int32_t i = 0; i < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0])); ++i) {
59        fHandled[i] = 0;
60    }
61}
62
63UnhandledEngine::~UnhandledEngine() {
64    for (int32_t i = 0; i < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0])); ++i) {
65        if (fHandled[i] != 0) {
66            delete fHandled[i];
67        }
68    }
69}
70
71UBool
72UnhandledEngine::handles(UChar32 c, int32_t breakType) const {
73    return (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))
74        && fHandled[breakType] != 0 && fHandled[breakType]->contains(c));
75}
76
77int32_t
78UnhandledEngine::findBreaks( UText *text,
79                                 int32_t startPos,
80                                 int32_t endPos,
81                                 UBool reverse,
82                                 int32_t breakType,
83                                 UStack &/*foundBreaks*/ ) const {
84    if (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))) {
85        UChar32 c = utext_current32(text);
86        if (reverse) {
87            while((int32_t)utext_getNativeIndex(text) > startPos && fHandled[breakType]->contains(c)) {
88                c = utext_previous32(text);
89            }
90        }
91        else {
92            while((int32_t)utext_getNativeIndex(text) < endPos && fHandled[breakType]->contains(c)) {
93                utext_next32(text);            // TODO:  recast loop to work with post-increment operations.
94                c = utext_current32(text);
95            }
96        }
97    }
98    return 0;
99}
100
101void
102UnhandledEngine::handleCharacter(UChar32 c, int32_t breakType) {
103    if (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))) {
104        if (fHandled[breakType] == 0) {
105            fHandled[breakType] = new UnicodeSet();
106            if (fHandled[breakType] == 0) {
107                return;
108            }
109        }
110        if (!fHandled[breakType]->contains(c)) {
111            UErrorCode status = U_ZERO_ERROR;
112            // Apply the entire script of the character.
113            int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT);
114            fHandled[breakType]->applyIntPropertyValue(UCHAR_SCRIPT, script, status);
115        }
116    }
117}
118
119/*
120 ******************************************************************
121 */
122
123ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/*status*/) {
124    fEngines = 0;
125}
126
127ICULanguageBreakFactory::~ICULanguageBreakFactory() {
128    if (fEngines != 0) {
129        delete fEngines;
130    }
131}
132
133U_NAMESPACE_END
134U_CDECL_BEGIN
135static void U_CALLCONV _deleteEngine(void *obj) {
136    delete (const icu::LanguageBreakEngine *) obj;
137}
138U_CDECL_END
139U_NAMESPACE_BEGIN
140
141const LanguageBreakEngine *
142ICULanguageBreakFactory::getEngineFor(UChar32 c, int32_t breakType) {
143    UBool       needsInit;
144    int32_t     i;
145    const LanguageBreakEngine *lbe = NULL;
146    UErrorCode  status = U_ZERO_ERROR;
147
148    // TODO: The global mutex should not be used.
149    // The global mutex should only be used for short periods.
150    // A ICULanguageBreakFactory specific mutex should be used.
151    umtx_lock(NULL);
152    needsInit = (UBool)(fEngines == NULL);
153    if (!needsInit) {
154        i = fEngines->size();
155        while (--i >= 0) {
156            lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
157            if (lbe != NULL && lbe->handles(c, breakType)) {
158                break;
159            }
160            lbe = NULL;
161        }
162    }
163    umtx_unlock(NULL);
164
165    if (lbe != NULL) {
166        return lbe;
167    }
168
169    if (needsInit) {
170        UStack  *engines = new UStack(_deleteEngine, NULL, status);
171        if (U_SUCCESS(status) && engines == NULL) {
172            status = U_MEMORY_ALLOCATION_ERROR;
173        }
174        else if (U_FAILURE(status)) {
175            delete engines;
176            engines = NULL;
177        }
178        else {
179            umtx_lock(NULL);
180            if (fEngines == NULL) {
181                fEngines = engines;
182                engines = NULL;
183            }
184            umtx_unlock(NULL);
185            delete engines;
186        }
187    }
188
189    if (fEngines == NULL) {
190        return NULL;
191    }
192
193    // We didn't find an engine the first time through, or there was no
194    // stack. Create an engine.
195    const LanguageBreakEngine *newlbe = loadEngineFor(c, breakType);
196
197    // Now get the lock, and see if someone else has created it in the
198    // meantime
199    umtx_lock(NULL);
200    i = fEngines->size();
201    while (--i >= 0) {
202        lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
203        if (lbe != NULL && lbe->handles(c, breakType)) {
204            break;
205        }
206        lbe = NULL;
207    }
208    if (lbe == NULL && newlbe != NULL) {
209        fEngines->push((void *)newlbe, status);
210        lbe = newlbe;
211        newlbe = NULL;
212    }
213    umtx_unlock(NULL);
214
215    delete newlbe;
216
217    return lbe;
218}
219
220const LanguageBreakEngine *
221ICULanguageBreakFactory::loadEngineFor(UChar32 c, int32_t breakType) {
222    UErrorCode status = U_ZERO_ERROR;
223    UScriptCode code = uscript_getScript(c, &status);
224    if (U_SUCCESS(status)) {
225        DictionaryMatcher *m = loadDictionaryMatcherFor(code, breakType);
226        if (m != NULL) {
227            const LanguageBreakEngine *engine = NULL;
228            switch(code) {
229            case USCRIPT_THAI:
230                engine = new ThaiBreakEngine(m, status);
231                break;
232            case USCRIPT_LAO:
233                engine = new LaoBreakEngine(m, status);
234                break;
235            case USCRIPT_KHMER:
236                engine = new KhmerBreakEngine(m, status);
237                break;
238
239#if !UCONFIG_NO_NORMALIZATION
240                // CJK not available w/o normalization
241            case USCRIPT_HANGUL:
242                engine = new CjkBreakEngine(m, kKorean, status);
243                break;
244
245            // use same BreakEngine and dictionary for both Chinese and Japanese
246            case USCRIPT_HIRAGANA:
247            case USCRIPT_KATAKANA:
248            case USCRIPT_HAN:
249                engine = new CjkBreakEngine(m, kChineseJapanese, status);
250                break;
251#if 0
252            // TODO: Have to get some characters with script=common handled
253            // by CjkBreakEngine (e.g. U+309B). Simply subjecting
254            // them to CjkBreakEngine does not work. The engine has to
255            // special-case them.
256            case USCRIPT_COMMON:
257            {
258                UBlockCode block = ublock_getCode(code);
259                if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA)
260                   engine = new CjkBreakEngine(dict, kChineseJapanese, status);
261                break;
262            }
263#endif
264#endif
265
266            default:
267                break;
268            }
269            if (engine == NULL) {
270                delete m;
271            }
272            else if (U_FAILURE(status)) {
273                delete engine;
274                engine = NULL;
275            }
276            return engine;
277        }
278    }
279    return NULL;
280}
281
282DictionaryMatcher *
283ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script, int32_t /* brkType */) {
284    UErrorCode status = U_ZERO_ERROR;
285    // open root from brkitr tree.
286    UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status);
287    b = ures_getByKeyWithFallback(b, "dictionaries", b, &status);
288    int32_t dictnlength = 0;
289    const UChar *dictfname =
290        ures_getStringByKeyWithFallback(b, uscript_getShortName(script), &dictnlength, &status);
291    if (U_FAILURE(status)) {
292        ures_close(b);
293        return NULL;
294    }
295    CharString dictnbuf;
296    CharString ext;
297    const UChar *extStart = u_memrchr(dictfname, 0x002e, dictnlength);  // last dot
298    if (extStart != NULL) {
299        int32_t len = (int32_t)(extStart - dictfname);
300        ext.appendInvariantChars(UnicodeString(FALSE, extStart + 1, dictnlength - len - 1), status);
301        dictnlength = len;
302    }
303    dictnbuf.appendInvariantChars(UnicodeString(FALSE, dictfname, dictnlength), status);
304    ures_close(b);
305
306    UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext.data(), dictnbuf.data(), &status);
307    if (U_SUCCESS(status)) {
308        // build trie
309        const uint8_t *data = (const uint8_t *)udata_getMemory(file);
310        const int32_t *indexes = (const int32_t *)data;
311        const int32_t offset = indexes[DictionaryData::IX_STRING_TRIE_OFFSET];
312        const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
313        DictionaryMatcher *m = NULL;
314        if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
315            const int32_t transform = indexes[DictionaryData::IX_TRANSFORM];
316            const char *characters = (const char *)(data + offset);
317            m = new BytesDictionaryMatcher(characters, transform, file);
318        }
319        else if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
320            const UChar *characters = (const UChar *)(data + offset);
321            m = new UCharsDictionaryMatcher(characters, file);
322        }
323        if (m == NULL) {
324            // no matcher exists to take ownership - either we are an invalid
325            // type or memory allocation failed
326            udata_close(file);
327        }
328        return m;
329    } else if (dictfname != NULL) {
330        // we don't have a dictionary matcher.
331        // returning NULL here will cause us to fail to find a dictionary break engine, as expected
332        status = U_ZERO_ERROR;
333        return NULL;
334    }
335    return NULL;
336}
337
338U_NAMESPACE_END
339
340#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
341