1/*
2 ************************************************************************************
3 * Copyright (C) 2006-2012, International Business Machines Corporation
4 * and others. All Rights Reserved.
5 ************************************************************************************
6 */
7
8#include "unicode/utypes.h"
9
10#if !UCONFIG_NO_BREAK_ITERATION
11
12#include "brkeng.h"
13#include "dictbe.h"
14#include "unicode/uchar.h"
15#include "unicode/uniset.h"
16#include "unicode/chariter.h"
17#include "unicode/ures.h"
18#include "unicode/udata.h"
19#include "unicode/putil.h"
20#include "unicode/ustring.h"
21#include "unicode/uscript.h"
22#include "unicode/ucharstrie.h"
23#include "unicode/bytestrie.h"
24#include "charstr.h"
25#include "dictionarydata.h"
26#include "uvector.h"
27#include "umutex.h"
28#include "uresimp.h"
29#include "ubrkimpl.h"
30
31U_NAMESPACE_BEGIN
32
33/*
34 ******************************************************************
35 */
36
37LanguageBreakEngine::LanguageBreakEngine() {
38}
39
40LanguageBreakEngine::~LanguageBreakEngine() {
41}
42
43/*
44 ******************************************************************
45 */
46
47LanguageBreakFactory::LanguageBreakFactory() {
48}
49
50LanguageBreakFactory::~LanguageBreakFactory() {
51}
52
53/*
54 ******************************************************************
55 */
56
57UnhandledEngine::UnhandledEngine(UErrorCode &/*status*/) {
58    for (int32_t i = 0; i < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0])); ++i) {
59        fHandled[i] = 0;
60    }
61}
62
63UnhandledEngine::~UnhandledEngine() {
64    for (int32_t i = 0; i < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0])); ++i) {
65        if (fHandled[i] != 0) {
66            delete fHandled[i];
67        }
68    }
69}
70
71UBool
72UnhandledEngine::handles(UChar32 c, int32_t breakType) const {
73    return (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))
74        && fHandled[breakType] != 0 && fHandled[breakType]->contains(c));
75}
76
77int32_t
78UnhandledEngine::findBreaks( UText *text,
79                                 int32_t startPos,
80                                 int32_t endPos,
81                                 UBool reverse,
82                                 int32_t breakType,
83                                 UStack &/*foundBreaks*/ ) const {
84    if (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))) {
85        UChar32 c = utext_current32(text);
86        if (reverse) {
87            while((int32_t)utext_getNativeIndex(text) > startPos && fHandled[breakType]->contains(c)) {
88                c = utext_previous32(text);
89            }
90        }
91        else {
92            while((int32_t)utext_getNativeIndex(text) < endPos && fHandled[breakType]->contains(c)) {
93                utext_next32(text);            // TODO:  recast loop to work with post-increment operations.
94                c = utext_current32(text);
95            }
96        }
97    }
98    return 0;
99}
100
101void
102UnhandledEngine::handleCharacter(UChar32 c, int32_t breakType) {
103    if (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))) {
104        if (fHandled[breakType] == 0) {
105            fHandled[breakType] = new UnicodeSet();
106            if (fHandled[breakType] == 0) {
107                return;
108            }
109        }
110        if (!fHandled[breakType]->contains(c)) {
111            UErrorCode status = U_ZERO_ERROR;
112            // Apply the entire script of the character.
113            int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT);
114            fHandled[breakType]->applyIntPropertyValue(UCHAR_SCRIPT, script, status);
115        }
116    }
117}
118
119/*
120 ******************************************************************
121 */
122
123ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/*status*/) {
124    fEngines = 0;
125}
126
127ICULanguageBreakFactory::~ICULanguageBreakFactory() {
128    if (fEngines != 0) {
129        delete fEngines;
130    }
131}
132
133U_NAMESPACE_END
134U_CDECL_BEGIN
135static void U_CALLCONV _deleteEngine(void *obj) {
136    delete (const icu::LanguageBreakEngine *) obj;
137}
138U_CDECL_END
139U_NAMESPACE_BEGIN
140
141const LanguageBreakEngine *
142ICULanguageBreakFactory::getEngineFor(UChar32 c, int32_t breakType) {
143    UBool       needsInit;
144    int32_t     i;
145    const LanguageBreakEngine *lbe = NULL;
146    UErrorCode  status = U_ZERO_ERROR;
147
148    // TODO: The global mutex should not be used.
149    // The global mutex should only be used for short periods.
150    // A ICULanguageBreakFactory specific mutex should be used.
151    umtx_lock(NULL);
152    needsInit = (UBool)(fEngines == NULL);
153    if (!needsInit) {
154        i = fEngines->size();
155        while (--i >= 0) {
156            lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
157            if (lbe != NULL && lbe->handles(c, breakType)) {
158                break;
159            }
160            lbe = NULL;
161        }
162    }
163    umtx_unlock(NULL);
164
165    if (lbe != NULL) {
166        return lbe;
167    }
168
169    if (needsInit) {
170        UStack  *engines = new UStack(_deleteEngine, NULL, status);
171        if (U_SUCCESS(status) && engines == NULL) {
172            status = U_MEMORY_ALLOCATION_ERROR;
173        }
174        else if (U_FAILURE(status)) {
175            delete engines;
176            engines = NULL;
177        }
178        else {
179            umtx_lock(NULL);
180            if (fEngines == NULL) {
181                fEngines = engines;
182                engines = NULL;
183            }
184            umtx_unlock(NULL);
185            delete engines;
186        }
187    }
188
189    if (fEngines == NULL) {
190        return NULL;
191    }
192
193    // We didn't find an engine the first time through, or there was no
194    // stack. Create an engine.
195    const LanguageBreakEngine *newlbe = loadEngineFor(c, breakType);
196
197    // Now get the lock, and see if someone else has created it in the
198    // meantime
199    umtx_lock(NULL);
200    i = fEngines->size();
201    while (--i >= 0) {
202        lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
203        if (lbe != NULL && lbe->handles(c, breakType)) {
204            break;
205        }
206        lbe = NULL;
207    }
208    if (lbe == NULL && newlbe != NULL) {
209        fEngines->push((void *)newlbe, status);
210        lbe = newlbe;
211        newlbe = NULL;
212    }
213    umtx_unlock(NULL);
214
215    delete newlbe;
216
217    return lbe;
218}
219
220const LanguageBreakEngine *
221ICULanguageBreakFactory::loadEngineFor(UChar32 c, int32_t breakType) {
222    UErrorCode status = U_ZERO_ERROR;
223    UScriptCode code = uscript_getScript(c, &status);
224    if (U_SUCCESS(status)) {
225        DictionaryMatcher *m = loadDictionaryMatcherFor(code, breakType);
226        if (m != NULL) {
227            const LanguageBreakEngine *engine = NULL;
228            switch(code) {
229            case USCRIPT_THAI:
230                engine = new ThaiBreakEngine(m, status);
231                break;
232            case USCRIPT_KHMER:
233                engine = new KhmerBreakEngine(m, status);
234                break;
235
236#if !UCONFIG_NO_NORMALIZATION
237                // CJK not available w/o normalization
238            case USCRIPT_HANGUL:
239                engine = new CjkBreakEngine(m, kKorean, status);
240                break;
241
242            // use same BreakEngine and dictionary for both Chinese and Japanese
243            case USCRIPT_HIRAGANA:
244            case USCRIPT_KATAKANA:
245            case USCRIPT_HAN:
246                engine = new CjkBreakEngine(m, kChineseJapanese, status);
247                break;
248#if 0
249            // TODO: Have to get some characters with script=common handled
250            // by CjkBreakEngine (e.g. U+309B). Simply subjecting
251            // them to CjkBreakEngine does not work. The engine has to
252            // special-case them.
253            case USCRIPT_COMMON:
254            {
255                UBlockCode block = ublock_getCode(code);
256                if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA)
257                   engine = new CjkBreakEngine(dict, kChineseJapanese, status);
258                break;
259            }
260#endif
261#endif
262
263            default:
264                break;
265            }
266            if (engine == NULL) {
267                delete m;
268            }
269            else if (U_FAILURE(status)) {
270                delete engine;
271                engine = NULL;
272            }
273            return engine;
274        }
275    }
276    return NULL;
277}
278
279DictionaryMatcher *
280ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script, int32_t /* brkType */) {
281    UErrorCode status = U_ZERO_ERROR;
282    // open root from brkitr tree.
283    UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status);
284    b = ures_getByKeyWithFallback(b, "dictionaries", b, &status);
285    int32_t dictnlength = 0;
286    const UChar *dictfname =
287        ures_getStringByKeyWithFallback(b, uscript_getShortName(script), &dictnlength, &status);
288    if (U_FAILURE(status)) {
289        ures_close(b);
290        return NULL;
291    }
292    CharString dictnbuf;
293    CharString ext;
294    const UChar *extStart = u_memrchr(dictfname, 0x002e, dictnlength);  // last dot
295    if (extStart != NULL) {
296        int32_t len = (int32_t)(extStart - dictfname);
297        ext.appendInvariantChars(UnicodeString(FALSE, extStart + 1, dictnlength - len - 1), status);
298        dictnlength = len;
299    }
300    dictnbuf.appendInvariantChars(UnicodeString(FALSE, dictfname, dictnlength), status);
301    ures_close(b);
302
303    UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext.data(), dictnbuf.data(), &status);
304    if (U_SUCCESS(status)) {
305        // build trie
306        const uint8_t *data = (const uint8_t *)udata_getMemory(file);
307        const int32_t *indexes = (const int32_t *)data;
308        const int32_t offset = indexes[DictionaryData::IX_STRING_TRIE_OFFSET];
309        const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
310        DictionaryMatcher *m = NULL;
311        if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
312            const int32_t transform = indexes[DictionaryData::IX_TRANSFORM];
313            const char *characters = (const char *)(data + offset);
314            m = new BytesDictionaryMatcher(characters, transform, file);
315        }
316        else if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
317            const UChar *characters = (const UChar *)(data + offset);
318            m = new UCharsDictionaryMatcher(characters, file);
319        }
320        if (m == NULL) {
321            // no matcher exists to take ownership - either we are an invalid
322            // type or memory allocation failed
323            udata_close(file);
324        }
325        return m;
326    } else if (dictfname != NULL) {
327        // we don't have a dictionary matcher.
328        // returning NULL here will cause us to fail to find a dictionary break engine, as expected
329        status = U_ZERO_ERROR;
330        return NULL;
331    }
332    return NULL;
333}
334
335U_NAMESPACE_END
336
337#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
338