1/**
2 *******************************************************************************
3 * Copyright (C) 2006-2015, International Business Machines Corporation
4 * and others. All Rights Reserved.
5 *******************************************************************************
6 */
7
8#include "unicode/utypes.h"
9
10#if !UCONFIG_NO_BREAK_ITERATION
11
12#include "brkeng.h"
13#include "dictbe.h"
14#include "unicode/uniset.h"
15#include "unicode/chariter.h"
16#include "unicode/ubrk.h"
17#include "uvectr32.h"
18#include "uvector.h"
19#include "uassert.h"
20#include "unicode/normlzr.h"
21#include "cmemory.h"
22#include "dictionarydata.h"
23
24U_NAMESPACE_BEGIN
25
26/*
27 ******************************************************************
28 */
29
30DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes) {
31    fTypes = breakTypes;
32}
33
34DictionaryBreakEngine::~DictionaryBreakEngine() {
35}
36
37UBool
38DictionaryBreakEngine::handles(UChar32 c, int32_t breakType) const {
39    return (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)
40            && fSet.contains(c));
41}
42
43int32_t
44DictionaryBreakEngine::findBreaks( UText *text,
45                                 int32_t startPos,
46                                 int32_t endPos,
47                                 UBool reverse,
48                                 int32_t breakType,
49                                 UStack &foundBreaks ) const {
50    int32_t result = 0;
51
52    // Find the span of characters included in the set.
53    //   The span to break begins at the current position in the text, and
54    //   extends towards the start or end of the text, depending on 'reverse'.
55
56    int32_t start = (int32_t)utext_getNativeIndex(text);
57    int32_t current;
58    int32_t rangeStart;
59    int32_t rangeEnd;
60    UChar32 c = utext_current32(text);
61    if (reverse) {
62        UBool   isDict = fSet.contains(c);
63        while((current = (int32_t)utext_getNativeIndex(text)) > startPos && isDict) {
64            c = utext_previous32(text);
65            isDict = fSet.contains(c);
66        }
67        if (current < startPos) {
68            rangeStart = startPos;
69        } else {
70            rangeStart = current;
71            if (!isDict) {
72                utext_next32(text);
73                rangeStart = utext_getNativeIndex(text);
74            }
75        }
76        // rangeEnd = start + 1;
77        utext_setNativeIndex(text, start);
78        utext_next32(text);
79        rangeEnd = utext_getNativeIndex(text);
80    }
81    else {
82        while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) {
83            utext_next32(text);         // TODO:  recast loop for postincrement
84            c = utext_current32(text);
85        }
86        rangeStart = start;
87        rangeEnd = current;
88    }
89    if (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)) {
90        result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
91        utext_setNativeIndex(text, current);
92    }
93
94    return result;
95}
96
97void
98DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {
99    fSet = set;
100    // Compact for caching
101    fSet.compact();
102}
103
104/*
105 ******************************************************************
106 * PossibleWord
107 */
108
109// Helper class for improving readability of the Thai/Lao/Khmer word break
110// algorithm. The implementation is completely inline.
111
112// List size, limited by the maximum number of words in the dictionary
113// that form a nested sequence.
114static const int32_t POSSIBLE_WORD_LIST_MAX = 20;
115
116class PossibleWord {
117private:
118    // list of word candidate lengths, in increasing length order
119    // TODO: bytes would be sufficient for word lengths.
120    int32_t   count;      // Count of candidates
121    int32_t   prefix;     // The longest match with a dictionary word
122    int32_t   offset;     // Offset in the text of these candidates
123    int32_t   mark;       // The preferred candidate's offset
124    int32_t   current;    // The candidate we're currently looking at
125    int32_t   cuLengths[POSSIBLE_WORD_LIST_MAX];   // Word Lengths, in code units.
126    int32_t   cpLengths[POSSIBLE_WORD_LIST_MAX];   // Word Lengths, in code points.
127
128public:
129    PossibleWord() : count(0), prefix(0), offset(-1), mark(0), current(0) {};
130    ~PossibleWord() {};
131
132    // Fill the list of candidates if needed, select the longest, and return the number found
133    int32_t   candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd );
134
135    // Select the currently marked candidate, point after it in the text, and invalidate self
136    int32_t   acceptMarked( UText *text );
137
138    // Back up from the current candidate to the next shorter one; return TRUE if that exists
139    // and point the text after it
140    UBool     backUp( UText *text );
141
142    // Return the longest prefix this candidate location shares with a dictionary word
143    // Return value is in code points.
144    int32_t   longestPrefix() { return prefix; };
145
146    // Mark the current candidate as the one we like
147    void      markCurrent() { mark = current; };
148
149    // Get length in code points of the marked word.
150    int32_t   markedCPLength() { return cpLengths[mark]; };
151};
152
153
154int32_t PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) {
155    // TODO: If getIndex is too slow, use offset < 0 and add discardAll()
156    int32_t start = (int32_t)utext_getNativeIndex(text);
157    if (start != offset) {
158        offset = start;
159        count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, NULL, &prefix);
160        // Dictionary leaves text after longest prefix, not longest word. Back up.
161        if (count <= 0) {
162            utext_setNativeIndex(text, start);
163        }
164    }
165    if (count > 0) {
166        utext_setNativeIndex(text, start+cuLengths[count-1]);
167    }
168    current = count-1;
169    mark = current;
170    return count;
171}
172
173int32_t
174PossibleWord::acceptMarked( UText *text ) {
175    utext_setNativeIndex(text, offset + cuLengths[mark]);
176    return cuLengths[mark];
177}
178
179
180UBool
181PossibleWord::backUp( UText *text ) {
182    if (current > 0) {
183        utext_setNativeIndex(text, offset + cuLengths[--current]);
184        return TRUE;
185    }
186    return FALSE;
187}
188
189/*
190 ******************************************************************
191 * ThaiBreakEngine
192 */
193
194// How many words in a row are "good enough"?
195static const int32_t THAI_LOOKAHEAD = 3;
196
197// Will not combine a non-word with a preceding dictionary word longer than this
198static const int32_t THAI_ROOT_COMBINE_THRESHOLD = 3;
199
200// Will not combine a non-word that shares at least this much prefix with a
201// dictionary word, with a preceding word
202static const int32_t THAI_PREFIX_COMBINE_THRESHOLD = 3;
203
204// Ellision character
205static const int32_t THAI_PAIYANNOI = 0x0E2F;
206
207// Repeat character
208static const int32_t THAI_MAIYAMOK = 0x0E46;
209
210// Minimum word size
211static const int32_t THAI_MIN_WORD = 2;
212
213// Minimum number of characters for two words
214static const int32_t THAI_MIN_WORD_SPAN = THAI_MIN_WORD * 2;
215
216ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
217    : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
218      fDictionary(adoptDictionary)
219{
220    fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status);
221    if (U_SUCCESS(status)) {
222        setCharacters(fThaiWordSet);
223    }
224    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);
225    fMarkSet.add(0x0020);
226    fEndWordSet = fThaiWordSet;
227    fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
228    fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
229    fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
230    fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
231    fSuffixSet.add(THAI_PAIYANNOI);
232    fSuffixSet.add(THAI_MAIYAMOK);
233
234    // Compact for caching.
235    fMarkSet.compact();
236    fEndWordSet.compact();
237    fBeginWordSet.compact();
238    fSuffixSet.compact();
239}
240
241ThaiBreakEngine::~ThaiBreakEngine() {
242    delete fDictionary;
243}
244
245int32_t
246ThaiBreakEngine::divideUpDictionaryRange( UText *text,
247                                                int32_t rangeStart,
248                                                int32_t rangeEnd,
249                                                UStack &foundBreaks ) const {
250    utext_setNativeIndex(text, rangeStart);
251    utext_moveIndex32(text, THAI_MIN_WORD_SPAN);
252    if (utext_getNativeIndex(text) >= rangeEnd) {
253        return 0;       // Not enough characters for two words
254    }
255    utext_setNativeIndex(text, rangeStart);
256
257
258    uint32_t wordsFound = 0;
259    int32_t cpWordLength = 0;    // Word Length in Code Points.
260    int32_t cuWordLength = 0;    // Word length in code units (UText native indexing)
261    int32_t current;
262    UErrorCode status = U_ZERO_ERROR;
263    PossibleWord words[THAI_LOOKAHEAD];
264
265    utext_setNativeIndex(text, rangeStart);
266
267    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
268        cpWordLength = 0;
269        cuWordLength = 0;
270
271        // Look for candidate words at the current position
272        int32_t candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
273
274        // If we found exactly one, use that
275        if (candidates == 1) {
276            cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
277            cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();
278            wordsFound += 1;
279        }
280        // If there was more than one, see which one can take us forward the most words
281        else if (candidates > 1) {
282            // If we're already at the end of the range, we're done
283            if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
284                goto foundBest;
285            }
286            do {
287                int32_t wordsMatched = 1;
288                if (words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
289                    if (wordsMatched < 2) {
290                        // Followed by another dictionary word; mark first word as a good candidate
291                        words[wordsFound%THAI_LOOKAHEAD].markCurrent();
292                        wordsMatched = 2;
293                    }
294
295                    // If we're already at the end of the range, we're done
296                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
297                        goto foundBest;
298                    }
299
300                    // See if any of the possible second words is followed by a third word
301                    do {
302                        // If we find a third word, stop right away
303                        if (words[(wordsFound + 2) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
304                            words[wordsFound % THAI_LOOKAHEAD].markCurrent();
305                            goto foundBest;
306                        }
307                    }
308                    while (words[(wordsFound + 1) % THAI_LOOKAHEAD].backUp(text));
309                }
310            }
311            while (words[wordsFound % THAI_LOOKAHEAD].backUp(text));
312foundBest:
313            // Set UText position to after the accepted word.
314            cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
315            cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();
316            wordsFound += 1;
317        }
318
319        // We come here after having either found a word or not. We look ahead to the
320        // next word. If it's not a dictionary word, we will combine it with the word we
321        // just found (if there is one), but only if the preceding word does not exceed
322        // the threshold.
323        // The text iterator should now be positioned at the end of the word we found.
324
325        UChar32 uc = 0;
326        if ((int32_t)utext_getNativeIndex(text) < rangeEnd &&  cpWordLength < THAI_ROOT_COMBINE_THRESHOLD) {
327            // if it is a dictionary word, do nothing. If it isn't, then if there is
328            // no preceding word, or the non-word shares less than the minimum threshold
329            // of characters with a dictionary word, then scan to resynchronize
330            if (words[wordsFound % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
331                  && (cuWordLength == 0
332                      || words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {
333                // Look for a plausible word boundary
334                int32_t remaining = rangeEnd - (current+cuWordLength);
335                UChar32 pc;
336                int32_t chars = 0;
337                for (;;) {
338                    int32_t pcIndex = utext_getNativeIndex(text);
339                    pc = utext_next32(text);
340                    int32_t pcSize = utext_getNativeIndex(text) - pcIndex;
341                    chars += pcSize;
342                    remaining -= pcSize;
343                    if (remaining <= 0) {
344                        break;
345                    }
346                    uc = utext_current32(text);
347                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
348                        // Maybe. See if it's in the dictionary.
349                        // NOTE: In the original Apple code, checked that the next
350                        // two characters after uc were not 0x0E4C THANTHAKHAT before
351                        // checking the dictionary. That is just a performance filter,
352                        // but it's not clear it's faster than checking the trie.
353                        int32_t candidates = words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
354                        utext_setNativeIndex(text, current + cuWordLength + chars);
355                        if (candidates > 0) {
356                            break;
357                        }
358                    }
359                }
360
361                // Bump the word count if there wasn't already one
362                if (cuWordLength <= 0) {
363                    wordsFound += 1;
364                }
365
366                // Update the length with the passed-over characters
367                cuWordLength += chars;
368            }
369            else {
370                // Back up to where we were for next iteration
371                utext_setNativeIndex(text, current+cuWordLength);
372            }
373        }
374
375        // Never stop before a combining mark.
376        int32_t currPos;
377        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
378            utext_next32(text);
379            cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
380        }
381
382        // Look ahead for possible suffixes if a dictionary word does not follow.
383        // We do this in code rather than using a rule so that the heuristic
384        // resynch continues to function. For example, one of the suffix characters
385        // could be a typo in the middle of a word.
386        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cuWordLength > 0) {
387            if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
388                && fSuffixSet.contains(uc = utext_current32(text))) {
389                if (uc == THAI_PAIYANNOI) {
390                    if (!fSuffixSet.contains(utext_previous32(text))) {
391                        // Skip over previous end and PAIYANNOI
392                        utext_next32(text);
393                        int32_t paiyannoiIndex = utext_getNativeIndex(text);
394                        utext_next32(text);
395                        cuWordLength += utext_getNativeIndex(text) - paiyannoiIndex;    // Add PAIYANNOI to word
396                        uc = utext_current32(text);     // Fetch next character
397                    }
398                    else {
399                        // Restore prior position
400                        utext_next32(text);
401                    }
402                }
403                if (uc == THAI_MAIYAMOK) {
404                    if (utext_previous32(text) != THAI_MAIYAMOK) {
405                        // Skip over previous end and MAIYAMOK
406                        utext_next32(text);
407                        int32_t maiyamokIndex = utext_getNativeIndex(text);
408                        utext_next32(text);
409                        cuWordLength += utext_getNativeIndex(text) - maiyamokIndex;    // Add MAIYAMOK to word
410                    }
411                    else {
412                        // Restore prior position
413                        utext_next32(text);
414                    }
415                }
416            }
417            else {
418                utext_setNativeIndex(text, current+cuWordLength);
419            }
420        }
421
422        // Did we find a word on this iteration? If so, push it on the break stack
423        if (cuWordLength > 0) {
424            foundBreaks.push((current+cuWordLength), status);
425        }
426    }
427
428    // Don't return a break for the end of the dictionary range if there is one there.
429    if (foundBreaks.peeki() >= rangeEnd) {
430        (void) foundBreaks.popi();
431        wordsFound -= 1;
432    }
433
434    return wordsFound;
435}
436
437/*
438 ******************************************************************
439 * LaoBreakEngine
440 */
441
442// How many words in a row are "good enough"?
443static const int32_t LAO_LOOKAHEAD = 3;
444
445// Will not combine a non-word with a preceding dictionary word longer than this
446static const int32_t LAO_ROOT_COMBINE_THRESHOLD = 3;
447
448// Will not combine a non-word that shares at least this much prefix with a
449// dictionary word, with a preceding word
450static const int32_t LAO_PREFIX_COMBINE_THRESHOLD = 3;
451
452// Minimum word size
453static const int32_t LAO_MIN_WORD = 2;
454
455// Minimum number of characters for two words
456static const int32_t LAO_MIN_WORD_SPAN = LAO_MIN_WORD * 2;
457
458LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
459    : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
460      fDictionary(adoptDictionary)
461{
462    fLaoWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]"), status);
463    if (U_SUCCESS(status)) {
464        setCharacters(fLaoWordSet);
465    }
466    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status);
467    fMarkSet.add(0x0020);
468    fEndWordSet = fLaoWordSet;
469    fEndWordSet.remove(0x0EC0, 0x0EC4);     // prefix vowels
470    fBeginWordSet.add(0x0E81, 0x0EAE);      // basic consonants (including holes for corresponding Thai characters)
471    fBeginWordSet.add(0x0EDC, 0x0EDD);      // digraph consonants (no Thai equivalent)
472    fBeginWordSet.add(0x0EC0, 0x0EC4);      // prefix vowels
473
474    // Compact for caching.
475    fMarkSet.compact();
476    fEndWordSet.compact();
477    fBeginWordSet.compact();
478}
479
480LaoBreakEngine::~LaoBreakEngine() {
481    delete fDictionary;
482}
483
484int32_t
485LaoBreakEngine::divideUpDictionaryRange( UText *text,
486                                                int32_t rangeStart,
487                                                int32_t rangeEnd,
488                                                UStack &foundBreaks ) const {
489    if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) {
490        return 0;       // Not enough characters for two words
491    }
492
493    uint32_t wordsFound = 0;
494    int32_t cpWordLength = 0;
495    int32_t cuWordLength = 0;
496    int32_t current;
497    UErrorCode status = U_ZERO_ERROR;
498    PossibleWord words[LAO_LOOKAHEAD];
499
500    utext_setNativeIndex(text, rangeStart);
501
502    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
503        cuWordLength = 0;
504        cpWordLength = 0;
505
506        // Look for candidate words at the current position
507        int32_t candidates = words[wordsFound%LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
508
509        // If we found exactly one, use that
510        if (candidates == 1) {
511            cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
512            cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();
513            wordsFound += 1;
514        }
515        // If there was more than one, see which one can take us forward the most words
516        else if (candidates > 1) {
517            // If we're already at the end of the range, we're done
518            if (utext_getNativeIndex(text) >= rangeEnd) {
519                goto foundBest;
520            }
521            do {
522                int32_t wordsMatched = 1;
523                if (words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
524                    if (wordsMatched < 2) {
525                        // Followed by another dictionary word; mark first word as a good candidate
526                        words[wordsFound%LAO_LOOKAHEAD].markCurrent();
527                        wordsMatched = 2;
528                    }
529
530                    // If we're already at the end of the range, we're done
531                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
532                        goto foundBest;
533                    }
534
535                    // See if any of the possible second words is followed by a third word
536                    do {
537                        // If we find a third word, stop right away
538                        if (words[(wordsFound + 2) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
539                            words[wordsFound % LAO_LOOKAHEAD].markCurrent();
540                            goto foundBest;
541                        }
542                    }
543                    while (words[(wordsFound + 1) % LAO_LOOKAHEAD].backUp(text));
544                }
545            }
546            while (words[wordsFound % LAO_LOOKAHEAD].backUp(text));
547foundBest:
548            cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
549            cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();
550            wordsFound += 1;
551        }
552
553        // We come here after having either found a word or not. We look ahead to the
554        // next word. If it's not a dictionary word, we will combine it withe the word we
555        // just found (if there is one), but only if the preceding word does not exceed
556        // the threshold.
557        // The text iterator should now be positioned at the end of the word we found.
558        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < LAO_ROOT_COMBINE_THRESHOLD) {
559            // if it is a dictionary word, do nothing. If it isn't, then if there is
560            // no preceding word, or the non-word shares less than the minimum threshold
561            // of characters with a dictionary word, then scan to resynchronize
562            if (words[wordsFound % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
563                  && (cuWordLength == 0
564                      || words[wordsFound%LAO_LOOKAHEAD].longestPrefix() < LAO_PREFIX_COMBINE_THRESHOLD)) {
565                // Look for a plausible word boundary
566                int32_t remaining = rangeEnd - (current + cuWordLength);
567                UChar32 pc;
568                UChar32 uc;
569                int32_t chars = 0;
570                for (;;) {
571                    int32_t pcIndex = utext_getNativeIndex(text);
572                    pc = utext_next32(text);
573                    int32_t pcSize = utext_getNativeIndex(text) - pcIndex;
574                    chars += pcSize;
575                    remaining -= pcSize;
576                    if (remaining <= 0) {
577                        break;
578                    }
579                    uc = utext_current32(text);
580                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
581                        // Maybe. See if it's in the dictionary.
582                        // TODO: this looks iffy; compare with old code.
583                        int32_t candidates = words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
584                        utext_setNativeIndex(text, current + cuWordLength + chars);
585                        if (candidates > 0) {
586                            break;
587                        }
588                    }
589                }
590
591                // Bump the word count if there wasn't already one
592                if (cuWordLength <= 0) {
593                    wordsFound += 1;
594                }
595
596                // Update the length with the passed-over characters
597                cuWordLength += chars;
598            }
599            else {
600                // Back up to where we were for next iteration
601                utext_setNativeIndex(text, current + cuWordLength);
602            }
603        }
604
605        // Never stop before a combining mark.
606        int32_t currPos;
607        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
608            utext_next32(text);
609            cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
610        }
611
612        // Look ahead for possible suffixes if a dictionary word does not follow.
613        // We do this in code rather than using a rule so that the heuristic
614        // resynch continues to function. For example, one of the suffix characters
615        // could be a typo in the middle of a word.
616        // NOT CURRENTLY APPLICABLE TO LAO
617
618        // Did we find a word on this iteration? If so, push it on the break stack
619        if (cuWordLength > 0) {
620            foundBreaks.push((current+cuWordLength), status);
621        }
622    }
623
624    // Don't return a break for the end of the dictionary range if there is one there.
625    if (foundBreaks.peeki() >= rangeEnd) {
626        (void) foundBreaks.popi();
627        wordsFound -= 1;
628    }
629
630    return wordsFound;
631}
632
633/*
634 ******************************************************************
635 * BurmeseBreakEngine
636 */
637
638// How many words in a row are "good enough"?
639static const int32_t BURMESE_LOOKAHEAD = 3;
640
641// Will not combine a non-word with a preceding dictionary word longer than this
642static const int32_t BURMESE_ROOT_COMBINE_THRESHOLD = 3;
643
644// Will not combine a non-word that shares at least this much prefix with a
645// dictionary word, with a preceding word
646static const int32_t BURMESE_PREFIX_COMBINE_THRESHOLD = 3;
647
648// Minimum word size
649static const int32_t BURMESE_MIN_WORD = 2;
650
651// Minimum number of characters for two words
652static const int32_t BURMESE_MIN_WORD_SPAN = BURMESE_MIN_WORD * 2;
653
654BurmeseBreakEngine::BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
655    : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
656      fDictionary(adoptDictionary)
657{
658    fBurmeseWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]]"), status);
659    if (U_SUCCESS(status)) {
660        setCharacters(fBurmeseWordSet);
661    }
662    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"), status);
663    fMarkSet.add(0x0020);
664    fEndWordSet = fBurmeseWordSet;
665    fBeginWordSet.add(0x1000, 0x102A);      // basic consonants and independent vowels
666
667    // Compact for caching.
668    fMarkSet.compact();
669    fEndWordSet.compact();
670    fBeginWordSet.compact();
671}
672
673BurmeseBreakEngine::~BurmeseBreakEngine() {
674    delete fDictionary;
675}
676
677int32_t
678BurmeseBreakEngine::divideUpDictionaryRange( UText *text,
679                                                int32_t rangeStart,
680                                                int32_t rangeEnd,
681                                                UStack &foundBreaks ) const {
682    if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD_SPAN) {
683        return 0;       // Not enough characters for two words
684    }
685
686    uint32_t wordsFound = 0;
687    int32_t cpWordLength = 0;
688    int32_t cuWordLength = 0;
689    int32_t current;
690    UErrorCode status = U_ZERO_ERROR;
691    PossibleWord words[BURMESE_LOOKAHEAD];
692
693    utext_setNativeIndex(text, rangeStart);
694
695    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
696        cuWordLength = 0;
697        cpWordLength = 0;
698
699        // Look for candidate words at the current position
700        int32_t candidates = words[wordsFound%BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
701
702        // If we found exactly one, use that
703        if (candidates == 1) {
704            cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
705            cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
706            wordsFound += 1;
707        }
708        // If there was more than one, see which one can take us forward the most words
709        else if (candidates > 1) {
710            // If we're already at the end of the range, we're done
711            if (utext_getNativeIndex(text) >= rangeEnd) {
712                goto foundBest;
713            }
714            do {
715                int32_t wordsMatched = 1;
716                if (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
717                    if (wordsMatched < 2) {
718                        // Followed by another dictionary word; mark first word as a good candidate
719                        words[wordsFound%BURMESE_LOOKAHEAD].markCurrent();
720                        wordsMatched = 2;
721                    }
722
723                    // If we're already at the end of the range, we're done
724                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
725                        goto foundBest;
726                    }
727
728                    // See if any of the possible second words is followed by a third word
729                    do {
730                        // If we find a third word, stop right away
731                        if (words[(wordsFound + 2) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
732                            words[wordsFound % BURMESE_LOOKAHEAD].markCurrent();
733                            goto foundBest;
734                        }
735                    }
736                    while (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].backUp(text));
737                }
738            }
739            while (words[wordsFound % BURMESE_LOOKAHEAD].backUp(text));
740foundBest:
741            cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
742            cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
743            wordsFound += 1;
744        }
745
746        // We come here after having either found a word or not. We look ahead to the
747        // next word. If it's not a dictionary word, we will combine it withe the word we
748        // just found (if there is one), but only if the preceding word does not exceed
749        // the threshold.
750        // The text iterator should now be positioned at the end of the word we found.
751        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < BURMESE_ROOT_COMBINE_THRESHOLD) {
752            // if it is a dictionary word, do nothing. If it isn't, then if there is
753            // no preceding word, or the non-word shares less than the minimum threshold
754            // of characters with a dictionary word, then scan to resynchronize
755            if (words[wordsFound % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
756                  && (cuWordLength == 0
757                      || words[wordsFound%BURMESE_LOOKAHEAD].longestPrefix() < BURMESE_PREFIX_COMBINE_THRESHOLD)) {
758                // Look for a plausible word boundary
759                int32_t remaining = rangeEnd - (current + cuWordLength);
760                UChar32 pc;
761                UChar32 uc;
762                int32_t chars = 0;
763                for (;;) {
764                    int32_t pcIndex = utext_getNativeIndex(text);
765                    pc = utext_next32(text);
766                    int32_t pcSize = utext_getNativeIndex(text) - pcIndex;
767                    chars += pcSize;
768                    remaining -= pcSize;
769                    if (remaining <= 0) {
770                        break;
771                    }
772                    uc = utext_current32(text);
773                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
774                        // Maybe. See if it's in the dictionary.
775                        // TODO: this looks iffy; compare with old code.
776                        int32_t candidates = words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
777                        utext_setNativeIndex(text, current + cuWordLength + chars);
778                        if (candidates > 0) {
779                            break;
780                        }
781                    }
782                }
783
784                // Bump the word count if there wasn't already one
785                if (cuWordLength <= 0) {
786                    wordsFound += 1;
787                }
788
789                // Update the length with the passed-over characters
790                cuWordLength += chars;
791            }
792            else {
793                // Back up to where we were for next iteration
794                utext_setNativeIndex(text, current + cuWordLength);
795            }
796        }
797
798        // Never stop before a combining mark.
799        int32_t currPos;
800        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
801            utext_next32(text);
802            cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
803        }
804
805        // Look ahead for possible suffixes if a dictionary word does not follow.
806        // We do this in code rather than using a rule so that the heuristic
807        // resynch continues to function. For example, one of the suffix characters
808        // could be a typo in the middle of a word.
809        // NOT CURRENTLY APPLICABLE TO BURMESE
810
811        // Did we find a word on this iteration? If so, push it on the break stack
812        if (cuWordLength > 0) {
813            foundBreaks.push((current+cuWordLength), status);
814        }
815    }
816
817    // Don't return a break for the end of the dictionary range if there is one there.
818    if (foundBreaks.peeki() >= rangeEnd) {
819        (void) foundBreaks.popi();
820        wordsFound -= 1;
821    }
822
823    return wordsFound;
824}
825
826/*
827 ******************************************************************
828 * KhmerBreakEngine
829 */
830
831// How many words in a row are "good enough"?
832static const int32_t KHMER_LOOKAHEAD = 3;
833
834// Will not combine a non-word with a preceding dictionary word longer than this
835static const int32_t KHMER_ROOT_COMBINE_THRESHOLD = 3;
836
837// Will not combine a non-word that shares at least this much prefix with a
838// dictionary word, with a preceding word
839static const int32_t KHMER_PREFIX_COMBINE_THRESHOLD = 3;
840
841// Minimum word size
842static const int32_t KHMER_MIN_WORD = 2;
843
844// Minimum number of characters for two words
845static const int32_t KHMER_MIN_WORD_SPAN = KHMER_MIN_WORD * 2;
846
847KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
848    : DictionaryBreakEngine((1 << UBRK_WORD) | (1 << UBRK_LINE)),
849      fDictionary(adoptDictionary)
850{
851    fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status);
852    if (U_SUCCESS(status)) {
853        setCharacters(fKhmerWordSet);
854    }
855    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
856    fMarkSet.add(0x0020);
857    fEndWordSet = fKhmerWordSet;
858    fBeginWordSet.add(0x1780, 0x17B3);
859    //fBeginWordSet.add(0x17A3, 0x17A4);      // deprecated vowels
860    //fEndWordSet.remove(0x17A5, 0x17A9);     // Khmer independent vowels that can't end a word
861    //fEndWordSet.remove(0x17B2);             // Khmer independent vowel that can't end a word
862    fEndWordSet.remove(0x17D2);             // KHMER SIGN COENG that combines some following characters
863    //fEndWordSet.remove(0x17B6, 0x17C5);     // Remove dependent vowels
864//    fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
865//    fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
866//    fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
867//    fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
868//    fSuffixSet.add(THAI_PAIYANNOI);
869//    fSuffixSet.add(THAI_MAIYAMOK);
870
871    // Compact for caching.
872    fMarkSet.compact();
873    fEndWordSet.compact();
874    fBeginWordSet.compact();
875//    fSuffixSet.compact();
876}
877
878KhmerBreakEngine::~KhmerBreakEngine() {
879    delete fDictionary;
880}
881
882int32_t
883KhmerBreakEngine::divideUpDictionaryRange( UText *text,
884                                                int32_t rangeStart,
885                                                int32_t rangeEnd,
886                                                UStack &foundBreaks ) const {
887    if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
888        return 0;       // Not enough characters for two words
889    }
890
891    uint32_t wordsFound = 0;
892    int32_t cpWordLength = 0;
893    int32_t cuWordLength = 0;
894    int32_t current;
895    UErrorCode status = U_ZERO_ERROR;
896    PossibleWord words[KHMER_LOOKAHEAD];
897
898    utext_setNativeIndex(text, rangeStart);
899
900    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
901        cuWordLength = 0;
902        cpWordLength = 0;
903
904        // Look for candidate words at the current position
905        int32_t candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
906
907        // If we found exactly one, use that
908        if (candidates == 1) {
909            cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
910            cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
911            wordsFound += 1;
912        }
913
914        // If there was more than one, see which one can take us forward the most words
915        else if (candidates > 1) {
916            // If we're already at the end of the range, we're done
917            if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
918                goto foundBest;
919            }
920            do {
921                int32_t wordsMatched = 1;
922                if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
923                    if (wordsMatched < 2) {
924                        // Followed by another dictionary word; mark first word as a good candidate
925                        words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
926                        wordsMatched = 2;
927                    }
928
929                    // If we're already at the end of the range, we're done
930                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
931                        goto foundBest;
932                    }
933
934                    // See if any of the possible second words is followed by a third word
935                    do {
936                        // If we find a third word, stop right away
937                        if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
938                            words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
939                            goto foundBest;
940                        }
941                    }
942                    while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text));
943                }
944            }
945            while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text));
946foundBest:
947            cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
948            cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
949            wordsFound += 1;
950        }
951
952        // We come here after having either found a word or not. We look ahead to the
953        // next word. If it's not a dictionary word, we will combine it with the word we
954        // just found (if there is one), but only if the preceding word does not exceed
955        // the threshold.
956        // The text iterator should now be positioned at the end of the word we found.
957        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < KHMER_ROOT_COMBINE_THRESHOLD) {
958            // if it is a dictionary word, do nothing. If it isn't, then if there is
959            // no preceding word, or the non-word shares less than the minimum threshold
960            // of characters with a dictionary word, then scan to resynchronize
961            if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
962                  && (cuWordLength == 0
963                      || words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
964                // Look for a plausible word boundary
965                int32_t remaining = rangeEnd - (current+cuWordLength);
966                UChar32 pc;
967                UChar32 uc;
968                int32_t chars = 0;
969                for (;;) {
970                    int32_t pcIndex = utext_getNativeIndex(text);
971                    pc = utext_next32(text);
972                    int32_t pcSize = utext_getNativeIndex(text) - pcIndex;
973                    chars += pcSize;
974                    remaining -= pcSize;
975                    if (remaining <= 0) {
976                        break;
977                    }
978                    uc = utext_current32(text);
979                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
980                        // Maybe. See if it's in the dictionary.
981                        int32_t candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
982                        utext_setNativeIndex(text, current+cuWordLength+chars);
983                        if (candidates > 0) {
984                            break;
985                        }
986                    }
987                }
988
989                // Bump the word count if there wasn't already one
990                if (cuWordLength <= 0) {
991                    wordsFound += 1;
992                }
993
994                // Update the length with the passed-over characters
995                cuWordLength += chars;
996            }
997            else {
998                // Back up to where we were for next iteration
999                utext_setNativeIndex(text, current+cuWordLength);
1000            }
1001        }
1002
1003        // Never stop before a combining mark.
1004        int32_t currPos;
1005        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
1006            utext_next32(text);
1007            cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
1008        }
1009
1010        // Look ahead for possible suffixes if a dictionary word does not follow.
1011        // We do this in code rather than using a rule so that the heuristic
1012        // resynch continues to function. For example, one of the suffix characters
1013        // could be a typo in the middle of a word.
1014//        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
1015//            if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
1016//                && fSuffixSet.contains(uc = utext_current32(text))) {
1017//                if (uc == KHMER_PAIYANNOI) {
1018//                    if (!fSuffixSet.contains(utext_previous32(text))) {
1019//                        // Skip over previous end and PAIYANNOI
1020//                        utext_next32(text);
1021//                        utext_next32(text);
1022//                        wordLength += 1;            // Add PAIYANNOI to word
1023//                        uc = utext_current32(text);     // Fetch next character
1024//                    }
1025//                    else {
1026//                        // Restore prior position
1027//                        utext_next32(text);
1028//                    }
1029//                }
1030//                if (uc == KHMER_MAIYAMOK) {
1031//                    if (utext_previous32(text) != KHMER_MAIYAMOK) {
1032//                        // Skip over previous end and MAIYAMOK
1033//                        utext_next32(text);
1034//                        utext_next32(text);
1035//                        wordLength += 1;            // Add MAIYAMOK to word
1036//                    }
1037//                    else {
1038//                        // Restore prior position
1039//                        utext_next32(text);
1040//                    }
1041//                }
1042//            }
1043//            else {
1044//                utext_setNativeIndex(text, current+wordLength);
1045//            }
1046//        }
1047
1048        // Did we find a word on this iteration? If so, push it on the break stack
1049        if (cuWordLength > 0) {
1050            foundBreaks.push((current+cuWordLength), status);
1051        }
1052    }
1053
1054    // Don't return a break for the end of the dictionary range if there is one there.
1055    if (foundBreaks.peeki() >= rangeEnd) {
1056        (void) foundBreaks.popi();
1057        wordsFound -= 1;
1058    }
1059
1060    return wordsFound;
1061}
1062
1063#if !UCONFIG_NO_NORMALIZATION
1064/*
1065 ******************************************************************
1066 * CjkBreakEngine
1067 */
1068static const uint32_t kuint32max = 0xFFFFFFFF;
1069CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status)
1070: DictionaryBreakEngine(1 << UBRK_WORD), fDictionary(adoptDictionary) {
1071    // Korean dictionary only includes Hangul syllables
1072    fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status);
1073    fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);
1074    fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status);
1075    fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status);
1076    nfkcNorm2 = Normalizer2::getNFKCInstance(status);
1077
1078    if (U_SUCCESS(status)) {
1079        // handle Korean and Japanese/Chinese using different dictionaries
1080        if (type == kKorean) {
1081            setCharacters(fHangulWordSet);
1082        } else { //Chinese and Japanese
1083            UnicodeSet cjSet;
1084            cjSet.addAll(fHanWordSet);
1085            cjSet.addAll(fKatakanaWordSet);
1086            cjSet.addAll(fHiraganaWordSet);
1087            cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
1088            cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK
1089            setCharacters(cjSet);
1090        }
1091    }
1092}
1093
1094CjkBreakEngine::~CjkBreakEngine(){
1095    delete fDictionary;
1096}
1097
1098// The katakanaCost values below are based on the length frequencies of all
1099// katakana phrases in the dictionary
1100static const int32_t kMaxKatakanaLength = 8;
1101static const int32_t kMaxKatakanaGroupLength = 20;
1102static const uint32_t maxSnlp = 255;
1103
1104static inline uint32_t getKatakanaCost(int32_t wordLength){
1105    //TODO: fill array with actual values from dictionary!
1106    static const uint32_t katakanaCost[kMaxKatakanaLength + 1]
1107                                       = {8192, 984, 408, 240, 204, 252, 300, 372, 480};
1108    return (wordLength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordLength];
1109}
1110
1111static inline bool isKatakana(uint16_t value) {
1112    return (value >= 0x30A1u && value <= 0x30FEu && value != 0x30FBu) ||
1113            (value >= 0xFF66u && value <= 0xFF9fu);
1114}
1115
1116
1117// Function for accessing internal utext flags.
1118//   Replicates an internal UText function.
1119
1120static inline int32_t utext_i32_flag(int32_t bitIndex) {
1121    return (int32_t)1 << bitIndex;
1122}
1123
1124
1125/*
1126 * @param text A UText representing the text
1127 * @param rangeStart The start of the range of dictionary characters
1128 * @param rangeEnd The end of the range of dictionary characters
1129 * @param foundBreaks Output of C array of int32_t break positions, or 0
1130 * @return The number of breaks found
1131 */
1132int32_t
1133CjkBreakEngine::divideUpDictionaryRange( UText *inText,
1134        int32_t rangeStart,
1135        int32_t rangeEnd,
1136        UStack &foundBreaks ) const {
1137    if (rangeStart >= rangeEnd) {
1138        return 0;
1139    }
1140
1141    // UnicodeString version of input UText, NFKC normalized if necessary.
1142    UnicodeString inString;
1143
1144    // inputMap[inStringIndex] = corresponding native index from UText inText.
1145    // If NULL then mapping is 1:1
1146    LocalPointer<UVector32>     inputMap;
1147
1148    UErrorCode     status      = U_ZERO_ERROR;
1149
1150
1151    // if UText has the input string as one contiguous UTF-16 chunk
1152    if ((inText->providerProperties & utext_i32_flag(UTEXT_PROVIDER_STABLE_CHUNKS)) &&
1153         inText->chunkNativeStart <= rangeStart &&
1154         inText->chunkNativeLimit >= rangeEnd   &&
1155         inText->nativeIndexingLimit >= rangeEnd - inText->chunkNativeStart) {
1156
1157        // Input UText is in one contiguous UTF-16 chunk.
1158        // Use Read-only aliasing UnicodeString.
1159        inString.setTo(FALSE,
1160                       inText->chunkContents + rangeStart - inText->chunkNativeStart,
1161                       rangeEnd - rangeStart);
1162    } else {
1163        // Copy the text from the original inText (UText) to inString (UnicodeString).
1164        // Create a map from UnicodeString indices -> UText offsets.
1165        utext_setNativeIndex(inText, rangeStart);
1166        int32_t limit = rangeEnd;
1167        U_ASSERT(limit <= utext_nativeLength(inText));
1168        if (limit > utext_nativeLength(inText)) {
1169            limit = utext_nativeLength(inText);
1170        }
1171        inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);
1172        if (U_FAILURE(status)) {
1173            return 0;
1174        }
1175        while (utext_getNativeIndex(inText) < limit) {
1176            int32_t nativePosition = utext_getNativeIndex(inText);
1177            UChar32 c = utext_next32(inText);
1178            U_ASSERT(c != U_SENTINEL);
1179            inString.append(c);
1180            while (inputMap->size() < inString.length()) {
1181                inputMap->addElement(nativePosition, status);
1182            }
1183        }
1184        inputMap->addElement(limit, status);
1185    }
1186
1187
1188    if (!nfkcNorm2->isNormalized(inString, status)) {
1189        UnicodeString normalizedInput;
1190        //  normalizedMap[normalizedInput position] ==  original UText position.
1191        LocalPointer<UVector32> normalizedMap(new UVector32(status), status);
1192        if (U_FAILURE(status)) {
1193            return 0;
1194        }
1195
1196        UnicodeString fragment;
1197        UnicodeString normalizedFragment;
1198        for (int32_t srcI = 0; srcI < inString.length();) {  // Once per normalization chunk
1199            fragment.remove();
1200            int32_t fragmentStartI = srcI;
1201            UChar32 c = inString.char32At(srcI);
1202            for (;;) {
1203                fragment.append(c);
1204                srcI = inString.moveIndex32(srcI, 1);
1205                if (srcI == inString.length()) {
1206                    break;
1207                }
1208                c = inString.char32At(srcI);
1209                if (nfkcNorm2->hasBoundaryBefore(c)) {
1210                    break;
1211                }
1212            }
1213            nfkcNorm2->normalize(fragment, normalizedFragment, status);
1214            normalizedInput.append(normalizedFragment);
1215
1216            // Map every position in the normalized chunk to the start of the chunk
1217            //   in the original input.
1218            int32_t fragmentOriginalStart = inputMap.isValid() ?
1219                    inputMap->elementAti(fragmentStartI) : fragmentStartI+rangeStart;
1220            while (normalizedMap->size() < normalizedInput.length()) {
1221                normalizedMap->addElement(fragmentOriginalStart, status);
1222                if (U_FAILURE(status)) {
1223                    break;
1224                }
1225            }
1226        }
1227        U_ASSERT(normalizedMap->size() == normalizedInput.length());
1228        int32_t nativeEnd = inputMap.isValid() ?
1229                inputMap->elementAti(inString.length()) : inString.length()+rangeStart;
1230        normalizedMap->addElement(nativeEnd, status);
1231
1232        inputMap.moveFrom(normalizedMap);
1233        inString.moveFrom(normalizedInput);
1234    }
1235
1236    int32_t numCodePts = inString.countChar32();
1237    if (numCodePts != inString.length()) {
1238        // There are supplementary characters in the input.
1239        // The dictionary will produce boundary positions in terms of code point indexes,
1240        //   not in terms of code unit string indexes.
1241        // Use the inputMap mechanism to take care of this in addition to indexing differences
1242        //    from normalization and/or UTF-8 input.
1243        UBool hadExistingMap = inputMap.isValid();
1244        if (!hadExistingMap) {
1245            inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);
1246            if (U_FAILURE(status)) {
1247                return 0;
1248            }
1249        }
1250        int32_t cpIdx = 0;
1251        for (int32_t cuIdx = 0; ; cuIdx = inString.moveIndex32(cuIdx, 1)) {
1252            U_ASSERT(cuIdx >= cpIdx);
1253            if (hadExistingMap) {
1254                inputMap->setElementAt(inputMap->elementAti(cuIdx), cpIdx);
1255            } else {
1256                inputMap->addElement(cuIdx+rangeStart, status);
1257            }
1258            cpIdx++;
1259            if (cuIdx == inString.length()) {
1260               break;
1261            }
1262        }
1263    }
1264
1265    // bestSnlp[i] is the snlp of the best segmentation of the first i
1266    // code points in the range to be matched.
1267    UVector32 bestSnlp(numCodePts + 1, status);
1268    bestSnlp.addElement(0, status);
1269    for(int32_t i = 1; i <= numCodePts; i++) {
1270        bestSnlp.addElement(kuint32max, status);
1271    }
1272
1273
1274    // prev[i] is the index of the last CJK code point in the previous word in
1275    // the best segmentation of the first i characters.
1276    UVector32 prev(numCodePts + 1, status);
1277    for(int32_t i = 0; i <= numCodePts; i++){
1278        prev.addElement(-1, status);
1279    }
1280
1281    const int32_t maxWordSize = 20;
1282    UVector32 values(numCodePts, status);
1283    values.setSize(numCodePts);
1284    UVector32 lengths(numCodePts, status);
1285    lengths.setSize(numCodePts);
1286
1287    UText fu = UTEXT_INITIALIZER;
1288    utext_openUnicodeString(&fu, &inString, &status);
1289
1290    // Dynamic programming to find the best segmentation.
1291
1292    // In outer loop, i  is the code point index,
1293    //                ix is the corresponding string (code unit) index.
1294    //    They differ when the string contains supplementary characters.
1295    int32_t ix = 0;
1296    for (int32_t i = 0;  i < numCodePts;  ++i, ix = inString.moveIndex32(ix, 1)) {
1297        if ((uint32_t)bestSnlp.elementAti(i) == kuint32max) {
1298            continue;
1299        }
1300
1301        int32_t count;
1302        utext_setNativeIndex(&fu, ix);
1303        count = fDictionary->matches(&fu, maxWordSize, numCodePts,
1304                             NULL, lengths.getBuffer(), values.getBuffer(), NULL);
1305                             // Note: lengths is filled with code point lengths
1306                             //       The NULL parameter is the ignored code unit lengths.
1307
1308        // if there are no single character matches found in the dictionary
1309        // starting with this charcter, treat character as a 1-character word
1310        // with the highest value possible, i.e. the least likely to occur.
1311        // Exclude Korean characters from this treatment, as they should be left
1312        // together by default.
1313        if ((count == 0 || lengths.elementAti(0) != 1) &&
1314                !fHangulWordSet.contains(inString.char32At(ix))) {
1315            values.setElementAt(maxSnlp, count);   // 255
1316            lengths.setElementAt(1, count++);
1317        }
1318
1319        for (int32_t j = 0; j < count; j++) {
1320            uint32_t newSnlp = (uint32_t)bestSnlp.elementAti(i) + (uint32_t)values.elementAti(j);
1321            int32_t ln_j_i = lengths.elementAti(j) + i;
1322            if (newSnlp < (uint32_t)bestSnlp.elementAti(ln_j_i)) {
1323                bestSnlp.setElementAt(newSnlp, ln_j_i);
1324                prev.setElementAt(i, ln_j_i);
1325            }
1326        }
1327
1328        // In Japanese,
1329        // Katakana word in single character is pretty rare. So we apply
1330        // the following heuristic to Katakana: any continuous run of Katakana
1331        // characters is considered a candidate word with a default cost
1332        // specified in the katakanaCost table according to its length.
1333
1334        bool is_prev_katakana = false;
1335        bool is_katakana = isKatakana(inString.char32At(ix));
1336        int32_t katakanaRunLength = 1;
1337        if (!is_prev_katakana && is_katakana) {
1338            int32_t j = inString.moveIndex32(ix, 1);
1339            // Find the end of the continuous run of Katakana characters
1340            while (j < inString.length() && katakanaRunLength < kMaxKatakanaGroupLength &&
1341                    isKatakana(inString.char32At(j))) {
1342                j = inString.moveIndex32(j, 1);
1343                katakanaRunLength++;
1344            }
1345            if (katakanaRunLength < kMaxKatakanaGroupLength) {
1346                uint32_t newSnlp = bestSnlp.elementAti(i) + getKatakanaCost(katakanaRunLength);
1347                if (newSnlp < (uint32_t)bestSnlp.elementAti(j)) {
1348                    bestSnlp.setElementAt(newSnlp, j);
1349                    prev.setElementAt(i, i+katakanaRunLength);  // prev[j] = i;
1350                }
1351            }
1352        }
1353        is_prev_katakana = is_katakana;
1354    }
1355    utext_close(&fu);
1356
1357    // Start pushing the optimal offset index into t_boundary (t for tentative).
1358    // prev[numCodePts] is guaranteed to be meaningful.
1359    // We'll first push in the reverse order, i.e.,
1360    // t_boundary[0] = numCodePts, and afterwards do a swap.
1361    UVector32 t_boundary(numCodePts+1, status);
1362
1363    int32_t numBreaks = 0;
1364    // No segmentation found, set boundary to end of range
1365    if ((uint32_t)bestSnlp.elementAti(numCodePts) == kuint32max) {
1366        t_boundary.addElement(numCodePts, status);
1367        numBreaks++;
1368    } else {
1369        for (int32_t i = numCodePts; i > 0; i = prev.elementAti(i)) {
1370            t_boundary.addElement(i, status);
1371            numBreaks++;
1372        }
1373        U_ASSERT(prev.elementAti(t_boundary.elementAti(numBreaks - 1)) == 0);
1374    }
1375
1376    // Add a break for the start of the dictionary range if there is not one
1377    // there already.
1378    if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) {
1379        t_boundary.addElement(0, status);
1380        numBreaks++;
1381    }
1382
1383    // Now that we're done, convert positions in t_boundary[] (indices in
1384    // the normalized input string) back to indices in the original input UText
1385    // while reversing t_boundary and pushing values to foundBreaks.
1386    for (int32_t i = numBreaks-1; i >= 0; i--) {
1387        int32_t cpPos = t_boundary.elementAti(i);
1388        int32_t utextPos =  inputMap.isValid() ? inputMap->elementAti(cpPos) : cpPos + rangeStart;
1389        // Boundaries are added to foundBreaks output in ascending order.
1390        U_ASSERT(foundBreaks.size() == 0 ||foundBreaks.peeki() < utextPos);
1391        foundBreaks.push(utextPos, status);
1392    }
1393
1394    // inString goes out of scope
1395    // inputMap goes out of scope
1396    return numBreaks;
1397}
1398#endif
1399
1400U_NAMESPACE_END
1401
1402#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
1403
1404