1ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/**
2ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *******************************************************************************
3fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Copyright (C) 2006-2014, International Business Machines Corporation
454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * and others. All Rights Reserved.
5ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *******************************************************************************
6ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */
7ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
8ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/utypes.h"
9ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
10ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#if !UCONFIG_NO_BREAK_ITERATION
11ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
12ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "brkeng.h"
13ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "dictbe.h"
14ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/uniset.h"
15ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/chariter.h"
16ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/ubrk.h"
17ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "uvector.h"
1854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius#include "uassert.h"
1954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius#include "unicode/normlzr.h"
2054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius#include "cmemory.h"
2154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius#include "dictionarydata.h"
22ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
23ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_NAMESPACE_BEGIN
24ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
25ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*
26ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ******************************************************************
27ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */
28ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
29ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruDictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes) {
30ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fTypes = breakTypes;
31ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
32ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
33ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruDictionaryBreakEngine::~DictionaryBreakEngine() {
34ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
35ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
36ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUBool
37ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruDictionaryBreakEngine::handles(UChar32 c, int32_t breakType) const {
38ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)
39ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            && fSet.contains(c));
40ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
41ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
42ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t
43ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruDictionaryBreakEngine::findBreaks( UText *text,
44ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                 int32_t startPos,
45ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                 int32_t endPos,
46ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                 UBool reverse,
47ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                 int32_t breakType,
48ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                 UStack &foundBreaks ) const {
49ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t result = 0;
50ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
51ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // Find the span of characters included in the set.
52fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    //   The span to break begins at the current position in the text, and
53fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    //   extends towards the start or end of the text, depending on 'reverse'.
54fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
55ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t start = (int32_t)utext_getNativeIndex(text);
56ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t current;
57ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t rangeStart;
58ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t rangeEnd;
59ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UChar32 c = utext_current32(text);
60ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (reverse) {
61ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        UBool   isDict = fSet.contains(c);
62ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        while((current = (int32_t)utext_getNativeIndex(text)) > startPos && isDict) {
63ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            c = utext_previous32(text);
64ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            isDict = fSet.contains(c);
65ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
66ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        rangeStart = (current < startPos) ? startPos : current+(isDict ? 0 : 1);
67ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        rangeEnd = start + 1;
68ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
69ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    else {
70ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) {
71ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            utext_next32(text);         // TODO:  recast loop for postincrement
72ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            c = utext_current32(text);
73ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
74ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        rangeStart = start;
75ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        rangeEnd = current;
76ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
77ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)) {
78ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
79ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        utext_setNativeIndex(text, current);
80ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
81ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
82ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return result;
83ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
84ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
85ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruvoid
86ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruDictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {
87ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fSet = set;
88ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // Compact for caching
89ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fSet.compact();
90ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
91ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
92ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*
93ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ******************************************************************
9459d709d503bab6e2b61931737e662dd293b40578ccornelius * PossibleWord
95ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */
96ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
9759d709d503bab6e2b61931737e662dd293b40578ccornelius// Helper class for improving readability of the Thai/Lao/Khmer word break
98ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// algorithm. The implementation is completely inline.
99ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
100ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// List size, limited by the maximum number of words in the dictionary
101ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// that form a nested sequence.
102ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define POSSIBLE_WORD_LIST_MAX 20
103ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
104ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruclass PossibleWord {
10554dcd9b6a06071f647dac967e9e267abb9410720Craig Corneliusprivate:
10654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    // list of word candidate lengths, in increasing length order
10754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    int32_t   lengths[POSSIBLE_WORD_LIST_MAX];
10854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    int32_t   count;      // Count of candidates
10954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    int32_t   prefix;     // The longest match with a dictionary word
11054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    int32_t   offset;     // Offset in the text of these candidates
11154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    int       mark;       // The preferred candidate's offset
11254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    int       current;    // The candidate we're currently looking at
11354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius
11454dcd9b6a06071f647dac967e9e267abb9410720Craig Corneliuspublic:
11554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    PossibleWord();
11654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    ~PossibleWord();
117ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
11854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    // Fill the list of candidates if needed, select the longest, and return the number found
11954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    int       candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd );
120ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
12154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    // Select the currently marked candidate, point after it in the text, and invalidate self
12254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    int32_t   acceptMarked( UText *text );
123ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
12454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    // Back up from the current candidate to the next shorter one; return TRUE if that exists
12554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    // and point the text after it
12654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    UBool     backUp( UText *text );
127ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
12854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    // Return the longest prefix this candidate location shares with a dictionary word
12954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    int32_t   longestPrefix();
130ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
13154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    // Mark the current candidate as the one we like
13254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    void      markCurrent();
133ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru};
134ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
135ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruinline
136ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruPossibleWord::PossibleWord() {
137ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    offset = -1;
138ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
139ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
140ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruinline
141ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruPossibleWord::~PossibleWord() {
142ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
143ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
144ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruinline int
14554dcd9b6a06071f647dac967e9e267abb9410720Craig CorneliusPossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) {
146ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // TODO: If getIndex is too slow, use offset < 0 and add discardAll()
147ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t start = (int32_t)utext_getNativeIndex(text);
148ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (start != offset) {
149ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        offset = start;
150ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        prefix = dict->matches(text, rangeEnd-start, lengths, count, sizeof(lengths)/sizeof(lengths[0]));
151ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Dictionary leaves text after longest prefix, not longest word. Back up.
152ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if (count <= 0) {
153ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            utext_setNativeIndex(text, start);
154ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
155ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
156ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (count > 0) {
157ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        utext_setNativeIndex(text, start+lengths[count-1]);
158ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
159ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    current = count-1;
160ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    mark = current;
161ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return count;
162ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
163ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
164ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruinline int32_t
165ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruPossibleWord::acceptMarked( UText *text ) {
166ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    utext_setNativeIndex(text, offset + lengths[mark]);
167ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return lengths[mark];
168ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
169ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
170ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruinline UBool
171ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruPossibleWord::backUp( UText *text ) {
172ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (current > 0) {
173ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        utext_setNativeIndex(text, offset + lengths[--current]);
174ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return TRUE;
175ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
176ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return FALSE;
177ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
178ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
179ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruinline int32_t
180ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruPossibleWord::longestPrefix() {
181ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return prefix;
182ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
183ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
184ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruinline void
185ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruPossibleWord::markCurrent() {
186ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    mark = current;
187ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
188ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
18959d709d503bab6e2b61931737e662dd293b40578ccornelius/*
19059d709d503bab6e2b61931737e662dd293b40578ccornelius ******************************************************************
19159d709d503bab6e2b61931737e662dd293b40578ccornelius * ThaiBreakEngine
19259d709d503bab6e2b61931737e662dd293b40578ccornelius */
19359d709d503bab6e2b61931737e662dd293b40578ccornelius
194ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// How many words in a row are "good enough"?
195ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define THAI_LOOKAHEAD 3
196ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
197ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Will not combine a non-word with a preceding dictionary word longer than this
198ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define THAI_ROOT_COMBINE_THRESHOLD 3
199ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
200ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Will not combine a non-word that shares at least this much prefix with a
201ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// dictionary word, with a preceding word
202ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define THAI_PREFIX_COMBINE_THRESHOLD 3
203ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
204ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Ellision character
205ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define THAI_PAIYANNOI 0x0E2F
206ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
207ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Repeat character
208ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define THAI_MAIYAMOK 0x0E46
209ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
210ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Minimum word size
211ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define THAI_MIN_WORD 2
212ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
213ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Minimum number of characters for two words
214ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define THAI_MIN_WORD_SPAN (THAI_MIN_WORD * 2)
215ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
21654dcd9b6a06071f647dac967e9e267abb9410720Craig CorneliusThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
217ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
218ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru      fDictionary(adoptDictionary)
219ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
220ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status);
221ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (U_SUCCESS(status)) {
222ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        setCharacters(fThaiWordSet);
223ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
224ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);
22585bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    fMarkSet.add(0x0020);
226ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fEndWordSet = fThaiWordSet;
227ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
228ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
229ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
230ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
231ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fSuffixSet.add(THAI_PAIYANNOI);
232ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fSuffixSet.add(THAI_MAIYAMOK);
233ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
234ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // Compact for caching.
235ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fMarkSet.compact();
236ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fEndWordSet.compact();
237ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fBeginWordSet.compact();
238ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fSuffixSet.compact();
239ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
240ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
241ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruThaiBreakEngine::~ThaiBreakEngine() {
242ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    delete fDictionary;
243ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
244ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
245ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t
246ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruThaiBreakEngine::divideUpDictionaryRange( UText *text,
247ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                                int32_t rangeStart,
248ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                                int32_t rangeEnd,
249ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                                UStack &foundBreaks ) const {
250ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if ((rangeEnd - rangeStart) < THAI_MIN_WORD_SPAN) {
251ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return 0;       // Not enough characters for two words
252ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
253ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
254ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    uint32_t wordsFound = 0;
255ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t wordLength;
256ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t current;
257ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UErrorCode status = U_ZERO_ERROR;
258ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    PossibleWord words[THAI_LOOKAHEAD];
259ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UChar32 uc;
260ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
261ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    utext_setNativeIndex(text, rangeStart);
262ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
263ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
264ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        wordLength = 0;
265ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
266ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Look for candidate words at the current position
267ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        int candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
268ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
269ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // If we found exactly one, use that
270ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if (candidates == 1) {
27154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius            wordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
272ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            wordsFound += 1;
273ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
274ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // If there was more than one, see which one can take us forward the most words
275ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        else if (candidates > 1) {
276ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // If we're already at the end of the range, we're done
277ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
278ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                goto foundBest;
279ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
280ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            do {
281ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                int wordsMatched = 1;
28254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius                if (words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
283ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if (wordsMatched < 2) {
284ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        // Followed by another dictionary word; mark first word as a good candidate
285ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        words[wordsFound%THAI_LOOKAHEAD].markCurrent();
286ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        wordsMatched = 2;
287ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
288ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
289ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // If we're already at the end of the range, we're done
290ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
291ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        goto foundBest;
292ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
293ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
294ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // See if any of the possible second words is followed by a third word
295ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    do {
296ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        // If we find a third word, stop right away
29754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius                        if (words[(wordsFound + 2) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
29854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius                            words[wordsFound % THAI_LOOKAHEAD].markCurrent();
299ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                            goto foundBest;
300ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        }
301ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
30254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius                    while (words[(wordsFound + 1) % THAI_LOOKAHEAD].backUp(text));
303ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
304ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
30554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius            while (words[wordsFound % THAI_LOOKAHEAD].backUp(text));
306ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QuerufoundBest:
30754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius            wordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
308ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            wordsFound += 1;
309ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
310ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
311ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // We come here after having either found a word or not. We look ahead to the
312ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // next word. If it's not a dictionary word, we will combine it withe the word we
313ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // just found (if there is one), but only if the preceding word does not exceed
314ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // the threshold.
315ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // The text iterator should now be positioned at the end of the word we found.
316ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < THAI_ROOT_COMBINE_THRESHOLD) {
317ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // if it is a dictionary word, do nothing. If it isn't, then if there is
318ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // no preceding word, or the non-word shares less than the minimum threshold
319ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // of characters with a dictionary word, then scan to resynchronize
32054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius            if (words[wordsFound % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
321ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                  && (wordLength == 0
322ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                      || words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {
323ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Look for a plausible word boundary
324ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                //TODO: This section will need a rework for UText.
325ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                int32_t remaining = rangeEnd - (current+wordLength);
326ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                UChar32 pc = utext_current32(text);
327ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                int32_t chars = 0;
328ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                for (;;) {
329ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    utext_next32(text);
330ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    uc = utext_current32(text);
331ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // TODO: Here we're counting on the fact that the SA languages are all
332ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // in the BMP. This should get fixed with the UText rework.
333ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    chars += 1;
334ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if (--remaining <= 0) {
335ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        break;
336ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
337ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
338ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        // Maybe. See if it's in the dictionary.
339ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        // NOTE: In the original Apple code, checked that the next
340ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        // two characters after uc were not 0x0E4C THANTHAKHAT before
341ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        // checking the dictionary. That is just a performance filter,
342ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        // but it's not clear it's faster than checking the trie.
34354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius                        int candidates = words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
34454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius                        utext_setNativeIndex(text, current + wordLength + chars);
345ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        if (candidates > 0) {
346ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                            break;
347ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        }
348ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
349ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    pc = uc;
350ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
351ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
352ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Bump the word count if there wasn't already one
353ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if (wordLength <= 0) {
354ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    wordsFound += 1;
355ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
356ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
357ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Update the length with the passed-over characters
358ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                wordLength += chars;
359ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
360ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            else {
361ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Back up to where we were for next iteration
362ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                utext_setNativeIndex(text, current+wordLength);
363ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
364ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
365ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
366ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Never stop before a combining mark.
367ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        int32_t currPos;
368ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
369ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            utext_next32(text);
370ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            wordLength += (int32_t)utext_getNativeIndex(text) - currPos;
371ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
372ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
373ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Look ahead for possible suffixes if a dictionary word does not follow.
374ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // We do this in code rather than using a rule so that the heuristic
375ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // resynch continues to function. For example, one of the suffix characters
376ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // could be a typo in the middle of a word.
377ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
378ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
379ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                && fSuffixSet.contains(uc = utext_current32(text))) {
380ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if (uc == THAI_PAIYANNOI) {
381ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if (!fSuffixSet.contains(utext_previous32(text))) {
382ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        // Skip over previous end and PAIYANNOI
383ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        utext_next32(text);
384ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        utext_next32(text);
385ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        wordLength += 1;            // Add PAIYANNOI to word
386ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        uc = utext_current32(text);     // Fetch next character
387ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
388ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    else {
389ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        // Restore prior position
390ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        utext_next32(text);
391ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
392ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
393ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if (uc == THAI_MAIYAMOK) {
394ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if (utext_previous32(text) != THAI_MAIYAMOK) {
395ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        // Skip over previous end and MAIYAMOK
396ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        utext_next32(text);
397ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        utext_next32(text);
398ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        wordLength += 1;            // Add MAIYAMOK to word
399ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
400ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    else {
401ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        // Restore prior position
402ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        utext_next32(text);
403ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
404ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
405ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
406ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            else {
407ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                utext_setNativeIndex(text, current+wordLength);
408ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
409ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
410b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
411b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        // Did we find a word on this iteration? If so, push it on the break stack
412b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        if (wordLength > 0) {
413b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            foundBreaks.push((current+wordLength), status);
414b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        }
415b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    }
416b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
417b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    // Don't return a break for the end of the dictionary range if there is one there.
418b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    if (foundBreaks.peeki() >= rangeEnd) {
419b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        (void) foundBreaks.popi();
420b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        wordsFound -= 1;
421b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    }
422b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
423b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    return wordsFound;
424b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho}
425b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
42659d709d503bab6e2b61931737e662dd293b40578ccornelius/*
42759d709d503bab6e2b61931737e662dd293b40578ccornelius ******************************************************************
42859d709d503bab6e2b61931737e662dd293b40578ccornelius * LaoBreakEngine
42959d709d503bab6e2b61931737e662dd293b40578ccornelius */
43059d709d503bab6e2b61931737e662dd293b40578ccornelius
43159d709d503bab6e2b61931737e662dd293b40578ccornelius// How many words in a row are "good enough"?
43259d709d503bab6e2b61931737e662dd293b40578ccornelius#define LAO_LOOKAHEAD 3
43359d709d503bab6e2b61931737e662dd293b40578ccornelius
43459d709d503bab6e2b61931737e662dd293b40578ccornelius// Will not combine a non-word with a preceding dictionary word longer than this
43559d709d503bab6e2b61931737e662dd293b40578ccornelius#define LAO_ROOT_COMBINE_THRESHOLD 3
43659d709d503bab6e2b61931737e662dd293b40578ccornelius
43759d709d503bab6e2b61931737e662dd293b40578ccornelius// Will not combine a non-word that shares at least this much prefix with a
43859d709d503bab6e2b61931737e662dd293b40578ccornelius// dictionary word, with a preceding word
43959d709d503bab6e2b61931737e662dd293b40578ccornelius#define LAO_PREFIX_COMBINE_THRESHOLD 3
44059d709d503bab6e2b61931737e662dd293b40578ccornelius
44159d709d503bab6e2b61931737e662dd293b40578ccornelius// Minimum word size
44259d709d503bab6e2b61931737e662dd293b40578ccornelius#define LAO_MIN_WORD 2
44359d709d503bab6e2b61931737e662dd293b40578ccornelius
44459d709d503bab6e2b61931737e662dd293b40578ccornelius// Minimum number of characters for two words
44559d709d503bab6e2b61931737e662dd293b40578ccornelius#define LAO_MIN_WORD_SPAN (LAO_MIN_WORD * 2)
44659d709d503bab6e2b61931737e662dd293b40578ccornelius
44759d709d503bab6e2b61931737e662dd293b40578ccorneliusLaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
44859d709d503bab6e2b61931737e662dd293b40578ccornelius    : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
44959d709d503bab6e2b61931737e662dd293b40578ccornelius      fDictionary(adoptDictionary)
45059d709d503bab6e2b61931737e662dd293b40578ccornelius{
45159d709d503bab6e2b61931737e662dd293b40578ccornelius    fLaoWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]"), status);
45259d709d503bab6e2b61931737e662dd293b40578ccornelius    if (U_SUCCESS(status)) {
45359d709d503bab6e2b61931737e662dd293b40578ccornelius        setCharacters(fLaoWordSet);
45459d709d503bab6e2b61931737e662dd293b40578ccornelius    }
45559d709d503bab6e2b61931737e662dd293b40578ccornelius    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status);
45659d709d503bab6e2b61931737e662dd293b40578ccornelius    fMarkSet.add(0x0020);
45759d709d503bab6e2b61931737e662dd293b40578ccornelius    fEndWordSet = fLaoWordSet;
45859d709d503bab6e2b61931737e662dd293b40578ccornelius    fEndWordSet.remove(0x0EC0, 0x0EC4);     // prefix vowels
45959d709d503bab6e2b61931737e662dd293b40578ccornelius    fBeginWordSet.add(0x0E81, 0x0EAE);      // basic consonants (including holes for corresponding Thai characters)
46059d709d503bab6e2b61931737e662dd293b40578ccornelius    fBeginWordSet.add(0x0EDC, 0x0EDD);      // digraph consonants (no Thai equivalent)
46159d709d503bab6e2b61931737e662dd293b40578ccornelius    fBeginWordSet.add(0x0EC0, 0x0EC4);      // prefix vowels
46259d709d503bab6e2b61931737e662dd293b40578ccornelius
46359d709d503bab6e2b61931737e662dd293b40578ccornelius    // Compact for caching.
46459d709d503bab6e2b61931737e662dd293b40578ccornelius    fMarkSet.compact();
46559d709d503bab6e2b61931737e662dd293b40578ccornelius    fEndWordSet.compact();
46659d709d503bab6e2b61931737e662dd293b40578ccornelius    fBeginWordSet.compact();
46759d709d503bab6e2b61931737e662dd293b40578ccornelius}
46859d709d503bab6e2b61931737e662dd293b40578ccornelius
46959d709d503bab6e2b61931737e662dd293b40578ccorneliusLaoBreakEngine::~LaoBreakEngine() {
47059d709d503bab6e2b61931737e662dd293b40578ccornelius    delete fDictionary;
47159d709d503bab6e2b61931737e662dd293b40578ccornelius}
47259d709d503bab6e2b61931737e662dd293b40578ccornelius
47359d709d503bab6e2b61931737e662dd293b40578ccorneliusint32_t
47459d709d503bab6e2b61931737e662dd293b40578ccorneliusLaoBreakEngine::divideUpDictionaryRange( UText *text,
47559d709d503bab6e2b61931737e662dd293b40578ccornelius                                                int32_t rangeStart,
47659d709d503bab6e2b61931737e662dd293b40578ccornelius                                                int32_t rangeEnd,
47759d709d503bab6e2b61931737e662dd293b40578ccornelius                                                UStack &foundBreaks ) const {
47859d709d503bab6e2b61931737e662dd293b40578ccornelius    if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) {
47959d709d503bab6e2b61931737e662dd293b40578ccornelius        return 0;       // Not enough characters for two words
48059d709d503bab6e2b61931737e662dd293b40578ccornelius    }
48159d709d503bab6e2b61931737e662dd293b40578ccornelius
48259d709d503bab6e2b61931737e662dd293b40578ccornelius    uint32_t wordsFound = 0;
48359d709d503bab6e2b61931737e662dd293b40578ccornelius    int32_t wordLength;
48459d709d503bab6e2b61931737e662dd293b40578ccornelius    int32_t current;
48559d709d503bab6e2b61931737e662dd293b40578ccornelius    UErrorCode status = U_ZERO_ERROR;
48659d709d503bab6e2b61931737e662dd293b40578ccornelius    PossibleWord words[LAO_LOOKAHEAD];
48759d709d503bab6e2b61931737e662dd293b40578ccornelius    UChar32 uc;
48859d709d503bab6e2b61931737e662dd293b40578ccornelius
48959d709d503bab6e2b61931737e662dd293b40578ccornelius    utext_setNativeIndex(text, rangeStart);
49059d709d503bab6e2b61931737e662dd293b40578ccornelius
49159d709d503bab6e2b61931737e662dd293b40578ccornelius    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
49259d709d503bab6e2b61931737e662dd293b40578ccornelius        wordLength = 0;
49359d709d503bab6e2b61931737e662dd293b40578ccornelius
49459d709d503bab6e2b61931737e662dd293b40578ccornelius        // Look for candidate words at the current position
49559d709d503bab6e2b61931737e662dd293b40578ccornelius        int candidates = words[wordsFound%LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
49659d709d503bab6e2b61931737e662dd293b40578ccornelius
49759d709d503bab6e2b61931737e662dd293b40578ccornelius        // If we found exactly one, use that
49859d709d503bab6e2b61931737e662dd293b40578ccornelius        if (candidates == 1) {
49959d709d503bab6e2b61931737e662dd293b40578ccornelius            wordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
50059d709d503bab6e2b61931737e662dd293b40578ccornelius            wordsFound += 1;
50159d709d503bab6e2b61931737e662dd293b40578ccornelius        }
50259d709d503bab6e2b61931737e662dd293b40578ccornelius        // If there was more than one, see which one can take us forward the most words
50359d709d503bab6e2b61931737e662dd293b40578ccornelius        else if (candidates > 1) {
50459d709d503bab6e2b61931737e662dd293b40578ccornelius            // If we're already at the end of the range, we're done
50559d709d503bab6e2b61931737e662dd293b40578ccornelius            if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
50659d709d503bab6e2b61931737e662dd293b40578ccornelius                goto foundBest;
50759d709d503bab6e2b61931737e662dd293b40578ccornelius            }
50859d709d503bab6e2b61931737e662dd293b40578ccornelius            do {
50959d709d503bab6e2b61931737e662dd293b40578ccornelius                int wordsMatched = 1;
51059d709d503bab6e2b61931737e662dd293b40578ccornelius                if (words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
51159d709d503bab6e2b61931737e662dd293b40578ccornelius                    if (wordsMatched < 2) {
51259d709d503bab6e2b61931737e662dd293b40578ccornelius                        // Followed by another dictionary word; mark first word as a good candidate
51359d709d503bab6e2b61931737e662dd293b40578ccornelius                        words[wordsFound%LAO_LOOKAHEAD].markCurrent();
51459d709d503bab6e2b61931737e662dd293b40578ccornelius                        wordsMatched = 2;
51559d709d503bab6e2b61931737e662dd293b40578ccornelius                    }
51659d709d503bab6e2b61931737e662dd293b40578ccornelius
51759d709d503bab6e2b61931737e662dd293b40578ccornelius                    // If we're already at the end of the range, we're done
51859d709d503bab6e2b61931737e662dd293b40578ccornelius                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
51959d709d503bab6e2b61931737e662dd293b40578ccornelius                        goto foundBest;
52059d709d503bab6e2b61931737e662dd293b40578ccornelius                    }
52159d709d503bab6e2b61931737e662dd293b40578ccornelius
52259d709d503bab6e2b61931737e662dd293b40578ccornelius                    // See if any of the possible second words is followed by a third word
52359d709d503bab6e2b61931737e662dd293b40578ccornelius                    do {
52459d709d503bab6e2b61931737e662dd293b40578ccornelius                        // If we find a third word, stop right away
52559d709d503bab6e2b61931737e662dd293b40578ccornelius                        if (words[(wordsFound + 2) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
52659d709d503bab6e2b61931737e662dd293b40578ccornelius                            words[wordsFound % LAO_LOOKAHEAD].markCurrent();
52759d709d503bab6e2b61931737e662dd293b40578ccornelius                            goto foundBest;
52859d709d503bab6e2b61931737e662dd293b40578ccornelius                        }
52959d709d503bab6e2b61931737e662dd293b40578ccornelius                    }
53059d709d503bab6e2b61931737e662dd293b40578ccornelius                    while (words[(wordsFound + 1) % LAO_LOOKAHEAD].backUp(text));
53159d709d503bab6e2b61931737e662dd293b40578ccornelius                }
53259d709d503bab6e2b61931737e662dd293b40578ccornelius            }
53359d709d503bab6e2b61931737e662dd293b40578ccornelius            while (words[wordsFound % LAO_LOOKAHEAD].backUp(text));
53459d709d503bab6e2b61931737e662dd293b40578ccorneliusfoundBest:
53559d709d503bab6e2b61931737e662dd293b40578ccornelius            wordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
53659d709d503bab6e2b61931737e662dd293b40578ccornelius            wordsFound += 1;
53759d709d503bab6e2b61931737e662dd293b40578ccornelius        }
53859d709d503bab6e2b61931737e662dd293b40578ccornelius
53959d709d503bab6e2b61931737e662dd293b40578ccornelius        // We come here after having either found a word or not. We look ahead to the
54059d709d503bab6e2b61931737e662dd293b40578ccornelius        // next word. If it's not a dictionary word, we will combine it withe the word we
54159d709d503bab6e2b61931737e662dd293b40578ccornelius        // just found (if there is one), but only if the preceding word does not exceed
54259d709d503bab6e2b61931737e662dd293b40578ccornelius        // the threshold.
54359d709d503bab6e2b61931737e662dd293b40578ccornelius        // The text iterator should now be positioned at the end of the word we found.
54459d709d503bab6e2b61931737e662dd293b40578ccornelius        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < LAO_ROOT_COMBINE_THRESHOLD) {
54559d709d503bab6e2b61931737e662dd293b40578ccornelius            // if it is a dictionary word, do nothing. If it isn't, then if there is
54659d709d503bab6e2b61931737e662dd293b40578ccornelius            // no preceding word, or the non-word shares less than the minimum threshold
54759d709d503bab6e2b61931737e662dd293b40578ccornelius            // of characters with a dictionary word, then scan to resynchronize
54859d709d503bab6e2b61931737e662dd293b40578ccornelius            if (words[wordsFound % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
54959d709d503bab6e2b61931737e662dd293b40578ccornelius                  && (wordLength == 0
55059d709d503bab6e2b61931737e662dd293b40578ccornelius                      || words[wordsFound%LAO_LOOKAHEAD].longestPrefix() < LAO_PREFIX_COMBINE_THRESHOLD)) {
55159d709d503bab6e2b61931737e662dd293b40578ccornelius                // Look for a plausible word boundary
55259d709d503bab6e2b61931737e662dd293b40578ccornelius                //TODO: This section will need a rework for UText.
55359d709d503bab6e2b61931737e662dd293b40578ccornelius                int32_t remaining = rangeEnd - (current+wordLength);
55459d709d503bab6e2b61931737e662dd293b40578ccornelius                UChar32 pc = utext_current32(text);
55559d709d503bab6e2b61931737e662dd293b40578ccornelius                int32_t chars = 0;
55659d709d503bab6e2b61931737e662dd293b40578ccornelius                for (;;) {
55759d709d503bab6e2b61931737e662dd293b40578ccornelius                    utext_next32(text);
55859d709d503bab6e2b61931737e662dd293b40578ccornelius                    uc = utext_current32(text);
55959d709d503bab6e2b61931737e662dd293b40578ccornelius                    // TODO: Here we're counting on the fact that the SA languages are all
56059d709d503bab6e2b61931737e662dd293b40578ccornelius                    // in the BMP. This should get fixed with the UText rework.
56159d709d503bab6e2b61931737e662dd293b40578ccornelius                    chars += 1;
56259d709d503bab6e2b61931737e662dd293b40578ccornelius                    if (--remaining <= 0) {
56359d709d503bab6e2b61931737e662dd293b40578ccornelius                        break;
56459d709d503bab6e2b61931737e662dd293b40578ccornelius                    }
56559d709d503bab6e2b61931737e662dd293b40578ccornelius                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
56659d709d503bab6e2b61931737e662dd293b40578ccornelius                        // Maybe. See if it's in the dictionary.
56759d709d503bab6e2b61931737e662dd293b40578ccornelius                        int candidates = words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
56859d709d503bab6e2b61931737e662dd293b40578ccornelius                        utext_setNativeIndex(text, current + wordLength + chars);
56959d709d503bab6e2b61931737e662dd293b40578ccornelius                        if (candidates > 0) {
57059d709d503bab6e2b61931737e662dd293b40578ccornelius                            break;
57159d709d503bab6e2b61931737e662dd293b40578ccornelius                        }
57259d709d503bab6e2b61931737e662dd293b40578ccornelius                    }
57359d709d503bab6e2b61931737e662dd293b40578ccornelius                    pc = uc;
57459d709d503bab6e2b61931737e662dd293b40578ccornelius                }
57559d709d503bab6e2b61931737e662dd293b40578ccornelius
57659d709d503bab6e2b61931737e662dd293b40578ccornelius                // Bump the word count if there wasn't already one
57759d709d503bab6e2b61931737e662dd293b40578ccornelius                if (wordLength <= 0) {
57859d709d503bab6e2b61931737e662dd293b40578ccornelius                    wordsFound += 1;
57959d709d503bab6e2b61931737e662dd293b40578ccornelius                }
58059d709d503bab6e2b61931737e662dd293b40578ccornelius
58159d709d503bab6e2b61931737e662dd293b40578ccornelius                // Update the length with the passed-over characters
58259d709d503bab6e2b61931737e662dd293b40578ccornelius                wordLength += chars;
58359d709d503bab6e2b61931737e662dd293b40578ccornelius            }
58459d709d503bab6e2b61931737e662dd293b40578ccornelius            else {
58559d709d503bab6e2b61931737e662dd293b40578ccornelius                // Back up to where we were for next iteration
58659d709d503bab6e2b61931737e662dd293b40578ccornelius                utext_setNativeIndex(text, current+wordLength);
58759d709d503bab6e2b61931737e662dd293b40578ccornelius            }
58859d709d503bab6e2b61931737e662dd293b40578ccornelius        }
58959d709d503bab6e2b61931737e662dd293b40578ccornelius
59059d709d503bab6e2b61931737e662dd293b40578ccornelius        // Never stop before a combining mark.
59159d709d503bab6e2b61931737e662dd293b40578ccornelius        int32_t currPos;
59259d709d503bab6e2b61931737e662dd293b40578ccornelius        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
59359d709d503bab6e2b61931737e662dd293b40578ccornelius            utext_next32(text);
59459d709d503bab6e2b61931737e662dd293b40578ccornelius            wordLength += (int32_t)utext_getNativeIndex(text) - currPos;
59559d709d503bab6e2b61931737e662dd293b40578ccornelius        }
59659d709d503bab6e2b61931737e662dd293b40578ccornelius
59759d709d503bab6e2b61931737e662dd293b40578ccornelius        // Look ahead for possible suffixes if a dictionary word does not follow.
59859d709d503bab6e2b61931737e662dd293b40578ccornelius        // We do this in code rather than using a rule so that the heuristic
59959d709d503bab6e2b61931737e662dd293b40578ccornelius        // resynch continues to function. For example, one of the suffix characters
60059d709d503bab6e2b61931737e662dd293b40578ccornelius        // could be a typo in the middle of a word.
60159d709d503bab6e2b61931737e662dd293b40578ccornelius        // NOT CURRENTLY APPLICABLE TO LAO
60259d709d503bab6e2b61931737e662dd293b40578ccornelius
60359d709d503bab6e2b61931737e662dd293b40578ccornelius        // Did we find a word on this iteration? If so, push it on the break stack
60459d709d503bab6e2b61931737e662dd293b40578ccornelius        if (wordLength > 0) {
60559d709d503bab6e2b61931737e662dd293b40578ccornelius            foundBreaks.push((current+wordLength), status);
60659d709d503bab6e2b61931737e662dd293b40578ccornelius        }
60759d709d503bab6e2b61931737e662dd293b40578ccornelius    }
60859d709d503bab6e2b61931737e662dd293b40578ccornelius
60959d709d503bab6e2b61931737e662dd293b40578ccornelius    // Don't return a break for the end of the dictionary range if there is one there.
61059d709d503bab6e2b61931737e662dd293b40578ccornelius    if (foundBreaks.peeki() >= rangeEnd) {
61159d709d503bab6e2b61931737e662dd293b40578ccornelius        (void) foundBreaks.popi();
61259d709d503bab6e2b61931737e662dd293b40578ccornelius        wordsFound -= 1;
61359d709d503bab6e2b61931737e662dd293b40578ccornelius    }
61459d709d503bab6e2b61931737e662dd293b40578ccornelius
61559d709d503bab6e2b61931737e662dd293b40578ccornelius    return wordsFound;
61659d709d503bab6e2b61931737e662dd293b40578ccornelius}
61759d709d503bab6e2b61931737e662dd293b40578ccornelius
61859d709d503bab6e2b61931737e662dd293b40578ccornelius/*
61959d709d503bab6e2b61931737e662dd293b40578ccornelius ******************************************************************
62059d709d503bab6e2b61931737e662dd293b40578ccornelius * KhmerBreakEngine
62159d709d503bab6e2b61931737e662dd293b40578ccornelius */
62259d709d503bab6e2b61931737e662dd293b40578ccornelius
623b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// How many words in a row are "good enough"?
624b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#define KHMER_LOOKAHEAD 3
625b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
626b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// Will not combine a non-word with a preceding dictionary word longer than this
627b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#define KHMER_ROOT_COMBINE_THRESHOLD 3
628b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
629b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// Will not combine a non-word that shares at least this much prefix with a
630b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// dictionary word, with a preceding word
631b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#define KHMER_PREFIX_COMBINE_THRESHOLD 3
632b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
633b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// Minimum word size
634b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#define KHMER_MIN_WORD 2
635b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
636b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// Minimum number of characters for two words
637b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#define KHMER_MIN_WORD_SPAN (KHMER_MIN_WORD * 2)
638b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
63954dcd9b6a06071f647dac967e9e267abb9410720Craig CorneliusKhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
64054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    : DictionaryBreakEngine((1 << UBRK_WORD) | (1 << UBRK_LINE)),
641b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho      fDictionary(adoptDictionary)
642b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho{
643b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status);
644b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    if (U_SUCCESS(status)) {
645b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        setCharacters(fKhmerWordSet);
646b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    }
647b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
648b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    fMarkSet.add(0x0020);
649b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    fEndWordSet = fKhmerWordSet;
650b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    fBeginWordSet.add(0x1780, 0x17B3);
651b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    //fBeginWordSet.add(0x17A3, 0x17A4);      // deprecated vowels
652b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    //fEndWordSet.remove(0x17A5, 0x17A9);     // Khmer independent vowels that can't end a word
653b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    //fEndWordSet.remove(0x17B2);             // Khmer independent vowel that can't end a word
654b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    fEndWordSet.remove(0x17D2);             // KHMER SIGN COENG that combines some following characters
655b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    //fEndWordSet.remove(0x17B6, 0x17C5);     // Remove dependent vowels
656b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//    fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
657b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//    fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
658b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//    fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
659b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//    fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
660b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//    fSuffixSet.add(THAI_PAIYANNOI);
661b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//    fSuffixSet.add(THAI_MAIYAMOK);
662b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
663b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    // Compact for caching.
664b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    fMarkSet.compact();
665b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    fEndWordSet.compact();
666b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    fBeginWordSet.compact();
667b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//    fSuffixSet.compact();
668b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho}
669b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
670b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoKhmerBreakEngine::~KhmerBreakEngine() {
671b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    delete fDictionary;
672b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho}
673b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
674b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoint32_t
675b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoKhmerBreakEngine::divideUpDictionaryRange( UText *text,
676b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                                                int32_t rangeStart,
677b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                                                int32_t rangeEnd,
678b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                                                UStack &foundBreaks ) const {
679b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
680b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        return 0;       // Not enough characters for two words
681b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    }
682b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
683b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    uint32_t wordsFound = 0;
684b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    int32_t wordLength;
685b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    int32_t current;
686b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    UErrorCode status = U_ZERO_ERROR;
687b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    PossibleWord words[KHMER_LOOKAHEAD];
688b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    UChar32 uc;
689b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
690b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    utext_setNativeIndex(text, rangeStart);
691b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
692b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
693b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        wordLength = 0;
694b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
695b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        // Look for candidate words at the current position
696b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        int candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
697b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
698b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        // If we found exactly one, use that
699b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        if (candidates == 1) {
700b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            wordLength = words[wordsFound%KHMER_LOOKAHEAD].acceptMarked(text);
701b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            wordsFound += 1;
702b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        }
703b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
704b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        // If there was more than one, see which one can take us forward the most words
705b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        else if (candidates > 1) {
706b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            // If we're already at the end of the range, we're done
707b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
708b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                goto foundBest;
709b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            }
710b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            do {
711b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                int wordsMatched = 1;
71254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius                if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
713b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                    if (wordsMatched < 2) {
714b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                        // Followed by another dictionary word; mark first word as a good candidate
71554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius                        words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
716b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                        wordsMatched = 2;
717b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                    }
718b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
719b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                    // If we're already at the end of the range, we're done
720b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
721b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                        goto foundBest;
722b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                    }
723b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
724b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                    // See if any of the possible second words is followed by a third word
725b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                    do {
726b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                        // If we find a third word, stop right away
72754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius                        if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
72854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius                            words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
729b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                            goto foundBest;
730b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                        }
731b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                    }
73254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius                    while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text));
733b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                }
734b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            }
73554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius            while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text));
736b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehofoundBest:
73754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius            wordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
738b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            wordsFound += 1;
739b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        }
740b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
741b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        // We come here after having either found a word or not. We look ahead to the
742b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        // next word. If it's not a dictionary word, we will combine it with the word we
743b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        // just found (if there is one), but only if the preceding word does not exceed
744b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        // the threshold.
745b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        // The text iterator should now be positioned at the end of the word we found.
746b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < KHMER_ROOT_COMBINE_THRESHOLD) {
747b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            // if it is a dictionary word, do nothing. If it isn't, then if there is
748b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            // no preceding word, or the non-word shares less than the minimum threshold
749b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            // of characters with a dictionary word, then scan to resynchronize
75054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius            if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
751b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                  && (wordLength == 0
75254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius                      || words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
753b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                // Look for a plausible word boundary
754b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                //TODO: This section will need a rework for UText.
755b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                int32_t remaining = rangeEnd - (current+wordLength);
756b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                UChar32 pc = utext_current32(text);
757b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                int32_t chars = 0;
758b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                for (;;) {
759b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                    utext_next32(text);
760b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                    uc = utext_current32(text);
761b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                    // TODO: Here we're counting on the fact that the SA languages are all
762b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                    // in the BMP. This should get fixed with the UText rework.
763b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                    chars += 1;
764b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                    if (--remaining <= 0) {
765b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                        break;
766b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                    }
767b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
768b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                        // Maybe. See if it's in the dictionary.
76954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius                        int candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
770b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                        utext_setNativeIndex(text, current+wordLength+chars);
771b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                        if (candidates > 0) {
772b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                            break;
773b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                        }
774b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                    }
775b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                    pc = uc;
776b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                }
777b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
778b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                // Bump the word count if there wasn't already one
779b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                if (wordLength <= 0) {
780b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                    wordsFound += 1;
781b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                }
782b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
783b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                // Update the length with the passed-over characters
784b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                wordLength += chars;
785b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            }
786b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            else {
787b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                // Back up to where we were for next iteration
788b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                utext_setNativeIndex(text, current+wordLength);
789b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            }
790b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        }
791b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
792b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        // Never stop before a combining mark.
793b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        int32_t currPos;
794b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
795b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            utext_next32(text);
796b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            wordLength += (int32_t)utext_getNativeIndex(text) - currPos;
797b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        }
798b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
799b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        // Look ahead for possible suffixes if a dictionary word does not follow.
800b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        // We do this in code rather than using a rule so that the heuristic
801b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        // resynch continues to function. For example, one of the suffix characters
802b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        // could be a typo in the middle of a word.
803b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
804b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//            if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
805b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//                && fSuffixSet.contains(uc = utext_current32(text))) {
806b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//                if (uc == KHMER_PAIYANNOI) {
807b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//                    if (!fSuffixSet.contains(utext_previous32(text))) {
808b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//                        // Skip over previous end and PAIYANNOI
809b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//                        utext_next32(text);
810b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//                        utext_next32(text);
811b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//                        wordLength += 1;            // Add PAIYANNOI to word
812b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//                        uc = utext_current32(text);     // Fetch next character
813b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//                    }
814b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//                    else {
815b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//                        // Restore prior position
816b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//                        utext_next32(text);
817b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//                    }
818b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//                }
819b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//                if (uc == KHMER_MAIYAMOK) {
820b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//                    if (utext_previous32(text) != KHMER_MAIYAMOK) {
821b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//                        // Skip over previous end and MAIYAMOK
822b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//                        utext_next32(text);
823b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//                        utext_next32(text);
824b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//                        wordLength += 1;            // Add MAIYAMOK to word
825b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//                    }
826b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//                    else {
827b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//                        // Restore prior position
828b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//                        utext_next32(text);
829b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//                    }
830b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//                }
831b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//            }
832b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//            else {
833b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//                utext_setNativeIndex(text, current+wordLength);
834b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//            }
835b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//        }
836b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
837ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Did we find a word on this iteration? If so, push it on the break stack
838ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if (wordLength > 0) {
839ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            foundBreaks.push((current+wordLength), status);
840ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
841ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
842ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
843ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // Don't return a break for the end of the dictionary range if there is one there.
844ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (foundBreaks.peeki() >= rangeEnd) {
845ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        (void) foundBreaks.popi();
846ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        wordsFound -= 1;
847ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
848ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
849ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return wordsFound;
850ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
851ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
85254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius#if !UCONFIG_NO_NORMALIZATION
85354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius/*
85454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius ******************************************************************
85554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * CjkBreakEngine
85654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius */
85754dcd9b6a06071f647dac967e9e267abb9410720Craig Corneliusstatic const uint32_t kuint32max = 0xFFFFFFFF;
85854dcd9b6a06071f647dac967e9e267abb9410720Craig CorneliusCjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status)
85954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius: DictionaryBreakEngine(1 << UBRK_WORD), fDictionary(adoptDictionary) {
86054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    // Korean dictionary only includes Hangul syllables
86154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status);
86254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);
86354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status);
86454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status);
86554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius
86654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    if (U_SUCCESS(status)) {
86754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        // handle Korean and Japanese/Chinese using different dictionaries
86854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        if (type == kKorean) {
86954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius            setCharacters(fHangulWordSet);
87054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        } else { //Chinese and Japanese
87154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius            UnicodeSet cjSet;
87254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius            cjSet.addAll(fHanWordSet);
87354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius            cjSet.addAll(fKatakanaWordSet);
87454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius            cjSet.addAll(fHiraganaWordSet);
87559d709d503bab6e2b61931737e662dd293b40578ccornelius            cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
87659d709d503bab6e2b61931737e662dd293b40578ccornelius            cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK
87754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius            setCharacters(cjSet);
87854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        }
87954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    }
88054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius}
88154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius
88254dcd9b6a06071f647dac967e9e267abb9410720Craig CorneliusCjkBreakEngine::~CjkBreakEngine(){
88354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    delete fDictionary;
88454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius}
88554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius
88654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius// The katakanaCost values below are based on the length frequencies of all
88754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius// katakana phrases in the dictionary
88854dcd9b6a06071f647dac967e9e267abb9410720Craig Corneliusstatic const int kMaxKatakanaLength = 8;
88954dcd9b6a06071f647dac967e9e267abb9410720Craig Corneliusstatic const int kMaxKatakanaGroupLength = 20;
89054dcd9b6a06071f647dac967e9e267abb9410720Craig Corneliusstatic const uint32_t maxSnlp = 255;
89154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius
89254dcd9b6a06071f647dac967e9e267abb9410720Craig Corneliusstatic inline uint32_t getKatakanaCost(int wordLength){
89354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    //TODO: fill array with actual values from dictionary!
89454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    static const uint32_t katakanaCost[kMaxKatakanaLength + 1]
89554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius                                       = {8192, 984, 408, 240, 204, 252, 300, 372, 480};
89654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    return (wordLength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordLength];
89754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius}
89854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius
89954dcd9b6a06071f647dac967e9e267abb9410720Craig Corneliusstatic inline bool isKatakana(uint16_t value) {
90054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    return (value >= 0x30A1u && value <= 0x30FEu && value != 0x30FBu) ||
90154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius            (value >= 0xFF66u && value <= 0xFF9fu);
90254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius}
90354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius
90454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius// A very simple helper class to streamline the buffer handling in
90554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius// divideUpDictionaryRange.
90654dcd9b6a06071f647dac967e9e267abb9410720Craig Corneliustemplate<class T, size_t N>
90754dcd9b6a06071f647dac967e9e267abb9410720Craig Corneliusclass AutoBuffer {
90854dcd9b6a06071f647dac967e9e267abb9410720Craig Corneliuspublic:
90954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    AutoBuffer(size_t size) : buffer(stackBuffer), capacity(N) {
91054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        if (size > N) {
91154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius            buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size));
91254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius            capacity = size;
91354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        }
91454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    }
91554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    ~AutoBuffer() {
91654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        if (buffer != stackBuffer)
91754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius            uprv_free(buffer);
91854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    }
91954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius
92054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    T* elems() {
92154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        return buffer;
92254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    }
92354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius
92454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    const T& operator[] (size_t i) const {
92554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        return buffer[i];
92654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    }
92754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius
92854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    T& operator[] (size_t i) {
92954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        return buffer[i];
93054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    }
93154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius
93254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    // resize without copy
93354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    void resize(size_t size) {
93454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        if (size <= capacity)
93554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius            return;
93654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        if (buffer != stackBuffer)
93754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius            uprv_free(buffer);
93854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size));
93954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        capacity = size;
94054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    }
94154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius
94254dcd9b6a06071f647dac967e9e267abb9410720Craig Corneliusprivate:
94354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    T stackBuffer[N];
94454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    T* buffer;
94554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    AutoBuffer();
94654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    size_t capacity;
94754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius};
94854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius
94954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius
95054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius/*
95154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * @param text A UText representing the text
95254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * @param rangeStart The start of the range of dictionary characters
95354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * @param rangeEnd The end of the range of dictionary characters
95454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * @param foundBreaks Output of C array of int32_t break positions, or 0
95554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * @return The number of breaks found
95654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius */
95754dcd9b6a06071f647dac967e9e267abb9410720Craig Corneliusint32_t
95854dcd9b6a06071f647dac967e9e267abb9410720Craig CorneliusCjkBreakEngine::divideUpDictionaryRange( UText *text,
95954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        int32_t rangeStart,
96054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        int32_t rangeEnd,
96154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        UStack &foundBreaks ) const {
96254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    if (rangeStart >= rangeEnd) {
96354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        return 0;
96454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    }
96554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius
96654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    const size_t defaultInputLength = 80;
96754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    size_t inputLength = rangeEnd - rangeStart;
96854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    // TODO: Replace by UnicodeString.
96954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    AutoBuffer<UChar, defaultInputLength> charString(inputLength);
97054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius
97154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    // Normalize the input string and put it in normalizedText.
97254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    // The map from the indices of the normalized input to the raw
97354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    // input is kept in charPositions.
97454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    UErrorCode status = U_ZERO_ERROR;
97554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    utext_extract(text, rangeStart, rangeEnd, charString.elems(), inputLength, &status);
97654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    if (U_FAILURE(status)) {
97754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        return 0;
97854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    }
97954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius
98054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    UnicodeString inputString(charString.elems(), inputLength);
98154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    // TODO: Use Normalizer2.
98254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    UNormalizationMode norm_mode = UNORM_NFKC;
98354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    UBool isNormalized =
98454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        Normalizer::quickCheck(inputString, norm_mode, status) == UNORM_YES ||
98554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        Normalizer::isNormalized(inputString, norm_mode, status);
98654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius
98754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    // TODO: Replace by UVector32.
98854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    AutoBuffer<int32_t, defaultInputLength> charPositions(inputLength + 1);
98954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    int numChars = 0;
99054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    UText normalizedText = UTEXT_INITIALIZER;
99154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    // Needs to be declared here because normalizedText holds onto its buffer.
99254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    UnicodeString normalizedString;
99354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    if (isNormalized) {
99454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        int32_t index = 0;
99554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        charPositions[0] = 0;
99654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        while(index < inputString.length()) {
99754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius            index = inputString.moveIndex32(index, 1);
99854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius            charPositions[++numChars] = index;
99954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        }
100054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        utext_openUnicodeString(&normalizedText, &inputString, &status);
100154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    }
100254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    else {
100354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        Normalizer::normalize(inputString, norm_mode, 0, normalizedString, status);
100454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        if (U_FAILURE(status)) {
100554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius            return 0;
100654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        }
100754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        charPositions.resize(normalizedString.length() + 1);
100854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        Normalizer normalizer(charString.elems(), inputLength, norm_mode);
100954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        int32_t index = 0;
101054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        charPositions[0] = 0;
101154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        while(index < normalizer.endIndex()){
101254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius            /* UChar32 uc = */ normalizer.next();
101354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius            charPositions[++numChars] = index = normalizer.getIndex();
101454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        }
101554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        utext_openUnicodeString(&normalizedText, &normalizedString, &status);
101654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    }
101754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius
101854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    if (U_FAILURE(status)) {
101954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        return 0;
102054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    }
102154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius
102254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    // From this point on, all the indices refer to the indices of
102354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    // the normalized input string.
102454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius
102554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    // bestSnlp[i] is the snlp of the best segmentation of the first i
102654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    // characters in the range to be matched.
102754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    // TODO: Replace by UVector32.
102854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    AutoBuffer<uint32_t, defaultInputLength> bestSnlp(numChars + 1);
102954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    bestSnlp[0] = 0;
103054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    for(int i = 1; i <= numChars; i++) {
103154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        bestSnlp[i] = kuint32max;
103254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    }
103354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius
103454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    // prev[i] is the index of the last CJK character in the previous word in
103554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    // the best segmentation of the first i characters.
103654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    // TODO: Replace by UVector32.
103754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    AutoBuffer<int, defaultInputLength> prev(numChars + 1);
103854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    for(int i = 0; i <= numChars; i++){
103954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        prev[i] = -1;
104054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    }
104154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius
104254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    const size_t maxWordSize = 20;
104354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    // TODO: Replace both with UVector32.
104454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    AutoBuffer<int32_t, maxWordSize> values(numChars);
104554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    AutoBuffer<int32_t, maxWordSize> lengths(numChars);
104654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius
104754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    // Dynamic programming to find the best segmentation.
104854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    bool is_prev_katakana = false;
104954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    for (int32_t i = 0; i < numChars; ++i) {
105054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        //utext_setNativeIndex(text, rangeStart + i);
105154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        utext_setNativeIndex(&normalizedText, i);
105254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        if (bestSnlp[i] == kuint32max)
105354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius            continue;
105454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius
105554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        int32_t count;
105654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        // limit maximum word length matched to size of current substring
105754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        int32_t maxSearchLength = (i + maxWordSize < (size_t) numChars)? maxWordSize : (numChars - i);
105854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius
105954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        fDictionary->matches(&normalizedText, maxSearchLength, lengths.elems(), count, maxSearchLength, values.elems());
106054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius
106154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        // if there are no single character matches found in the dictionary
106254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        // starting with this charcter, treat character as a 1-character word
106354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        // with the highest value possible, i.e. the least likely to occur.
106454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        // Exclude Korean characters from this treatment, as they should be left
106554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        // together by default.
106654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        if((count == 0 || lengths[0] != 1) &&
106754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius                !fHangulWordSet.contains(utext_current32(&normalizedText))) {
106854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius            values[count] = maxSnlp;
106954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius            lengths[count++] = 1;
107054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        }
107154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius
107254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        for (int j = 0; j < count; j++) {
107354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius            uint32_t newSnlp = bestSnlp[i] + values[j];
107454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius            if (newSnlp < bestSnlp[lengths[j] + i]) {
107554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius                bestSnlp[lengths[j] + i] = newSnlp;
107654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius                prev[lengths[j] + i] = i;
107754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius            }
107854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        }
107954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius
108054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        // In Japanese,
108154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        // Katakana word in single character is pretty rare. So we apply
108254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        // the following heuristic to Katakana: any continuous run of Katakana
108354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        // characters is considered a candidate word with a default cost
108454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        // specified in the katakanaCost table according to its length.
108554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        //utext_setNativeIndex(text, rangeStart + i);
108654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        utext_setNativeIndex(&normalizedText, i);
108754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        bool is_katakana = isKatakana(utext_current32(&normalizedText));
108854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        if (!is_prev_katakana && is_katakana) {
108954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius            int j = i + 1;
109054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius            utext_next32(&normalizedText);
109154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius            // Find the end of the continuous run of Katakana characters
109254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius            while (j < numChars && (j - i) < kMaxKatakanaGroupLength &&
109354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius                    isKatakana(utext_current32(&normalizedText))) {
109454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius                utext_next32(&normalizedText);
109554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius                ++j;
109654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius            }
109754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius            if ((j - i) < kMaxKatakanaGroupLength) {
109854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius                uint32_t newSnlp = bestSnlp[i] + getKatakanaCost(j - i);
109954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius                if (newSnlp < bestSnlp[j]) {
110054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius                    bestSnlp[j] = newSnlp;
110154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius                    prev[j] = i;
110254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius                }
110354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius            }
110454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        }
110554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        is_prev_katakana = is_katakana;
110654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    }
110754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius
110854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    // Start pushing the optimal offset index into t_boundary (t for tentative).
110954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    // prev[numChars] is guaranteed to be meaningful.
111054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    // We'll first push in the reverse order, i.e.,
111154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    // t_boundary[0] = numChars, and afterwards do a swap.
111254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    // TODO: Replace by UVector32.
111354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    AutoBuffer<int, maxWordSize> t_boundary(numChars + 1);
111454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius
111554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    int numBreaks = 0;
111654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    // No segmentation found, set boundary to end of range
111754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    if (bestSnlp[numChars] == kuint32max) {
111854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        t_boundary[numBreaks++] = numChars;
111954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    } else {
112054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        for (int i = numChars; i > 0; i = prev[i]) {
112154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius            t_boundary[numBreaks++] = i;
112254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        }
112354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        U_ASSERT(prev[t_boundary[numBreaks - 1]] == 0);
112454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    }
112554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius
112654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    // Reverse offset index in t_boundary.
112754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    // Don't add a break for the start of the dictionary range if there is one
112854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    // there already.
112954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) {
113054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        t_boundary[numBreaks++] = 0;
113154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    }
113254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius
113354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    // Now that we're done, convert positions in t_bdry[] (indices in
113454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    // the normalized input string) back to indices in the raw input string
113554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    // while reversing t_bdry and pushing values to foundBreaks.
113654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    for (int i = numBreaks-1; i >= 0; i--) {
113754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius        foundBreaks.push(charPositions[t_boundary[i]] + rangeStart, status);
113854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    }
113954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius
114054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    utext_close(&normalizedText);
114154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    return numBreaks;
114254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius}
114354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius#endif
114454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius
1145ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_NAMESPACE_END
1146ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
1147ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
114854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius
1149