1ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/**
2ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *******************************************************************************
3b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Copyright (C) 2006-2008,2011, International Business Machines Corporation   *
4b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * and others. All Rights Reserved.                                            *
5ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *******************************************************************************
6ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */
7ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
8ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/utypes.h"
9ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
10ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#if !UCONFIG_NO_BREAK_ITERATION
11ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
12ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "brkeng.h"
13ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "dictbe.h"
14ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/uniset.h"
15ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/chariter.h"
16ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/ubrk.h"
17ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "uvector.h"
18ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "triedict.h"
19ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
20ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_NAMESPACE_BEGIN
21ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
22ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*
23ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ******************************************************************
24ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */
25ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
26ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*DictionaryBreakEngine::DictionaryBreakEngine() {
27ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fTypes = 0;
28ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}*/
29ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
30ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruDictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes) {
31ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fTypes = breakTypes;
32ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
33ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
34ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruDictionaryBreakEngine::~DictionaryBreakEngine() {
35ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
36ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
37ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUBool
38ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruDictionaryBreakEngine::handles(UChar32 c, int32_t breakType) const {
39ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)
40ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            && fSet.contains(c));
41ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
42ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
43ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t
44ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruDictionaryBreakEngine::findBreaks( UText *text,
45ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                 int32_t startPos,
46ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                 int32_t endPos,
47ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                 UBool reverse,
48ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                 int32_t breakType,
49ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                 UStack &foundBreaks ) const {
50ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t result = 0;
51ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
52ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // Find the span of characters included in the set.
53ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t start = (int32_t)utext_getNativeIndex(text);
54ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t current;
55ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t rangeStart;
56ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t rangeEnd;
57ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UChar32 c = utext_current32(text);
58ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (reverse) {
59ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        UBool   isDict = fSet.contains(c);
60ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        while((current = (int32_t)utext_getNativeIndex(text)) > startPos && isDict) {
61ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            c = utext_previous32(text);
62ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            isDict = fSet.contains(c);
63ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
64ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        rangeStart = (current < startPos) ? startPos : current+(isDict ? 0 : 1);
65ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        rangeEnd = start + 1;
66ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
67ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    else {
68ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) {
69ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            utext_next32(text);         // TODO:  recast loop for postincrement
70ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            c = utext_current32(text);
71ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
72ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        rangeStart = start;
73ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        rangeEnd = current;
74ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
75ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)) {
76ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
77ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        utext_setNativeIndex(text, current);
78ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
79ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
80ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return result;
81ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
82ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
83ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruvoid
84ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruDictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {
85ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fSet = set;
86ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // Compact for caching
87ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fSet.compact();
88ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
89ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
90ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*void
91ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruDictionaryBreakEngine::setBreakTypes( uint32_t breakTypes ) {
92ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fTypes = breakTypes;
93ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}*/
94ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
95ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*
96ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ******************************************************************
97ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */
98ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
99ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
100ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Helper class for improving readability of the Thai word break
101ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// algorithm. The implementation is completely inline.
102ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
103ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// List size, limited by the maximum number of words in the dictionary
104ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// that form a nested sequence.
105ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define POSSIBLE_WORD_LIST_MAX 20
106ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
107ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruclass PossibleWord {
108ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru private:
109ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  // list of word candidate lengths, in increasing length order
110ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  int32_t   lengths[POSSIBLE_WORD_LIST_MAX];
111ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  int       count;      // Count of candidates
112ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  int32_t   prefix;     // The longest match with a dictionary word
113ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  int32_t   offset;     // Offset in the text of these candidates
114ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  int       mark;       // The preferred candidate's offset
115ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  int       current;    // The candidate we're currently looking at
116ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
117ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru public:
118ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  PossibleWord();
119ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  ~PossibleWord();
120ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
121ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  // Fill the list of candidates if needed, select the longest, and return the number found
122ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  int       candidates( UText *text, const TrieWordDictionary *dict, int32_t rangeEnd );
123ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
124ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  // Select the currently marked candidate, point after it in the text, and invalidate self
125ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  int32_t   acceptMarked( UText *text );
126ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
127ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  // Back up from the current candidate to the next shorter one; return TRUE if that exists
128ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  // and point the text after it
129ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  UBool     backUp( UText *text );
130ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
131ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  // Return the longest prefix this candidate location shares with a dictionary word
132ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  int32_t   longestPrefix();
133ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
134ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  // Mark the current candidate as the one we like
135ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru  void      markCurrent();
136ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru};
137ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
138ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruinline
139ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruPossibleWord::PossibleWord() {
140ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    offset = -1;
141ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
142ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
143ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruinline
144ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruPossibleWord::~PossibleWord() {
145ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
146ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
147ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruinline int
148ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruPossibleWord::candidates( UText *text, const TrieWordDictionary *dict, int32_t rangeEnd ) {
149ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // TODO: If getIndex is too slow, use offset < 0 and add discardAll()
150ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t start = (int32_t)utext_getNativeIndex(text);
151ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (start != offset) {
152ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        offset = start;
153ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        prefix = dict->matches(text, rangeEnd-start, lengths, count, sizeof(lengths)/sizeof(lengths[0]));
154ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Dictionary leaves text after longest prefix, not longest word. Back up.
155ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if (count <= 0) {
156ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            utext_setNativeIndex(text, start);
157ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
158ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
159ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (count > 0) {
160ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        utext_setNativeIndex(text, start+lengths[count-1]);
161ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
162ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    current = count-1;
163ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    mark = current;
164ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return count;
165ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
166ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
167ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruinline int32_t
168ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruPossibleWord::acceptMarked( UText *text ) {
169ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    utext_setNativeIndex(text, offset + lengths[mark]);
170ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return lengths[mark];
171ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
172ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
173ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruinline UBool
174ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruPossibleWord::backUp( UText *text ) {
175ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (current > 0) {
176ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        utext_setNativeIndex(text, offset + lengths[--current]);
177ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return TRUE;
178ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
179ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return FALSE;
180ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
181ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
182ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruinline int32_t
183ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruPossibleWord::longestPrefix() {
184ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return prefix;
185ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
186ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
187ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruinline void
188ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruPossibleWord::markCurrent() {
189ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    mark = current;
190ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
191ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
192ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// How many words in a row are "good enough"?
193ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define THAI_LOOKAHEAD 3
194ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
195ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Will not combine a non-word with a preceding dictionary word longer than this
196ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define THAI_ROOT_COMBINE_THRESHOLD 3
197ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
198ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Will not combine a non-word that shares at least this much prefix with a
199ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// dictionary word, with a preceding word
200ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define THAI_PREFIX_COMBINE_THRESHOLD 3
201ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
// Elision character
203ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define THAI_PAIYANNOI 0x0E2F
204ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
205ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Repeat character
206ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define THAI_MAIYAMOK 0x0E46
207ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
208ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Minimum word size
209ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define THAI_MIN_WORD 2
210ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
211ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Minimum number of characters for two words
212ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define THAI_MIN_WORD_SPAN (THAI_MIN_WORD * 2)
213ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
214ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruThaiBreakEngine::ThaiBreakEngine(const TrieWordDictionary *adoptDictionary, UErrorCode &status)
215ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
216ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru      fDictionary(adoptDictionary)
217ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
218ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status);
219ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (U_SUCCESS(status)) {
220ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        setCharacters(fThaiWordSet);
221ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
222ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);
22385bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    fMarkSet.add(0x0020);
224ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fEndWordSet = fThaiWordSet;
225ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
226ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
227ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
228ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
229ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fSuffixSet.add(THAI_PAIYANNOI);
230ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fSuffixSet.add(THAI_MAIYAMOK);
231ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
232ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // Compact for caching.
233ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fMarkSet.compact();
234ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fEndWordSet.compact();
235ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fBeginWordSet.compact();
236ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fSuffixSet.compact();
237ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
238ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
239ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruThaiBreakEngine::~ThaiBreakEngine() {
240ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    delete fDictionary;
241ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
242ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
243ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t
244ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruThaiBreakEngine::divideUpDictionaryRange( UText *text,
245ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                                int32_t rangeStart,
246ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                                int32_t rangeEnd,
247ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                                UStack &foundBreaks ) const {
248ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if ((rangeEnd - rangeStart) < THAI_MIN_WORD_SPAN) {
249ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return 0;       // Not enough characters for two words
250ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
251ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
252ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    uint32_t wordsFound = 0;
253ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t wordLength;
254ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t current;
255ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UErrorCode status = U_ZERO_ERROR;
256ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    PossibleWord words[THAI_LOOKAHEAD];
257ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UChar32 uc;
258ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
259ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    utext_setNativeIndex(text, rangeStart);
260ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
261ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
262ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        wordLength = 0;
263ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
264ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Look for candidate words at the current position
265ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        int candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
266ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
267ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // If we found exactly one, use that
268ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if (candidates == 1) {
269ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            wordLength = words[wordsFound%THAI_LOOKAHEAD].acceptMarked(text);
270ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            wordsFound += 1;
271ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
272ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
273ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // If there was more than one, see which one can take us forward the most words
274ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        else if (candidates > 1) {
275ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // If we're already at the end of the range, we're done
276ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
277ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                goto foundBest;
278ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
279ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            do {
280ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                int wordsMatched = 1;
281ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if (words[(wordsFound+1)%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
282ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if (wordsMatched < 2) {
283ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        // Followed by another dictionary word; mark first word as a good candidate
284ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        words[wordsFound%THAI_LOOKAHEAD].markCurrent();
285ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        wordsMatched = 2;
286ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
287ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
288ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // If we're already at the end of the range, we're done
289ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
290ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        goto foundBest;
291ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
292ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
293ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // See if any of the possible second words is followed by a third word
294ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    do {
295ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        // If we find a third word, stop right away
296ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        if (words[(wordsFound+2)%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
297ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                            words[wordsFound%THAI_LOOKAHEAD].markCurrent();
298ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                            goto foundBest;
299ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        }
300ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
301ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    while (words[(wordsFound+1)%THAI_LOOKAHEAD].backUp(text));
302ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
303ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
304ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            while (words[wordsFound%THAI_LOOKAHEAD].backUp(text));
305ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QuerufoundBest:
306ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            wordLength = words[wordsFound%THAI_LOOKAHEAD].acceptMarked(text);
307ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            wordsFound += 1;
308ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
309ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
310ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // We come here after having either found a word or not. We look ahead to the
        // next word. If it's not a dictionary word, we will combine it with the word we
312ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // just found (if there is one), but only if the preceding word does not exceed
313ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // the threshold.
314ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // The text iterator should now be positioned at the end of the word we found.
315ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < THAI_ROOT_COMBINE_THRESHOLD) {
316ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // if it is a dictionary word, do nothing. If it isn't, then if there is
317ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // no preceding word, or the non-word shares less than the minimum threshold
318ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // of characters with a dictionary word, then scan to resynchronize
319ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
320ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                  && (wordLength == 0
321ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                      || words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {
322ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Look for a plausible word boundary
323ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                //TODO: This section will need a rework for UText.
324ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                int32_t remaining = rangeEnd - (current+wordLength);
325ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                UChar32 pc = utext_current32(text);
326ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                int32_t chars = 0;
327ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                for (;;) {
328ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    utext_next32(text);
329ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    uc = utext_current32(text);
330ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // TODO: Here we're counting on the fact that the SA languages are all
331ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // in the BMP. This should get fixed with the UText rework.
332ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    chars += 1;
333ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if (--remaining <= 0) {
334ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        break;
335ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
336ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
337ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        // Maybe. See if it's in the dictionary.
338ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        // NOTE: In the original Apple code, checked that the next
339ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        // two characters after uc were not 0x0E4C THANTHAKHAT before
340ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        // checking the dictionary. That is just a performance filter,
341ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        // but it's not clear it's faster than checking the trie.
342ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        int candidates = words[(wordsFound+1)%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
343ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        utext_setNativeIndex(text, current+wordLength+chars);
344ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        if (candidates > 0) {
345ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                            break;
346ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        }
347ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
348ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    pc = uc;
349ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
350ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
351ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Bump the word count if there wasn't already one
352ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if (wordLength <= 0) {
353ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    wordsFound += 1;
354ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
355ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
356ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Update the length with the passed-over characters
357ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                wordLength += chars;
358ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
359ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            else {
360ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                // Back up to where we were for next iteration
361ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                utext_setNativeIndex(text, current+wordLength);
362ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
363ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
364ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
365ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Never stop before a combining mark.
366ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        int32_t currPos;
367ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
368ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            utext_next32(text);
369ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            wordLength += (int32_t)utext_getNativeIndex(text) - currPos;
370ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
371ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
372ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Look ahead for possible suffixes if a dictionary word does not follow.
373ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // We do this in code rather than using a rule so that the heuristic
374ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // resynch continues to function. For example, one of the suffix characters
375ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // could be a typo in the middle of a word.
376ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
377ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
378ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                && fSuffixSet.contains(uc = utext_current32(text))) {
379ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if (uc == THAI_PAIYANNOI) {
380ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if (!fSuffixSet.contains(utext_previous32(text))) {
381ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        // Skip over previous end and PAIYANNOI
382ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        utext_next32(text);
383ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        utext_next32(text);
384ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        wordLength += 1;            // Add PAIYANNOI to word
385ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        uc = utext_current32(text);     // Fetch next character
386ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
387ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    else {
388ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        // Restore prior position
389ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        utext_next32(text);
390ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
391ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
392ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if (uc == THAI_MAIYAMOK) {
393ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    if (utext_previous32(text) != THAI_MAIYAMOK) {
394ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        // Skip over previous end and MAIYAMOK
395ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        utext_next32(text);
396ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        utext_next32(text);
397ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        wordLength += 1;            // Add MAIYAMOK to word
398ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
399ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    else {
400ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        // Restore prior position
401ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                        utext_next32(text);
402ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    }
403ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
404ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
405ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            else {
406ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                utext_setNativeIndex(text, current+wordLength);
407ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
408ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
409b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
410b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        // Did we find a word on this iteration? If so, push it on the break stack
411b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        if (wordLength > 0) {
412b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            foundBreaks.push((current+wordLength), status);
413b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        }
414b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    }
415b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
416b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    // Don't return a break for the end of the dictionary range if there is one there.
417b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    if (foundBreaks.peeki() >= rangeEnd) {
418b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        (void) foundBreaks.popi();
419b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        wordsFound -= 1;
420b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    }
421b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
422b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    return wordsFound;
423b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho}
424b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
// Tuning constants for KhmerBreakEngine::divideUpDictionaryRange.

// How many words in a row are "good enough"? This is also the size of the
// ring of PossibleWord slots used while looking ahead.
#define KHMER_LOOKAHEAD 3

// A non-dictionary run is combined with the preceding dictionary word only
// when that word is shorter than this.
#define KHMER_ROOT_COMBINE_THRESHOLD 3

// A non-word that shares at least this much prefix with a dictionary word is
// not combined with a preceding word.
#define KHMER_PREFIX_COMBINE_THRESHOLD 3

// Minimum word size
#define KHMER_MIN_WORD 2

// Minimum number of characters for two words; shorter ranges are not divided.
#define KHMER_MIN_WORD_SPAN (KHMER_MIN_WORD * 2)
440b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
441b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoKhmerBreakEngine::KhmerBreakEngine(const TrieWordDictionary *adoptDictionary, UErrorCode &status)
442b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
443b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho      fDictionary(adoptDictionary)
444b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho{
445b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status);
446b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    if (U_SUCCESS(status)) {
447b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        setCharacters(fKhmerWordSet);
448b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    }
449b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
450b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    fMarkSet.add(0x0020);
451b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    fEndWordSet = fKhmerWordSet;
452b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    fBeginWordSet.add(0x1780, 0x17B3);
453b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    //fBeginWordSet.add(0x17A3, 0x17A4);      // deprecated vowels
454b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    //fEndWordSet.remove(0x17A5, 0x17A9);     // Khmer independent vowels that can't end a word
455b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    //fEndWordSet.remove(0x17B2);             // Khmer independent vowel that can't end a word
456b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    fEndWordSet.remove(0x17D2);             // KHMER SIGN COENG that combines some following characters
457b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    //fEndWordSet.remove(0x17B6, 0x17C5);     // Remove dependent vowels
458b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//    fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
459b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//    fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
460b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//    fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
461b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//    fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
462b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//    fSuffixSet.add(THAI_PAIYANNOI);
463b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//    fSuffixSet.add(THAI_MAIYAMOK);
464b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
465b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    // Compact for caching.
466b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    fMarkSet.compact();
467b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    fEndWordSet.compact();
468b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    fBeginWordSet.compact();
469b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//    fSuffixSet.compact();
470b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho}
471b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
472b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoKhmerBreakEngine::~KhmerBreakEngine() {
473b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    delete fDictionary;
474b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho}
475b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
476b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoint32_t
477b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoKhmerBreakEngine::divideUpDictionaryRange( UText *text,
478b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                                                int32_t rangeStart,
479b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                                                int32_t rangeEnd,
480b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                                                UStack &foundBreaks ) const {
481b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
482b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        return 0;       // Not enough characters for two words
483b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    }
484b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
485b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    uint32_t wordsFound = 0;
486b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    int32_t wordLength;
487b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    int32_t current;
488b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    UErrorCode status = U_ZERO_ERROR;
489b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    PossibleWord words[KHMER_LOOKAHEAD];
490b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    UChar32 uc;
491b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
492b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    utext_setNativeIndex(text, rangeStart);
493b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
494b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
495b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        wordLength = 0;
496b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
497b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        // Look for candidate words at the current position
498b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        int candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
499b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
500b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        // If we found exactly one, use that
501b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        if (candidates == 1) {
502b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            wordLength = words[wordsFound%KHMER_LOOKAHEAD].acceptMarked(text);
503b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            wordsFound += 1;
504b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        }
505b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
506b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        // If there was more than one, see which one can take us forward the most words
507b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        else if (candidates > 1) {
508b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            // If we're already at the end of the range, we're done
509b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
510b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                goto foundBest;
511b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            }
512b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            do {
513b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                int wordsMatched = 1;
514b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                if (words[(wordsFound+1)%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
515b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                    if (wordsMatched < 2) {
516b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                        // Followed by another dictionary word; mark first word as a good candidate
517b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                        words[wordsFound%KHMER_LOOKAHEAD].markCurrent();
518b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                        wordsMatched = 2;
519b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                    }
520b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
521b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                    // If we're already at the end of the range, we're done
522b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
523b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                        goto foundBest;
524b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                    }
525b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
526b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                    // See if any of the possible second words is followed by a third word
527b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                    do {
528b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                        // If we find a third word, stop right away
529b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                        if (words[(wordsFound+2)%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
530b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                            words[wordsFound%KHMER_LOOKAHEAD].markCurrent();
531b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                            goto foundBest;
532b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                        }
533b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                    }
534b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                    while (words[(wordsFound+1)%KHMER_LOOKAHEAD].backUp(text));
535b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                }
536b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            }
537b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            while (words[wordsFound%KHMER_LOOKAHEAD].backUp(text));
538b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehofoundBest:
539b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            wordLength = words[wordsFound%KHMER_LOOKAHEAD].acceptMarked(text);
540b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            wordsFound += 1;
541b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        }
542b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
543b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        // We come here after having either found a word or not. We look ahead to the
544b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        // next word. If it's not a dictionary word, we will combine it with the word we
545b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        // just found (if there is one), but only if the preceding word does not exceed
546b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        // the threshold.
547b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        // The text iterator should now be positioned at the end of the word we found.
548b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < KHMER_ROOT_COMBINE_THRESHOLD) {
549b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            // if it is a dictionary word, do nothing. If it isn't, then if there is
550b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            // no preceding word, or the non-word shares less than the minimum threshold
551b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            // of characters with a dictionary word, then scan to resynchronize
552b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
553b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                  && (wordLength == 0
554b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                      || words[wordsFound%KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
555b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                // Look for a plausible word boundary
556b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                //TODO: This section will need a rework for UText.
557b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                int32_t remaining = rangeEnd - (current+wordLength);
558b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                UChar32 pc = utext_current32(text);
559b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                int32_t chars = 0;
560b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                for (;;) {
561b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                    utext_next32(text);
562b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                    uc = utext_current32(text);
563b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                    // TODO: Here we're counting on the fact that the SA languages are all
564b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                    // in the BMP. This should get fixed with the UText rework.
565b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                    chars += 1;
566b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                    if (--remaining <= 0) {
567b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                        break;
568b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                    }
569b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
570b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                        // Maybe. See if it's in the dictionary.
571b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                        int candidates = words[(wordsFound+1)%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
572b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                        utext_setNativeIndex(text, current+wordLength+chars);
573b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                        if (candidates > 0) {
574b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                            break;
575b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                        }
576b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                    }
577b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                    pc = uc;
578b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                }
579b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
580b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                // Bump the word count if there wasn't already one
581b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                if (wordLength <= 0) {
582b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                    wordsFound += 1;
583b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                }
584b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
585b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                // Update the length with the passed-over characters
586b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                wordLength += chars;
587b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            }
588b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            else {
589b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                // Back up to where we were for next iteration
590b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                utext_setNativeIndex(text, current+wordLength);
591b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            }
592b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        }
593b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
594b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        // Never stop before a combining mark.
595b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        int32_t currPos;
596b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
597b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            utext_next32(text);
598b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            wordLength += (int32_t)utext_getNativeIndex(text) - currPos;
599b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        }
600b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
601b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        // Look ahead for possible suffixes if a dictionary word does not follow.
602b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        // We do this in code rather than using a rule so that the heuristic
603b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        // resynch continues to function. For example, one of the suffix characters
604b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        // could be a typo in the middle of a word.
605b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
606b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//            if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
607b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//                && fSuffixSet.contains(uc = utext_current32(text))) {
608b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//                if (uc == KHMER_PAIYANNOI) {
609b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//                    if (!fSuffixSet.contains(utext_previous32(text))) {
610b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//                        // Skip over previous end and PAIYANNOI
611b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//                        utext_next32(text);
612b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//                        utext_next32(text);
613b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//                        wordLength += 1;            // Add PAIYANNOI to word
614b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//                        uc = utext_current32(text);     // Fetch next character
615b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//                    }
616b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//                    else {
617b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//                        // Restore prior position
618b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//                        utext_next32(text);
619b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//                    }
620b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//                }
621b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//                if (uc == KHMER_MAIYAMOK) {
622b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//                    if (utext_previous32(text) != KHMER_MAIYAMOK) {
623b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//                        // Skip over previous end and MAIYAMOK
624b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//                        utext_next32(text);
625b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//                        utext_next32(text);
626b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//                        wordLength += 1;            // Add MAIYAMOK to word
627b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//                    }
628b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//                    else {
629b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//                        // Restore prior position
630b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//                        utext_next32(text);
631b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//                    }
632b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//                }
633b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//            }
634b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//            else {
635b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//                utext_setNativeIndex(text, current+wordLength);
636b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//            }
637b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho//        }
638b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
639ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Did we find a word on this iteration? If so, push it on the break stack
640ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if (wordLength > 0) {
641ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            foundBreaks.push((current+wordLength), status);
642ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
643ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
644ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
645ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // Don't return a break for the end of the dictionary range if there is one there.
646ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (foundBreaks.peeki() >= rangeEnd) {
647ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        (void) foundBreaks.popi();
648ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        wordsFound -= 1;
649ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
650ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
651ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return wordsFound;
652ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
653ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
654ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_NAMESPACE_END
655ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
656ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
657