1/**
2 *******************************************************************************
3 * Copyright (C) 2006-2008, International Business Machines Corporation and others. *
4 * All Rights Reserved.                                                        *
5 *******************************************************************************
6 */
7
8#include "unicode/utypes.h"
9
10#if !UCONFIG_NO_BREAK_ITERATION
11
12#include "brkeng.h"
13#include "dictbe.h"
14#include "unicode/uniset.h"
15#include "unicode/chariter.h"
16#include "unicode/ubrk.h"
17#include "uvector.h"
18#include "triedict.h"
19
20U_NAMESPACE_BEGIN
21
22/*
23 ******************************************************************
24 */
25
26/*DictionaryBreakEngine::DictionaryBreakEngine() {
27    fTypes = 0;
28}*/
29
30DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes) {
31    fTypes = breakTypes;
32}
33
34DictionaryBreakEngine::~DictionaryBreakEngine() {
35}
36
37UBool
38DictionaryBreakEngine::handles(UChar32 c, int32_t breakType) const {
39    return (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)
40            && fSet.contains(c));
41}
42
43int32_t
44DictionaryBreakEngine::findBreaks( UText *text,
45                                 int32_t startPos,
46                                 int32_t endPos,
47                                 UBool reverse,
48                                 int32_t breakType,
49                                 UStack &foundBreaks ) const {
50    int32_t result = 0;
51
52    // Find the span of characters included in the set.
53    int32_t start = (int32_t)utext_getNativeIndex(text);
54    int32_t current;
55    int32_t rangeStart;
56    int32_t rangeEnd;
57    UChar32 c = utext_current32(text);
58    if (reverse) {
59        UBool   isDict = fSet.contains(c);
60        while((current = (int32_t)utext_getNativeIndex(text)) > startPos && isDict) {
61            c = utext_previous32(text);
62            isDict = fSet.contains(c);
63        }
64        rangeStart = (current < startPos) ? startPos : current+(isDict ? 0 : 1);
65        rangeEnd = start + 1;
66    }
67    else {
68        while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) {
69            utext_next32(text);         // TODO:  recast loop for postincrement
70            c = utext_current32(text);
71        }
72        rangeStart = start;
73        rangeEnd = current;
74    }
75    if (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)) {
76        result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
77        utext_setNativeIndex(text, current);
78    }
79
80    return result;
81}
82
83void
84DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {
85    fSet = set;
86    // Compact for caching
87    fSet.compact();
88}
89
90/*void
91DictionaryBreakEngine::setBreakTypes( uint32_t breakTypes ) {
92    fTypes = breakTypes;
93}*/
94
95/*
96 ******************************************************************
97 */
98
99
100// Helper class for improving readability of the Thai word break
101// algorithm. The implementation is completely inline.
102
103// List size, limited by the maximum number of words in the dictionary
104// that form a nested sequence.
105#define POSSIBLE_WORD_LIST_MAX 20
106
107class PossibleWord {
108 private:
109  // list of word candidate lengths, in increasing length order
110  int32_t   lengths[POSSIBLE_WORD_LIST_MAX];
111  int       count;      // Count of candidates
112  int32_t   prefix;     // The longest match with a dictionary word
113  int32_t   offset;     // Offset in the text of these candidates
114  int       mark;       // The preferred candidate's offset
115  int       current;    // The candidate we're currently looking at
116
117 public:
118  PossibleWord();
119  ~PossibleWord();
120
121  // Fill the list of candidates if needed, select the longest, and return the number found
122  int       candidates( UText *text, const TrieWordDictionary *dict, int32_t rangeEnd );
123
124  // Select the currently marked candidate, point after it in the text, and invalidate self
125  int32_t   acceptMarked( UText *text );
126
127  // Back up from the current candidate to the next shorter one; return TRUE if that exists
128  // and point the text after it
129  UBool     backUp( UText *text );
130
131  // Return the longest prefix this candidate location shares with a dictionary word
132  int32_t   longestPrefix();
133
134  // Mark the current candidate as the one we like
135  void      markCurrent();
136};
137
138inline
139PossibleWord::PossibleWord() {
140    offset = -1;
141}
142
143inline
144PossibleWord::~PossibleWord() {
145}
146
147inline int
148PossibleWord::candidates( UText *text, const TrieWordDictionary *dict, int32_t rangeEnd ) {
149    // TODO: If getIndex is too slow, use offset < 0 and add discardAll()
150    int32_t start = (int32_t)utext_getNativeIndex(text);
151    if (start != offset) {
152        offset = start;
153        prefix = dict->matches(text, rangeEnd-start, lengths, count, sizeof(lengths)/sizeof(lengths[0]));
154        // Dictionary leaves text after longest prefix, not longest word. Back up.
155        if (count <= 0) {
156            utext_setNativeIndex(text, start);
157        }
158    }
159    if (count > 0) {
160        utext_setNativeIndex(text, start+lengths[count-1]);
161    }
162    current = count-1;
163    mark = current;
164    return count;
165}
166
167inline int32_t
168PossibleWord::acceptMarked( UText *text ) {
169    utext_setNativeIndex(text, offset + lengths[mark]);
170    return lengths[mark];
171}
172
173inline UBool
174PossibleWord::backUp( UText *text ) {
175    if (current > 0) {
176        utext_setNativeIndex(text, offset + lengths[--current]);
177        return TRUE;
178    }
179    return FALSE;
180}
181
182inline int32_t
183PossibleWord::longestPrefix() {
184    return prefix;
185}
186
187inline void
188PossibleWord::markCurrent() {
189    mark = current;
190}
191
192// How many words in a row are "good enough"?
193#define THAI_LOOKAHEAD 3
194
195// Will not combine a non-word with a preceding dictionary word longer than this
196#define THAI_ROOT_COMBINE_THRESHOLD 3
197
198// Will not combine a non-word that shares at least this much prefix with a
199// dictionary word, with a preceding word
200#define THAI_PREFIX_COMBINE_THRESHOLD 3
201
202// Ellision character
203#define THAI_PAIYANNOI 0x0E2F
204
205// Repeat character
206#define THAI_MAIYAMOK 0x0E46
207
208// Minimum word size
209#define THAI_MIN_WORD 2
210
211// Minimum number of characters for two words
212#define THAI_MIN_WORD_SPAN (THAI_MIN_WORD * 2)
213
214ThaiBreakEngine::ThaiBreakEngine(const TrieWordDictionary *adoptDictionary, UErrorCode &status)
215    : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
216      fDictionary(adoptDictionary)
217{
218    fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status);
219    if (U_SUCCESS(status)) {
220        setCharacters(fThaiWordSet);
221    }
222    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);
223    fMarkSet.add(0x0020);
224    fEndWordSet = fThaiWordSet;
225    fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
226    fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
227    fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
228    fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
229    fSuffixSet.add(THAI_PAIYANNOI);
230    fSuffixSet.add(THAI_MAIYAMOK);
231
232    // Compact for caching.
233    fMarkSet.compact();
234    fEndWordSet.compact();
235    fBeginWordSet.compact();
236    fSuffixSet.compact();
237}
238
239ThaiBreakEngine::~ThaiBreakEngine() {
240    delete fDictionary;
241}
242
243int32_t
244ThaiBreakEngine::divideUpDictionaryRange( UText *text,
245                                                int32_t rangeStart,
246                                                int32_t rangeEnd,
247                                                UStack &foundBreaks ) const {
248    if ((rangeEnd - rangeStart) < THAI_MIN_WORD_SPAN) {
249        return 0;       // Not enough characters for two words
250    }
251
252    uint32_t wordsFound = 0;
253    int32_t wordLength;
254    int32_t current;
255    UErrorCode status = U_ZERO_ERROR;
256    PossibleWord words[THAI_LOOKAHEAD];
257    UChar32 uc;
258
259    utext_setNativeIndex(text, rangeStart);
260
261    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
262        wordLength = 0;
263
264        // Look for candidate words at the current position
265        int candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
266
267        // If we found exactly one, use that
268        if (candidates == 1) {
269            wordLength = words[wordsFound%THAI_LOOKAHEAD].acceptMarked(text);
270            wordsFound += 1;
271        }
272
273        // If there was more than one, see which one can take us forward the most words
274        else if (candidates > 1) {
275            // If we're already at the end of the range, we're done
276            if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
277                goto foundBest;
278            }
279            do {
280                int wordsMatched = 1;
281                if (words[(wordsFound+1)%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
282                    if (wordsMatched < 2) {
283                        // Followed by another dictionary word; mark first word as a good candidate
284                        words[wordsFound%THAI_LOOKAHEAD].markCurrent();
285                        wordsMatched = 2;
286                    }
287
288                    // If we're already at the end of the range, we're done
289                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
290                        goto foundBest;
291                    }
292
293                    // See if any of the possible second words is followed by a third word
294                    do {
295                        // If we find a third word, stop right away
296                        if (words[(wordsFound+2)%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
297                            words[wordsFound%THAI_LOOKAHEAD].markCurrent();
298                            goto foundBest;
299                        }
300                    }
301                    while (words[(wordsFound+1)%THAI_LOOKAHEAD].backUp(text));
302                }
303            }
304            while (words[wordsFound%THAI_LOOKAHEAD].backUp(text));
305foundBest:
306            wordLength = words[wordsFound%THAI_LOOKAHEAD].acceptMarked(text);
307            wordsFound += 1;
308        }
309
310        // We come here after having either found a word or not. We look ahead to the
311        // next word. If it's not a dictionary word, we will combine it withe the word we
312        // just found (if there is one), but only if the preceding word does not exceed
313        // the threshold.
314        // The text iterator should now be positioned at the end of the word we found.
315        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < THAI_ROOT_COMBINE_THRESHOLD) {
316            // if it is a dictionary word, do nothing. If it isn't, then if there is
317            // no preceding word, or the non-word shares less than the minimum threshold
318            // of characters with a dictionary word, then scan to resynchronize
319            if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
320                  && (wordLength == 0
321                      || words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {
322                // Look for a plausible word boundary
323                //TODO: This section will need a rework for UText.
324                int32_t remaining = rangeEnd - (current+wordLength);
325                UChar32 pc = utext_current32(text);
326                int32_t chars = 0;
327                for (;;) {
328                    utext_next32(text);
329                    uc = utext_current32(text);
330                    // TODO: Here we're counting on the fact that the SA languages are all
331                    // in the BMP. This should get fixed with the UText rework.
332                    chars += 1;
333                    if (--remaining <= 0) {
334                        break;
335                    }
336                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
337                        // Maybe. See if it's in the dictionary.
338                        // NOTE: In the original Apple code, checked that the next
339                        // two characters after uc were not 0x0E4C THANTHAKHAT before
340                        // checking the dictionary. That is just a performance filter,
341                        // but it's not clear it's faster than checking the trie.
342                        int candidates = words[(wordsFound+1)%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
343                        utext_setNativeIndex(text, current+wordLength+chars);
344                        if (candidates > 0) {
345                            break;
346                        }
347                    }
348                    pc = uc;
349                }
350
351                // Bump the word count if there wasn't already one
352                if (wordLength <= 0) {
353                    wordsFound += 1;
354                }
355
356                // Update the length with the passed-over characters
357                wordLength += chars;
358            }
359            else {
360                // Back up to where we were for next iteration
361                utext_setNativeIndex(text, current+wordLength);
362            }
363        }
364
365        // Never stop before a combining mark.
366        int32_t currPos;
367        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
368            utext_next32(text);
369            wordLength += (int32_t)utext_getNativeIndex(text) - currPos;
370        }
371
372        // Look ahead for possible suffixes if a dictionary word does not follow.
373        // We do this in code rather than using a rule so that the heuristic
374        // resynch continues to function. For example, one of the suffix characters
375        // could be a typo in the middle of a word.
376        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
377            if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
378                && fSuffixSet.contains(uc = utext_current32(text))) {
379                if (uc == THAI_PAIYANNOI) {
380                    if (!fSuffixSet.contains(utext_previous32(text))) {
381                        // Skip over previous end and PAIYANNOI
382                        utext_next32(text);
383                        utext_next32(text);
384                        wordLength += 1;            // Add PAIYANNOI to word
385                        uc = utext_current32(text);     // Fetch next character
386                    }
387                    else {
388                        // Restore prior position
389                        utext_next32(text);
390                    }
391                }
392                if (uc == THAI_MAIYAMOK) {
393                    if (utext_previous32(text) != THAI_MAIYAMOK) {
394                        // Skip over previous end and MAIYAMOK
395                        utext_next32(text);
396                        utext_next32(text);
397                        wordLength += 1;            // Add MAIYAMOK to word
398                    }
399                    else {
400                        // Restore prior position
401                        utext_next32(text);
402                    }
403                }
404            }
405            else {
406                utext_setNativeIndex(text, current+wordLength);
407            }
408        }
409
410        // Did we find a word on this iteration? If so, push it on the break stack
411        if (wordLength > 0) {
412            foundBreaks.push((current+wordLength), status);
413        }
414    }
415
416    // Don't return a break for the end of the dictionary range if there is one there.
417    if (foundBreaks.peeki() >= rangeEnd) {
418        (void) foundBreaks.popi();
419        wordsFound -= 1;
420    }
421
422    return wordsFound;
423}
424
425U_NAMESPACE_END
426
427#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
428