1ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/** 2ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ******************************************************************************* 3fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Copyright (C) 2006-2014, International Business Machines Corporation 454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * and others. All Rights Reserved. 5ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ******************************************************************************* 6ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 7ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 8ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/utypes.h" 9ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 10ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#if !UCONFIG_NO_BREAK_ITERATION 11ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 12ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "brkeng.h" 13ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "dictbe.h" 14ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/uniset.h" 15ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/chariter.h" 16ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/ubrk.h" 17ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "uvector.h" 1854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius#include "uassert.h" 1954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius#include "unicode/normlzr.h" 2054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius#include "cmemory.h" 2154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius#include "dictionarydata.h" 22ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 23ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_NAMESPACE_BEGIN 24ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 25ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 26ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ****************************************************************** 27ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 28ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 29ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruDictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes) { 30ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fTypes = breakTypes; 31ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 32ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 33ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruDictionaryBreakEngine::~DictionaryBreakEngine() { 34ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 35ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 36ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUBool 37ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruDictionaryBreakEngine::handles(UChar32 c, int32_t breakType) const { 38ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes) 39ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru && fSet.contains(c)); 40ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 41ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 42ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t 43ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruDictionaryBreakEngine::findBreaks( UText *text, 44ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t startPos, 45ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t endPos, 46ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UBool reverse, 47ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t breakType, 48ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UStack &foundBreaks ) const { 49ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t result = 0; 50ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 51ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Find the span of characters included in the set. 52fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // The span to break begins at the current position in the text, and 53fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // extends towards the start or end of the text, depending on 'reverse'. 54fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 55ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t start = (int32_t)utext_getNativeIndex(text); 56ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t current; 57ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t rangeStart; 58ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t rangeEnd; 59ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UChar32 c = utext_current32(text); 60ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (reverse) { 61ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UBool isDict = fSet.contains(c); 62ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru while((current = (int32_t)utext_getNativeIndex(text)) > startPos && isDict) { 63ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru c = utext_previous32(text); 64ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru isDict = fSet.contains(c); 65ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 66ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru rangeStart = (current < startPos) ? startPos : current+(isDict ? 0 : 1); 67ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru rangeEnd = start + 1; 68ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 69ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru else { 70ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) { 71ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utext_next32(text); // TODO: recast loop for postincrement 72ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru c = utext_current32(text); 73ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 74ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru rangeStart = start; 75ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru rangeEnd = current; 76ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 77ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)) { 78ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks); 79ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utext_setNativeIndex(text, current); 80ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 81ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 82ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return result; 83ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 84ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 85ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruvoid 86ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruDictionaryBreakEngine::setCharacters( const UnicodeSet &set ) { 87ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fSet = set; 88ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Compact for caching 89ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fSet.compact(); 90ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 91ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 92ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 93ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ****************************************************************** 9459d709d503bab6e2b61931737e662dd293b40578ccornelius * PossibleWord 95ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 96ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 9759d709d503bab6e2b61931737e662dd293b40578ccornelius// Helper class for improving readability of the Thai/Lao/Khmer word break 98ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// algorithm. The implementation is completely inline. 99ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 100ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// List size, limited by the maximum number of words in the dictionary 101ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// that form a nested sequence. 102ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define POSSIBLE_WORD_LIST_MAX 20 103ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 104ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruclass PossibleWord { 10554dcd9b6a06071f647dac967e9e267abb9410720Craig Corneliusprivate: 10654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // list of word candidate lengths, in increasing length order 10754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius int32_t lengths[POSSIBLE_WORD_LIST_MAX]; 10854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius int32_t count; // Count of candidates 10954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius int32_t prefix; // The longest match with a dictionary word 11054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius int32_t offset; // Offset in the text of these candidates 11154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius int mark; // The preferred candidate's offset 11254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius int current; // The candidate we're currently looking at 11354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 11454dcd9b6a06071f647dac967e9e267abb9410720Craig Corneliuspublic: 11554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius PossibleWord(); 11654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius ~PossibleWord(); 117ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 11854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // Fill the list of candidates if needed, select the longest, and return the number found 11954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius int candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ); 120ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 12154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // Select the currently marked candidate, point after it in the text, and invalidate self 12254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius int32_t acceptMarked( UText *text ); 123ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 12454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // Back up from the current candidate to the next shorter one; return TRUE if that exists 12554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // and point the text after it 12654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius UBool backUp( UText *text ); 127ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 12854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // Return the longest prefix this candidate location shares with a dictionary word 12954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius int32_t longestPrefix(); 130ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 13154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // Mark the current candidate as the one we like 13254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius void markCurrent(); 133ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}; 134ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 135ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruinline 136ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruPossibleWord::PossibleWord() { 137ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru offset = -1; 138ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 139ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 140ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruinline 141ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruPossibleWord::~PossibleWord() { 142ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 143ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 144ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruinline int 14554dcd9b6a06071f647dac967e9e267abb9410720Craig CorneliusPossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) { 146ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // TODO: If getIndex is too slow, use offset < 0 and add discardAll() 147ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t start = (int32_t)utext_getNativeIndex(text); 148ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (start != offset) { 149ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru offset = start; 150ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru prefix = dict->matches(text, rangeEnd-start, lengths, count, sizeof(lengths)/sizeof(lengths[0])); 151ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Dictionary leaves text after longest prefix, not longest word. Back up. 152ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (count <= 0) { 153ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utext_setNativeIndex(text, start); 154ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 155ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 156ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (count > 0) { 157ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utext_setNativeIndex(text, start+lengths[count-1]); 158ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 159ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru current = count-1; 160ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru mark = current; 161ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return count; 162ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 163ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 164ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruinline int32_t 165ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruPossibleWord::acceptMarked( UText *text ) { 166ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utext_setNativeIndex(text, offset + lengths[mark]); 167ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return lengths[mark]; 168ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 169ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 170ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruinline UBool 171ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruPossibleWord::backUp( UText *text ) { 172ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (current > 0) { 173ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utext_setNativeIndex(text, offset + lengths[--current]); 174ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return TRUE; 175ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 176ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return FALSE; 177ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 178ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 179ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruinline int32_t 180ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruPossibleWord::longestPrefix() { 181ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return prefix; 182ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 183ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 184ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruinline void 185ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruPossibleWord::markCurrent() { 186ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru mark = current; 187ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 188ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 18959d709d503bab6e2b61931737e662dd293b40578ccornelius/* 19059d709d503bab6e2b61931737e662dd293b40578ccornelius ****************************************************************** 19159d709d503bab6e2b61931737e662dd293b40578ccornelius * ThaiBreakEngine 19259d709d503bab6e2b61931737e662dd293b40578ccornelius */ 19359d709d503bab6e2b61931737e662dd293b40578ccornelius 194ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// How many words in a row are "good enough"? 195ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define THAI_LOOKAHEAD 3 196ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 197ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Will not combine a non-word with a preceding dictionary word longer than this 198ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define THAI_ROOT_COMBINE_THRESHOLD 3 199ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 200ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Will not combine a non-word that shares at least this much prefix with a 201ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// dictionary word, with a preceding word 202ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define THAI_PREFIX_COMBINE_THRESHOLD 3 203ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 204ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Ellision character 205ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define THAI_PAIYANNOI 0x0E2F 206ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 207ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Repeat character 208ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define THAI_MAIYAMOK 0x0E46 209ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 210ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Minimum word size 211ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define THAI_MIN_WORD 2 212ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 213ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Minimum number of characters for two words 214ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define THAI_MIN_WORD_SPAN (THAI_MIN_WORD * 2) 215ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 21654dcd9b6a06071f647dac967e9e267abb9410720Craig CorneliusThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status) 217ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)), 218ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fDictionary(adoptDictionary) 219ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 220ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status); 221ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (U_SUCCESS(status)) { 222ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru setCharacters(fThaiWordSet); 223ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 224ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status); 22585bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho fMarkSet.add(0x0020); 226ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fEndWordSet = fThaiWordSet; 227ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fEndWordSet.remove(0x0E31); // MAI HAN-AKAT 228ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI 229ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fBeginWordSet.add(0x0E01, 0x0E2E); // KO KAI through HO NOKHUK 230ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fBeginWordSet.add(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI 231ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fSuffixSet.add(THAI_PAIYANNOI); 232ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fSuffixSet.add(THAI_MAIYAMOK); 233ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 234ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Compact for caching. 235ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fMarkSet.compact(); 236ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fEndWordSet.compact(); 237ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fBeginWordSet.compact(); 238ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fSuffixSet.compact(); 239ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 240ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 241ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruThaiBreakEngine::~ThaiBreakEngine() { 242ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru delete fDictionary; 243ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 244ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 245ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t 246ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruThaiBreakEngine::divideUpDictionaryRange( UText *text, 247ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t rangeStart, 248ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t rangeEnd, 249ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UStack &foundBreaks ) const { 250ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if ((rangeEnd - rangeStart) < THAI_MIN_WORD_SPAN) { 251ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 0; // Not enough characters for two words 252ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 253ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 254ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint32_t wordsFound = 0; 255ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t wordLength; 256ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t current; 257ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 258ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru PossibleWord words[THAI_LOOKAHEAD]; 259ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UChar32 uc; 260ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 261ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utext_setNativeIndex(text, rangeStart); 262ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 263ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) { 264ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru wordLength = 0; 265ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 266ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Look for candidate words at the current position 267ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); 268ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 269ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // If we found exactly one, use that 270ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (candidates == 1) { 27154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius wordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text); 272ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru wordsFound += 1; 273ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 274ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // If there was more than one, see which one can take us forward the most words 275ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru else if (candidates > 1) { 276ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // If we're already at the end of the range, we're done 277ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { 278ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru goto foundBest; 279ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 280ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru do { 281ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int wordsMatched = 1; 28254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius if (words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) { 283ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (wordsMatched < 2) { 284ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Followed by another dictionary word; mark first word as a good candidate 285ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru words[wordsFound%THAI_LOOKAHEAD].markCurrent(); 286ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru wordsMatched = 2; 287ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 288ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 289ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // If we're already at the end of the range, we're done 290ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { 291ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru goto foundBest; 292ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 293ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 294ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // See if any of the possible second words is followed by a third word 295ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru do { 296ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // If we find a third word, stop right away 29754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius if (words[(wordsFound + 2) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) { 29854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius words[wordsFound % THAI_LOOKAHEAD].markCurrent(); 299ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru goto foundBest; 300ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 301ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 30254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius while (words[(wordsFound + 1) % THAI_LOOKAHEAD].backUp(text)); 303ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 304ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 30554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius while (words[wordsFound % THAI_LOOKAHEAD].backUp(text)); 306ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QuerufoundBest: 30754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius wordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text); 308ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru wordsFound += 1; 309ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 310ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 311ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // We come here after having either found a word or not. We look ahead to the 312ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // next word. If it's not a dictionary word, we will combine it withe the word we 313ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // just found (if there is one), but only if the preceding word does not exceed 314ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // the threshold. 315ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // The text iterator should now be positioned at the end of the word we found. 316ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < THAI_ROOT_COMBINE_THRESHOLD) { 317ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // if it is a dictionary word, do nothing. If it isn't, then if there is 318ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // no preceding word, or the non-word shares less than the minimum threshold 319ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // of characters with a dictionary word, then scan to resynchronize 32054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius if (words[wordsFound % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0 321ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru && (wordLength == 0 322ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru || words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) { 323ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Look for a plausible word boundary 324ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru //TODO: This section will need a rework for UText. 325ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t remaining = rangeEnd - (current+wordLength); 326ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UChar32 pc = utext_current32(text); 327ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t chars = 0; 328ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for (;;) { 329ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utext_next32(text); 330ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uc = utext_current32(text); 331ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // TODO: Here we're counting on the fact that the SA languages are all 332ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // in the BMP. This should get fixed with the UText rework. 333ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru chars += 1; 334ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (--remaining <= 0) { 335ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 336ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 337ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) { 338ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Maybe. See if it's in the dictionary. 339ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // NOTE: In the original Apple code, checked that the next 340ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // two characters after uc were not 0x0E4C THANTHAKHAT before 341ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // checking the dictionary. That is just a performance filter, 342ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // but it's not clear it's faster than checking the trie. 34354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius int candidates = words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); 34454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius utext_setNativeIndex(text, current + wordLength + chars); 345ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (candidates > 0) { 346ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 347ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 348ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 349ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pc = uc; 350ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 351ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 352ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Bump the word count if there wasn't already one 353ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (wordLength <= 0) { 354ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru wordsFound += 1; 355ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 356ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 357ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Update the length with the passed-over characters 358ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru wordLength += chars; 359ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 360ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru else { 361ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Back up to where we were for next iteration 362ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utext_setNativeIndex(text, current+wordLength); 363ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 364ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 365ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 366ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Never stop before a combining mark. 367ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t currPos; 368ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) { 369ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utext_next32(text); 370ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru wordLength += (int32_t)utext_getNativeIndex(text) - currPos; 371ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 372ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 373ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Look ahead for possible suffixes if a dictionary word does not follow. 374ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // We do this in code rather than using a rule so that the heuristic 375ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // resynch continues to function. For example, one of the suffix characters 376ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // could be a typo in the middle of a word. 377ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) { 378ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0 379ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru && fSuffixSet.contains(uc = utext_current32(text))) { 380ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (uc == THAI_PAIYANNOI) { 381ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (!fSuffixSet.contains(utext_previous32(text))) { 382ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Skip over previous end and PAIYANNOI 383ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utext_next32(text); 384ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utext_next32(text); 385ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru wordLength += 1; // Add PAIYANNOI to word 386ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uc = utext_current32(text); // Fetch next character 387ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 388ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru else { 389ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Restore prior position 390ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utext_next32(text); 391ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 392ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 393ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (uc == THAI_MAIYAMOK) { 394ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (utext_previous32(text) != THAI_MAIYAMOK) { 395ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Skip over previous end and MAIYAMOK 396ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utext_next32(text); 397ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utext_next32(text); 398ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru wordLength += 1; // Add MAIYAMOK to word 399ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 400ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru else { 401ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Restore prior position 402ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utext_next32(text); 403ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 404ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 405ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 406ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru else { 407ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utext_setNativeIndex(text, current+wordLength); 408ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 409ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 410b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 411b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Did we find a word on this iteration? If so, push it on the break stack 412b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (wordLength > 0) { 413b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho foundBreaks.push((current+wordLength), status); 414b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 415b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 416b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 417b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Don't return a break for the end of the dictionary range if there is one there. 418b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (foundBreaks.peeki() >= rangeEnd) { 419b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho (void) foundBreaks.popi(); 420b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho wordsFound -= 1; 421b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 422b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 423b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return wordsFound; 424b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho} 425b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 42659d709d503bab6e2b61931737e662dd293b40578ccornelius/* 42759d709d503bab6e2b61931737e662dd293b40578ccornelius ****************************************************************** 42859d709d503bab6e2b61931737e662dd293b40578ccornelius * LaoBreakEngine 42959d709d503bab6e2b61931737e662dd293b40578ccornelius */ 43059d709d503bab6e2b61931737e662dd293b40578ccornelius 43159d709d503bab6e2b61931737e662dd293b40578ccornelius// How many words in a row are "good enough"? 43259d709d503bab6e2b61931737e662dd293b40578ccornelius#define LAO_LOOKAHEAD 3 43359d709d503bab6e2b61931737e662dd293b40578ccornelius 43459d709d503bab6e2b61931737e662dd293b40578ccornelius// Will not combine a non-word with a preceding dictionary word longer than this 43559d709d503bab6e2b61931737e662dd293b40578ccornelius#define LAO_ROOT_COMBINE_THRESHOLD 3 43659d709d503bab6e2b61931737e662dd293b40578ccornelius 43759d709d503bab6e2b61931737e662dd293b40578ccornelius// Will not combine a non-word that shares at least this much prefix with a 43859d709d503bab6e2b61931737e662dd293b40578ccornelius// dictionary word, with a preceding word 43959d709d503bab6e2b61931737e662dd293b40578ccornelius#define LAO_PREFIX_COMBINE_THRESHOLD 3 44059d709d503bab6e2b61931737e662dd293b40578ccornelius 44159d709d503bab6e2b61931737e662dd293b40578ccornelius// Minimum word size 44259d709d503bab6e2b61931737e662dd293b40578ccornelius#define LAO_MIN_WORD 2 44359d709d503bab6e2b61931737e662dd293b40578ccornelius 44459d709d503bab6e2b61931737e662dd293b40578ccornelius// Minimum number of characters for two words 44559d709d503bab6e2b61931737e662dd293b40578ccornelius#define LAO_MIN_WORD_SPAN (LAO_MIN_WORD * 2) 44659d709d503bab6e2b61931737e662dd293b40578ccornelius 44759d709d503bab6e2b61931737e662dd293b40578ccorneliusLaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status) 44859d709d503bab6e2b61931737e662dd293b40578ccornelius : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)), 44959d709d503bab6e2b61931737e662dd293b40578ccornelius fDictionary(adoptDictionary) 45059d709d503bab6e2b61931737e662dd293b40578ccornelius{ 45159d709d503bab6e2b61931737e662dd293b40578ccornelius fLaoWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]"), status); 45259d709d503bab6e2b61931737e662dd293b40578ccornelius if (U_SUCCESS(status)) { 45359d709d503bab6e2b61931737e662dd293b40578ccornelius setCharacters(fLaoWordSet); 45459d709d503bab6e2b61931737e662dd293b40578ccornelius } 45559d709d503bab6e2b61931737e662dd293b40578ccornelius fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status); 45659d709d503bab6e2b61931737e662dd293b40578ccornelius fMarkSet.add(0x0020); 45759d709d503bab6e2b61931737e662dd293b40578ccornelius fEndWordSet = fLaoWordSet; 45859d709d503bab6e2b61931737e662dd293b40578ccornelius fEndWordSet.remove(0x0EC0, 0x0EC4); // prefix vowels 45959d709d503bab6e2b61931737e662dd293b40578ccornelius fBeginWordSet.add(0x0E81, 0x0EAE); // basic consonants (including holes for corresponding Thai characters) 46059d709d503bab6e2b61931737e662dd293b40578ccornelius fBeginWordSet.add(0x0EDC, 0x0EDD); // digraph consonants (no Thai equivalent) 46159d709d503bab6e2b61931737e662dd293b40578ccornelius fBeginWordSet.add(0x0EC0, 0x0EC4); // prefix vowels 46259d709d503bab6e2b61931737e662dd293b40578ccornelius 46359d709d503bab6e2b61931737e662dd293b40578ccornelius // Compact for caching. 46459d709d503bab6e2b61931737e662dd293b40578ccornelius fMarkSet.compact(); 46559d709d503bab6e2b61931737e662dd293b40578ccornelius fEndWordSet.compact(); 46659d709d503bab6e2b61931737e662dd293b40578ccornelius fBeginWordSet.compact(); 46759d709d503bab6e2b61931737e662dd293b40578ccornelius} 46859d709d503bab6e2b61931737e662dd293b40578ccornelius 46959d709d503bab6e2b61931737e662dd293b40578ccorneliusLaoBreakEngine::~LaoBreakEngine() { 47059d709d503bab6e2b61931737e662dd293b40578ccornelius delete fDictionary; 47159d709d503bab6e2b61931737e662dd293b40578ccornelius} 47259d709d503bab6e2b61931737e662dd293b40578ccornelius 47359d709d503bab6e2b61931737e662dd293b40578ccorneliusint32_t 47459d709d503bab6e2b61931737e662dd293b40578ccorneliusLaoBreakEngine::divideUpDictionaryRange( UText *text, 47559d709d503bab6e2b61931737e662dd293b40578ccornelius int32_t rangeStart, 47659d709d503bab6e2b61931737e662dd293b40578ccornelius int32_t rangeEnd, 47759d709d503bab6e2b61931737e662dd293b40578ccornelius UStack &foundBreaks ) const { 47859d709d503bab6e2b61931737e662dd293b40578ccornelius if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) { 47959d709d503bab6e2b61931737e662dd293b40578ccornelius return 0; // Not enough characters for two words 48059d709d503bab6e2b61931737e662dd293b40578ccornelius } 48159d709d503bab6e2b61931737e662dd293b40578ccornelius 48259d709d503bab6e2b61931737e662dd293b40578ccornelius uint32_t wordsFound = 0; 48359d709d503bab6e2b61931737e662dd293b40578ccornelius int32_t wordLength; 48459d709d503bab6e2b61931737e662dd293b40578ccornelius int32_t current; 48559d709d503bab6e2b61931737e662dd293b40578ccornelius UErrorCode status = U_ZERO_ERROR; 48659d709d503bab6e2b61931737e662dd293b40578ccornelius PossibleWord words[LAO_LOOKAHEAD]; 48759d709d503bab6e2b61931737e662dd293b40578ccornelius UChar32 uc; 48859d709d503bab6e2b61931737e662dd293b40578ccornelius 48959d709d503bab6e2b61931737e662dd293b40578ccornelius utext_setNativeIndex(text, rangeStart); 49059d709d503bab6e2b61931737e662dd293b40578ccornelius 49159d709d503bab6e2b61931737e662dd293b40578ccornelius while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) { 49259d709d503bab6e2b61931737e662dd293b40578ccornelius wordLength = 0; 49359d709d503bab6e2b61931737e662dd293b40578ccornelius 49459d709d503bab6e2b61931737e662dd293b40578ccornelius // Look for candidate words at the current position 49559d709d503bab6e2b61931737e662dd293b40578ccornelius int candidates = words[wordsFound%LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); 49659d709d503bab6e2b61931737e662dd293b40578ccornelius 49759d709d503bab6e2b61931737e662dd293b40578ccornelius // If we found exactly one, use that 49859d709d503bab6e2b61931737e662dd293b40578ccornelius if (candidates == 1) { 49959d709d503bab6e2b61931737e662dd293b40578ccornelius wordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text); 50059d709d503bab6e2b61931737e662dd293b40578ccornelius wordsFound += 1; 50159d709d503bab6e2b61931737e662dd293b40578ccornelius } 50259d709d503bab6e2b61931737e662dd293b40578ccornelius // If there was more than one, see which one can take us forward the most words 50359d709d503bab6e2b61931737e662dd293b40578ccornelius else if (candidates > 1) { 50459d709d503bab6e2b61931737e662dd293b40578ccornelius // If we're already at the end of the range, we're done 50559d709d503bab6e2b61931737e662dd293b40578ccornelius if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { 50659d709d503bab6e2b61931737e662dd293b40578ccornelius goto foundBest; 50759d709d503bab6e2b61931737e662dd293b40578ccornelius } 50859d709d503bab6e2b61931737e662dd293b40578ccornelius do { 50959d709d503bab6e2b61931737e662dd293b40578ccornelius int wordsMatched = 1; 51059d709d503bab6e2b61931737e662dd293b40578ccornelius if (words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) { 51159d709d503bab6e2b61931737e662dd293b40578ccornelius if (wordsMatched < 2) { 51259d709d503bab6e2b61931737e662dd293b40578ccornelius // Followed by another dictionary word; mark first word as a good candidate 51359d709d503bab6e2b61931737e662dd293b40578ccornelius words[wordsFound%LAO_LOOKAHEAD].markCurrent(); 51459d709d503bab6e2b61931737e662dd293b40578ccornelius wordsMatched = 2; 51559d709d503bab6e2b61931737e662dd293b40578ccornelius } 51659d709d503bab6e2b61931737e662dd293b40578ccornelius 51759d709d503bab6e2b61931737e662dd293b40578ccornelius // If we're already at the end of the range, we're done 51859d709d503bab6e2b61931737e662dd293b40578ccornelius if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { 51959d709d503bab6e2b61931737e662dd293b40578ccornelius goto foundBest; 52059d709d503bab6e2b61931737e662dd293b40578ccornelius } 52159d709d503bab6e2b61931737e662dd293b40578ccornelius 52259d709d503bab6e2b61931737e662dd293b40578ccornelius // See if any of the possible second words is followed by a third word 52359d709d503bab6e2b61931737e662dd293b40578ccornelius do { 52459d709d503bab6e2b61931737e662dd293b40578ccornelius // If we find a third word, stop right away 52559d709d503bab6e2b61931737e662dd293b40578ccornelius if (words[(wordsFound + 2) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) { 52659d709d503bab6e2b61931737e662dd293b40578ccornelius words[wordsFound % LAO_LOOKAHEAD].markCurrent(); 52759d709d503bab6e2b61931737e662dd293b40578ccornelius goto foundBest; 52859d709d503bab6e2b61931737e662dd293b40578ccornelius } 52959d709d503bab6e2b61931737e662dd293b40578ccornelius } 53059d709d503bab6e2b61931737e662dd293b40578ccornelius while (words[(wordsFound + 1) % LAO_LOOKAHEAD].backUp(text)); 53159d709d503bab6e2b61931737e662dd293b40578ccornelius } 53259d709d503bab6e2b61931737e662dd293b40578ccornelius } 53359d709d503bab6e2b61931737e662dd293b40578ccornelius while (words[wordsFound % LAO_LOOKAHEAD].backUp(text)); 53459d709d503bab6e2b61931737e662dd293b40578ccorneliusfoundBest: 53559d709d503bab6e2b61931737e662dd293b40578ccornelius wordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text); 53659d709d503bab6e2b61931737e662dd293b40578ccornelius wordsFound += 1; 53759d709d503bab6e2b61931737e662dd293b40578ccornelius } 53859d709d503bab6e2b61931737e662dd293b40578ccornelius 53959d709d503bab6e2b61931737e662dd293b40578ccornelius // We come here after having either found a word or not. We look ahead to the 54059d709d503bab6e2b61931737e662dd293b40578ccornelius // next word. If it's not a dictionary word, we will combine it withe the word we 54159d709d503bab6e2b61931737e662dd293b40578ccornelius // just found (if there is one), but only if the preceding word does not exceed 54259d709d503bab6e2b61931737e662dd293b40578ccornelius // the threshold. 54359d709d503bab6e2b61931737e662dd293b40578ccornelius // The text iterator should now be positioned at the end of the word we found. 54459d709d503bab6e2b61931737e662dd293b40578ccornelius if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < LAO_ROOT_COMBINE_THRESHOLD) { 54559d709d503bab6e2b61931737e662dd293b40578ccornelius // if it is a dictionary word, do nothing. If it isn't, then if there is 54659d709d503bab6e2b61931737e662dd293b40578ccornelius // no preceding word, or the non-word shares less than the minimum threshold 54759d709d503bab6e2b61931737e662dd293b40578ccornelius // of characters with a dictionary word, then scan to resynchronize 54859d709d503bab6e2b61931737e662dd293b40578ccornelius if (words[wordsFound % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0 54959d709d503bab6e2b61931737e662dd293b40578ccornelius && (wordLength == 0 55059d709d503bab6e2b61931737e662dd293b40578ccornelius || words[wordsFound%LAO_LOOKAHEAD].longestPrefix() < LAO_PREFIX_COMBINE_THRESHOLD)) { 55159d709d503bab6e2b61931737e662dd293b40578ccornelius // Look for a plausible word boundary 55259d709d503bab6e2b61931737e662dd293b40578ccornelius //TODO: This section will need a rework for UText. 55359d709d503bab6e2b61931737e662dd293b40578ccornelius int32_t remaining = rangeEnd - (current+wordLength); 55459d709d503bab6e2b61931737e662dd293b40578ccornelius UChar32 pc = utext_current32(text); 55559d709d503bab6e2b61931737e662dd293b40578ccornelius int32_t chars = 0; 55659d709d503bab6e2b61931737e662dd293b40578ccornelius for (;;) { 55759d709d503bab6e2b61931737e662dd293b40578ccornelius utext_next32(text); 55859d709d503bab6e2b61931737e662dd293b40578ccornelius uc = utext_current32(text); 55959d709d503bab6e2b61931737e662dd293b40578ccornelius // TODO: Here we're counting on the fact that the SA languages are all 56059d709d503bab6e2b61931737e662dd293b40578ccornelius // in the BMP. This should get fixed with the UText rework. 56159d709d503bab6e2b61931737e662dd293b40578ccornelius chars += 1; 56259d709d503bab6e2b61931737e662dd293b40578ccornelius if (--remaining <= 0) { 56359d709d503bab6e2b61931737e662dd293b40578ccornelius break; 56459d709d503bab6e2b61931737e662dd293b40578ccornelius } 56559d709d503bab6e2b61931737e662dd293b40578ccornelius if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) { 56659d709d503bab6e2b61931737e662dd293b40578ccornelius // Maybe. See if it's in the dictionary. 56759d709d503bab6e2b61931737e662dd293b40578ccornelius int candidates = words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); 56859d709d503bab6e2b61931737e662dd293b40578ccornelius utext_setNativeIndex(text, current + wordLength + chars); 56959d709d503bab6e2b61931737e662dd293b40578ccornelius if (candidates > 0) { 57059d709d503bab6e2b61931737e662dd293b40578ccornelius break; 57159d709d503bab6e2b61931737e662dd293b40578ccornelius } 57259d709d503bab6e2b61931737e662dd293b40578ccornelius } 57359d709d503bab6e2b61931737e662dd293b40578ccornelius pc = uc; 57459d709d503bab6e2b61931737e662dd293b40578ccornelius } 57559d709d503bab6e2b61931737e662dd293b40578ccornelius 57659d709d503bab6e2b61931737e662dd293b40578ccornelius // Bump the word count if there wasn't already one 57759d709d503bab6e2b61931737e662dd293b40578ccornelius if (wordLength <= 0) { 57859d709d503bab6e2b61931737e662dd293b40578ccornelius wordsFound += 1; 57959d709d503bab6e2b61931737e662dd293b40578ccornelius } 58059d709d503bab6e2b61931737e662dd293b40578ccornelius 58159d709d503bab6e2b61931737e662dd293b40578ccornelius // Update the length with the passed-over characters 58259d709d503bab6e2b61931737e662dd293b40578ccornelius wordLength += chars; 58359d709d503bab6e2b61931737e662dd293b40578ccornelius } 58459d709d503bab6e2b61931737e662dd293b40578ccornelius else { 58559d709d503bab6e2b61931737e662dd293b40578ccornelius // Back up to where we were for next iteration 58659d709d503bab6e2b61931737e662dd293b40578ccornelius utext_setNativeIndex(text, current+wordLength); 58759d709d503bab6e2b61931737e662dd293b40578ccornelius } 58859d709d503bab6e2b61931737e662dd293b40578ccornelius } 58959d709d503bab6e2b61931737e662dd293b40578ccornelius 59059d709d503bab6e2b61931737e662dd293b40578ccornelius // Never stop before a combining mark. 59159d709d503bab6e2b61931737e662dd293b40578ccornelius int32_t currPos; 59259d709d503bab6e2b61931737e662dd293b40578ccornelius while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) { 59359d709d503bab6e2b61931737e662dd293b40578ccornelius utext_next32(text); 59459d709d503bab6e2b61931737e662dd293b40578ccornelius wordLength += (int32_t)utext_getNativeIndex(text) - currPos; 59559d709d503bab6e2b61931737e662dd293b40578ccornelius } 59659d709d503bab6e2b61931737e662dd293b40578ccornelius 59759d709d503bab6e2b61931737e662dd293b40578ccornelius // Look ahead for possible suffixes if a dictionary word does not follow. 59859d709d503bab6e2b61931737e662dd293b40578ccornelius // We do this in code rather than using a rule so that the heuristic 59959d709d503bab6e2b61931737e662dd293b40578ccornelius // resynch continues to function. For example, one of the suffix characters 60059d709d503bab6e2b61931737e662dd293b40578ccornelius // could be a typo in the middle of a word. 60159d709d503bab6e2b61931737e662dd293b40578ccornelius // NOT CURRENTLY APPLICABLE TO LAO 60259d709d503bab6e2b61931737e662dd293b40578ccornelius 60359d709d503bab6e2b61931737e662dd293b40578ccornelius // Did we find a word on this iteration? If so, push it on the break stack 60459d709d503bab6e2b61931737e662dd293b40578ccornelius if (wordLength > 0) { 60559d709d503bab6e2b61931737e662dd293b40578ccornelius foundBreaks.push((current+wordLength), status); 60659d709d503bab6e2b61931737e662dd293b40578ccornelius } 60759d709d503bab6e2b61931737e662dd293b40578ccornelius } 60859d709d503bab6e2b61931737e662dd293b40578ccornelius 60959d709d503bab6e2b61931737e662dd293b40578ccornelius // Don't return a break for the end of the dictionary range if there is one there. 61059d709d503bab6e2b61931737e662dd293b40578ccornelius if (foundBreaks.peeki() >= rangeEnd) { 61159d709d503bab6e2b61931737e662dd293b40578ccornelius (void) foundBreaks.popi(); 61259d709d503bab6e2b61931737e662dd293b40578ccornelius wordsFound -= 1; 61359d709d503bab6e2b61931737e662dd293b40578ccornelius } 61459d709d503bab6e2b61931737e662dd293b40578ccornelius 61559d709d503bab6e2b61931737e662dd293b40578ccornelius return wordsFound; 61659d709d503bab6e2b61931737e662dd293b40578ccornelius} 61759d709d503bab6e2b61931737e662dd293b40578ccornelius 61859d709d503bab6e2b61931737e662dd293b40578ccornelius/* 61959d709d503bab6e2b61931737e662dd293b40578ccornelius ****************************************************************** 62059d709d503bab6e2b61931737e662dd293b40578ccornelius * KhmerBreakEngine 62159d709d503bab6e2b61931737e662dd293b40578ccornelius */ 62259d709d503bab6e2b61931737e662dd293b40578ccornelius 623b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// How many words in a row are "good enough"? 624b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#define KHMER_LOOKAHEAD 3 625b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 626b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// Will not combine a non-word with a preceding dictionary word longer than this 627b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#define KHMER_ROOT_COMBINE_THRESHOLD 3 628b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 629b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// Will not combine a non-word that shares at least this much prefix with a 630b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// dictionary word, with a preceding word 631b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#define KHMER_PREFIX_COMBINE_THRESHOLD 3 632b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 633b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// Minimum word size 634b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#define KHMER_MIN_WORD 2 635b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 636b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// Minimum number of characters for two words 637b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#define KHMER_MIN_WORD_SPAN (KHMER_MIN_WORD * 2) 638b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 63954dcd9b6a06071f647dac967e9e267abb9410720Craig CorneliusKhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status) 64054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius : DictionaryBreakEngine((1 << UBRK_WORD) | (1 << UBRK_LINE)), 641b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho fDictionary(adoptDictionary) 642b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho{ 643b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status); 644b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (U_SUCCESS(status)) { 645b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho setCharacters(fKhmerWordSet); 646b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 647b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status); 648b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho fMarkSet.add(0x0020); 649b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho fEndWordSet = fKhmerWordSet; 650b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho fBeginWordSet.add(0x1780, 0x17B3); 651b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho //fBeginWordSet.add(0x17A3, 0x17A4); // deprecated vowels 652b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho //fEndWordSet.remove(0x17A5, 0x17A9); // Khmer independent vowels that can't end a word 653b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho //fEndWordSet.remove(0x17B2); // Khmer independent vowel that can't end a word 654b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho fEndWordSet.remove(0x17D2); // KHMER SIGN COENG that combines some following characters 655b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho //fEndWordSet.remove(0x17B6, 0x17C5); // Remove dependent vowels 656b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// fEndWordSet.remove(0x0E31); // MAI HAN-AKAT 657b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI 658b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// fBeginWordSet.add(0x0E01, 0x0E2E); // KO KAI through HO NOKHUK 659b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// fBeginWordSet.add(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI 660b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// fSuffixSet.add(THAI_PAIYANNOI); 661b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// fSuffixSet.add(THAI_MAIYAMOK); 662b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 663b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Compact for caching. 664b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho fMarkSet.compact(); 665b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho fEndWordSet.compact(); 666b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho fBeginWordSet.compact(); 667b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// fSuffixSet.compact(); 668b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho} 669b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 670b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoKhmerBreakEngine::~KhmerBreakEngine() { 671b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho delete fDictionary; 672b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho} 673b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 674b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoint32_t 675b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoKhmerBreakEngine::divideUpDictionaryRange( UText *text, 676b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t rangeStart, 677b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t rangeEnd, 678b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UStack &foundBreaks ) const { 679b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) { 680b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return 0; // Not enough characters for two words 681b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 682b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 683b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho uint32_t wordsFound = 0; 684b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t wordLength; 685b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t current; 686b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UErrorCode status = U_ZERO_ERROR; 687b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho PossibleWord words[KHMER_LOOKAHEAD]; 688b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UChar32 uc; 689b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 690b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho utext_setNativeIndex(text, rangeStart); 691b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 692b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) { 693b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho wordLength = 0; 694b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 695b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Look for candidate words at the current position 696b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); 697b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 698b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // If we found exactly one, use that 699b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (candidates == 1) { 700b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho wordLength = words[wordsFound%KHMER_LOOKAHEAD].acceptMarked(text); 701b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho wordsFound += 1; 702b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 703b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 704b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // If there was more than one, see which one can take us forward the most words 705b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho else if (candidates > 1) { 706b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // If we're already at the end of the range, we're done 707b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { 708b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho goto foundBest; 709b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 710b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho do { 711b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int wordsMatched = 1; 71254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) { 713b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (wordsMatched < 2) { 714b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Followed by another dictionary word; mark first word as a good candidate 71554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius words[wordsFound % KHMER_LOOKAHEAD].markCurrent(); 716b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho wordsMatched = 2; 717b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 718b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 719b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // If we're already at the end of the range, we're done 720b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { 721b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho goto foundBest; 722b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 723b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 724b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // See if any of the possible second words is followed by a third word 725b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho do { 726b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // If we find a third word, stop right away 72754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) { 72854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius words[wordsFound % KHMER_LOOKAHEAD].markCurrent(); 729b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho goto foundBest; 730b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 731b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 73254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text)); 733b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 734b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 73554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text)); 736b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehofoundBest: 73754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius wordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text); 738b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho wordsFound += 1; 739b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 740b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 741b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // We come here after having either found a word or not. We look ahead to the 742b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // next word. If it's not a dictionary word, we will combine it with the word we 743b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // just found (if there is one), but only if the preceding word does not exceed 744b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // the threshold. 745b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // The text iterator should now be positioned at the end of the word we found. 746b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < KHMER_ROOT_COMBINE_THRESHOLD) { 747b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // if it is a dictionary word, do nothing. If it isn't, then if there is 748b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // no preceding word, or the non-word shares less than the minimum threshold 749b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // of characters with a dictionary word, then scan to resynchronize 75054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0 751b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho && (wordLength == 0 75254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius || words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) { 753b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Look for a plausible word boundary 754b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho //TODO: This section will need a rework for UText. 755b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t remaining = rangeEnd - (current+wordLength); 756b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UChar32 pc = utext_current32(text); 757b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t chars = 0; 758b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho for (;;) { 759b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho utext_next32(text); 760b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho uc = utext_current32(text); 761b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // TODO: Here we're counting on the fact that the SA languages are all 762b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // in the BMP. This should get fixed with the UText rework. 763b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho chars += 1; 764b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (--remaining <= 0) { 765b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho break; 766b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 767b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) { 768b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Maybe. See if it's in the dictionary. 76954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius int candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); 770b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho utext_setNativeIndex(text, current+wordLength+chars); 771b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (candidates > 0) { 772b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho break; 773b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 774b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 775b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho pc = uc; 776b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 777b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 778b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Bump the word count if there wasn't already one 779b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (wordLength <= 0) { 780b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho wordsFound += 1; 781b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 782b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 783b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Update the length with the passed-over characters 784b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho wordLength += chars; 785b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 786b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho else { 787b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Back up to where we were for next iteration 788b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho utext_setNativeIndex(text, current+wordLength); 789b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 790b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 791b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 792b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Never stop before a combining mark. 793b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t currPos; 794b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) { 795b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho utext_next32(text); 796b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho wordLength += (int32_t)utext_getNativeIndex(text) - currPos; 797b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 798b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 799b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Look ahead for possible suffixes if a dictionary word does not follow. 800b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // We do this in code rather than using a rule so that the heuristic 801b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // resynch continues to function. For example, one of the suffix characters 802b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // could be a typo in the middle of a word. 803b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) { 804b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0 805b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// && fSuffixSet.contains(uc = utext_current32(text))) { 806b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// if (uc == KHMER_PAIYANNOI) { 807b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// if (!fSuffixSet.contains(utext_previous32(text))) { 808b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// // Skip over previous end and PAIYANNOI 809b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// utext_next32(text); 810b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// utext_next32(text); 811b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// wordLength += 1; // Add PAIYANNOI to word 812b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// uc = utext_current32(text); // Fetch next character 813b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// } 814b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// else { 815b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// // Restore prior position 816b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// utext_next32(text); 817b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// } 818b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// } 819b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// if (uc == KHMER_MAIYAMOK) { 820b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// if (utext_previous32(text) != KHMER_MAIYAMOK) { 821b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// // Skip over previous end and MAIYAMOK 822b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// utext_next32(text); 823b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// utext_next32(text); 824b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// wordLength += 1; // Add MAIYAMOK to word 825b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// } 826b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// else { 827b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// // Restore prior position 828b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// utext_next32(text); 829b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// } 830b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// } 831b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// } 832b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// else { 833b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// utext_setNativeIndex(text, current+wordLength); 834b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// } 835b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// } 836b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 837ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Did we find a word on this iteration? If so, push it on the break stack 838ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (wordLength > 0) { 839ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru foundBreaks.push((current+wordLength), status); 840ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 841ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 842ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 843ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Don't return a break for the end of the dictionary range if there is one there. 844ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (foundBreaks.peeki() >= rangeEnd) { 845ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru (void) foundBreaks.popi(); 846ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru wordsFound -= 1; 847ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 848ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 849ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return wordsFound; 850ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 851ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 85254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius#if !UCONFIG_NO_NORMALIZATION 85354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius/* 85454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius ****************************************************************** 85554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * CjkBreakEngine 85654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius */ 85754dcd9b6a06071f647dac967e9e267abb9410720Craig Corneliusstatic const uint32_t kuint32max = 0xFFFFFFFF; 85854dcd9b6a06071f647dac967e9e267abb9410720Craig CorneliusCjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status) 85954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius: DictionaryBreakEngine(1 << UBRK_WORD), fDictionary(adoptDictionary) { 86054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // Korean dictionary only includes Hangul syllables 86154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status); 86254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status); 86354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status); 86454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status); 86554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 86654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius if (U_SUCCESS(status)) { 86754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // handle Korean and Japanese/Chinese using different dictionaries 86854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius if (type == kKorean) { 86954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius setCharacters(fHangulWordSet); 87054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius } else { //Chinese and Japanese 87154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius UnicodeSet cjSet; 87254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius cjSet.addAll(fHanWordSet); 87354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius cjSet.addAll(fKatakanaWordSet); 87454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius cjSet.addAll(fHiraganaWordSet); 87559d709d503bab6e2b61931737e662dd293b40578ccornelius cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK 87659d709d503bab6e2b61931737e662dd293b40578ccornelius cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK 87754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius setCharacters(cjSet); 87854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius } 87954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius } 88054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius} 88154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 88254dcd9b6a06071f647dac967e9e267abb9410720Craig CorneliusCjkBreakEngine::~CjkBreakEngine(){ 88354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius delete fDictionary; 88454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius} 88554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 88654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius// The katakanaCost values below are based on the length frequencies of all 88754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius// katakana phrases in the dictionary 88854dcd9b6a06071f647dac967e9e267abb9410720Craig Corneliusstatic const int kMaxKatakanaLength = 8; 88954dcd9b6a06071f647dac967e9e267abb9410720Craig Corneliusstatic const int kMaxKatakanaGroupLength = 20; 89054dcd9b6a06071f647dac967e9e267abb9410720Craig Corneliusstatic const uint32_t maxSnlp = 255; 89154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 89254dcd9b6a06071f647dac967e9e267abb9410720Craig Corneliusstatic inline uint32_t getKatakanaCost(int wordLength){ 89354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius //TODO: fill array with actual values from dictionary! 89454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius static const uint32_t katakanaCost[kMaxKatakanaLength + 1] 89554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius = {8192, 984, 408, 240, 204, 252, 300, 372, 480}; 89654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius return (wordLength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordLength]; 89754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius} 89854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 89954dcd9b6a06071f647dac967e9e267abb9410720Craig Corneliusstatic inline bool isKatakana(uint16_t value) { 90054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius return (value >= 0x30A1u && value <= 0x30FEu && value != 0x30FBu) || 90154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius (value >= 0xFF66u && value <= 0xFF9fu); 90254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius} 90354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 90454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius// A very simple helper class to streamline the buffer handling in 90554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius// divideUpDictionaryRange. 90654dcd9b6a06071f647dac967e9e267abb9410720Craig Corneliustemplate<class T, size_t N> 90754dcd9b6a06071f647dac967e9e267abb9410720Craig Corneliusclass AutoBuffer { 90854dcd9b6a06071f647dac967e9e267abb9410720Craig Corneliuspublic: 90954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius AutoBuffer(size_t size) : buffer(stackBuffer), capacity(N) { 91054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius if (size > N) { 91154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size)); 91254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius capacity = size; 91354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius } 91454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius } 91554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius ~AutoBuffer() { 91654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius if (buffer != stackBuffer) 91754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius uprv_free(buffer); 91854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius } 91954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 92054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius T* elems() { 92154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius return buffer; 92254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius } 92354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 92454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius const T& operator[] (size_t i) const { 92554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius return buffer[i]; 92654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius } 92754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 92854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius T& operator[] (size_t i) { 92954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius return buffer[i]; 93054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius } 93154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 93254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // resize without copy 93354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius void resize(size_t size) { 93454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius if (size <= capacity) 93554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius return; 93654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius if (buffer != stackBuffer) 93754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius uprv_free(buffer); 93854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size)); 93954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius capacity = size; 94054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius } 94154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 94254dcd9b6a06071f647dac967e9e267abb9410720Craig Corneliusprivate: 94354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius T stackBuffer[N]; 94454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius T* buffer; 94554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius AutoBuffer(); 94654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius size_t capacity; 94754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius}; 94854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 94954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 95054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius/* 95154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * @param text A UText representing the text 95254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * @param rangeStart The start of the range of dictionary characters 95354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * @param rangeEnd The end of the range of dictionary characters 95454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * @param foundBreaks Output of C array of int32_t break positions, or 0 95554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * @return The number of breaks found 95654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius */ 95754dcd9b6a06071f647dac967e9e267abb9410720Craig Corneliusint32_t 95854dcd9b6a06071f647dac967e9e267abb9410720Craig CorneliusCjkBreakEngine::divideUpDictionaryRange( UText *text, 95954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius int32_t rangeStart, 96054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius int32_t rangeEnd, 96154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius UStack &foundBreaks ) const { 96254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius if (rangeStart >= rangeEnd) { 96354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius return 0; 96454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius } 96554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 96654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius const size_t defaultInputLength = 80; 96754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius size_t inputLength = rangeEnd - rangeStart; 96854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // TODO: Replace by UnicodeString. 96954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius AutoBuffer<UChar, defaultInputLength> charString(inputLength); 97054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 97154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // Normalize the input string and put it in normalizedText. 97254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // The map from the indices of the normalized input to the raw 97354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // input is kept in charPositions. 97454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius UErrorCode status = U_ZERO_ERROR; 97554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius utext_extract(text, rangeStart, rangeEnd, charString.elems(), inputLength, &status); 97654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius if (U_FAILURE(status)) { 97754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius return 0; 97854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius } 97954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 98054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius UnicodeString inputString(charString.elems(), inputLength); 98154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // TODO: Use Normalizer2. 98254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius UNormalizationMode norm_mode = UNORM_NFKC; 98354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius UBool isNormalized = 98454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius Normalizer::quickCheck(inputString, norm_mode, status) == UNORM_YES || 98554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius Normalizer::isNormalized(inputString, norm_mode, status); 98654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 98754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // TODO: Replace by UVector32. 98854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius AutoBuffer<int32_t, defaultInputLength> charPositions(inputLength + 1); 98954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius int numChars = 0; 99054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius UText normalizedText = UTEXT_INITIALIZER; 99154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // Needs to be declared here because normalizedText holds onto its buffer. 99254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius UnicodeString normalizedString; 99354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius if (isNormalized) { 99454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius int32_t index = 0; 99554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius charPositions[0] = 0; 99654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius while(index < inputString.length()) { 99754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius index = inputString.moveIndex32(index, 1); 99854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius charPositions[++numChars] = index; 99954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius } 100054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius utext_openUnicodeString(&normalizedText, &inputString, &status); 100154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius } 100254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius else { 100354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius Normalizer::normalize(inputString, norm_mode, 0, normalizedString, status); 100454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius if (U_FAILURE(status)) { 100554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius return 0; 100654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius } 100754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius charPositions.resize(normalizedString.length() + 1); 100854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius Normalizer normalizer(charString.elems(), inputLength, norm_mode); 100954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius int32_t index = 0; 101054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius charPositions[0] = 0; 101154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius while(index < normalizer.endIndex()){ 101254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius /* UChar32 uc = */ normalizer.next(); 101354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius charPositions[++numChars] = index = normalizer.getIndex(); 101454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius } 101554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius utext_openUnicodeString(&normalizedText, &normalizedString, &status); 101654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius } 101754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 101854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius if (U_FAILURE(status)) { 101954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius return 0; 102054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius } 102154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 102254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // From this point on, all the indices refer to the indices of 102354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // the normalized input string. 102454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 102554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // bestSnlp[i] is the snlp of the best segmentation of the first i 102654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // characters in the range to be matched. 102754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // TODO: Replace by UVector32. 102854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius AutoBuffer<uint32_t, defaultInputLength> bestSnlp(numChars + 1); 102954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius bestSnlp[0] = 0; 103054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius for(int i = 1; i <= numChars; i++) { 103154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius bestSnlp[i] = kuint32max; 103254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius } 103354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 103454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // prev[i] is the index of the last CJK character in the previous word in 103554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // the best segmentation of the first i characters. 103654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // TODO: Replace by UVector32. 103754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius AutoBuffer<int, defaultInputLength> prev(numChars + 1); 103854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius for(int i = 0; i <= numChars; i++){ 103954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius prev[i] = -1; 104054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius } 104154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 104254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius const size_t maxWordSize = 20; 104354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // TODO: Replace both with UVector32. 104454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius AutoBuffer<int32_t, maxWordSize> values(numChars); 104554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius AutoBuffer<int32_t, maxWordSize> lengths(numChars); 104654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 104754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // Dynamic programming to find the best segmentation. 104854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius bool is_prev_katakana = false; 104954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius for (int32_t i = 0; i < numChars; ++i) { 105054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius //utext_setNativeIndex(text, rangeStart + i); 105154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius utext_setNativeIndex(&normalizedText, i); 105254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius if (bestSnlp[i] == kuint32max) 105354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius continue; 105454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 105554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius int32_t count; 105654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // limit maximum word length matched to size of current substring 105754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius int32_t maxSearchLength = (i + maxWordSize < (size_t) numChars)? maxWordSize : (numChars - i); 105854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 105954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius fDictionary->matches(&normalizedText, maxSearchLength, lengths.elems(), count, maxSearchLength, values.elems()); 106054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 106154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // if there are no single character matches found in the dictionary 106254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // starting with this charcter, treat character as a 1-character word 106354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // with the highest value possible, i.e. the least likely to occur. 106454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // Exclude Korean characters from this treatment, as they should be left 106554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // together by default. 106654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius if((count == 0 || lengths[0] != 1) && 106754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius !fHangulWordSet.contains(utext_current32(&normalizedText))) { 106854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius values[count] = maxSnlp; 106954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius lengths[count++] = 1; 107054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius } 107154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 107254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius for (int j = 0; j < count; j++) { 107354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius uint32_t newSnlp = bestSnlp[i] + values[j]; 107454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius if (newSnlp < bestSnlp[lengths[j] + i]) { 107554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius bestSnlp[lengths[j] + i] = newSnlp; 107654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius prev[lengths[j] + i] = i; 107754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius } 107854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius } 107954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 108054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // In Japanese, 108154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // Katakana word in single character is pretty rare. So we apply 108254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // the following heuristic to Katakana: any continuous run of Katakana 108354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // characters is considered a candidate word with a default cost 108454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // specified in the katakanaCost table according to its length. 108554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius //utext_setNativeIndex(text, rangeStart + i); 108654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius utext_setNativeIndex(&normalizedText, i); 108754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius bool is_katakana = isKatakana(utext_current32(&normalizedText)); 108854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius if (!is_prev_katakana && is_katakana) { 108954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius int j = i + 1; 109054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius utext_next32(&normalizedText); 109154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // Find the end of the continuous run of Katakana characters 109254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius while (j < numChars && (j - i) < kMaxKatakanaGroupLength && 109354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius isKatakana(utext_current32(&normalizedText))) { 109454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius utext_next32(&normalizedText); 109554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius ++j; 109654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius } 109754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius if ((j - i) < kMaxKatakanaGroupLength) { 109854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius uint32_t newSnlp = bestSnlp[i] + getKatakanaCost(j - i); 109954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius if (newSnlp < bestSnlp[j]) { 110054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius bestSnlp[j] = newSnlp; 110154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius prev[j] = i; 110254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius } 110354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius } 110454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius } 110554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius is_prev_katakana = is_katakana; 110654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius } 110754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 110854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // Start pushing the optimal offset index into t_boundary (t for tentative). 110954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // prev[numChars] is guaranteed to be meaningful. 111054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // We'll first push in the reverse order, i.e., 111154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // t_boundary[0] = numChars, and afterwards do a swap. 111254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // TODO: Replace by UVector32. 111354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius AutoBuffer<int, maxWordSize> t_boundary(numChars + 1); 111454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 111554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius int numBreaks = 0; 111654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // No segmentation found, set boundary to end of range 111754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius if (bestSnlp[numChars] == kuint32max) { 111854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius t_boundary[numBreaks++] = numChars; 111954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius } else { 112054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius for (int i = numChars; i > 0; i = prev[i]) { 112154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius t_boundary[numBreaks++] = i; 112254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius } 112354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius U_ASSERT(prev[t_boundary[numBreaks - 1]] == 0); 112454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius } 112554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 112654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // Reverse offset index in t_boundary. 112754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // Don't add a break for the start of the dictionary range if there is one 112854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // there already. 112954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) { 113054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius t_boundary[numBreaks++] = 0; 113154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius } 113254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 113354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // Now that we're done, convert positions in t_bdry[] (indices in 113454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // the normalized input string) back to indices in the raw input string 113554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // while reversing t_bdry and pushing values to foundBreaks. 113654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius for (int i = numBreaks-1; i >= 0; i--) { 113754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius foundBreaks.push(charPositions[t_boundary[i]] + rangeStart, status); 113854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius } 113954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 114054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius utext_close(&normalizedText); 114154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius return numBreaks; 114254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius} 114354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius#endif 114454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 1145ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_NAMESPACE_END 1146ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 1147ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 114854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 1149